aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorTakashi Iwai <tiwai@suse.de>2022-04-19 17:26:01 +0200
committerTakashi Iwai <tiwai@suse.de>2022-04-19 17:26:01 +0200
commit0aea30a07ec6b50de0fc5f5b2ec34a68ead86b61 (patch)
treeee7d7d116570f39e47399c8f691a5a7565077eeb /kernel
parentALSA: usb-audio: add mapping for MSI MAG X570S Torpedo MAX. (diff)
parentfirmware: cs_dsp: Fix overrun of unterminated control name string (diff)
downloadlinux-dev-0aea30a07ec6b50de0fc5f5b2ec34a68ead86b61.tar.xz
linux-dev-0aea30a07ec6b50de0fc5f5b2ec34a68ead86b61.zip
Merge tag 'asoc-fix-v5.18-rc3' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: Fixes for v5.18 A collection of fixes that came in since the merge window, plus one new device ID for an x86 laptop. Nothing that really stands out with particularly big impact outside of the affected device.
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt3
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/async.c3
-rw-r--r--kernel/audit.c62
-rw-r--r--kernel/audit.h4
-rw-r--r--kernel/auditsc.c89
-rw-r--r--kernel/bpf/Kconfig5
-rw-r--r--kernel/bpf/arraymap.c4
-rw-r--r--kernel/bpf/bpf_inode_storage.c9
-rw-r--r--kernel/bpf/bpf_iter.c20
-rw-r--r--kernel/bpf/bpf_local_storage.c60
-rw-r--r--kernel/bpf/bpf_lsm.c23
-rw-r--r--kernel/bpf/bpf_task_storage.c10
-rw-r--r--kernel/bpf/btf.c688
-rw-r--r--kernel/bpf/cgroup.c187
-rw-r--r--kernel/bpf/core.c364
-rw-r--r--kernel/bpf/cpumap.c8
-rw-r--r--kernel/bpf/devmap.c3
-rw-r--r--kernel/bpf/hashtab.c2
-rw-r--r--kernel/bpf/helpers.c47
-rw-r--r--kernel/bpf/inode.c39
-rw-r--r--kernel/bpf/local_storage.c2
-rw-r--r--kernel/bpf/preload/Kconfig7
-rw-r--r--kernel/bpf/preload/Makefile41
-rw-r--r--kernel/bpf/preload/bpf_preload.h8
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c126
-rw-r--r--kernel/bpf/preload/bpf_preload_umd_blob.S7
-rw-r--r--kernel/bpf/preload/iterators/Makefile6
-rw-r--r--kernel/bpf/preload/iterators/bpf_preload_common.h13
-rw-r--r--kernel/bpf/preload/iterators/iterators.c94
-rw-r--r--kernel/bpf/preload/iterators/iterators.lskel.h425
-rw-r--r--kernel/bpf/preload/iterators/iterators.skel.h412
-rw-r--r--kernel/bpf/reuseport_array.c2
-rw-r--r--kernel/bpf/ringbuf.c2
-rw-r--r--kernel/bpf/stackmap.c73
-rw-r--r--kernel/bpf/syscall.c100
-rw-r--r--kernel/bpf/trampoline.c33
-rw-r--r--kernel/bpf/verifier.c468
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup/cgroup-v1.c16
-rw-r--r--kernel/cgroup/cgroup.c31
-rw-r--r--kernel/cgroup/cpuset.c93
-rw-r--r--kernel/cgroup/freezer.c2
-rw-r--r--kernel/cgroup/rstat.c18
-rw-r--r--kernel/configs/android-recommended.config2
-rw-r--r--kernel/configs/debug.config1
-rw-r--r--kernel/cpu.c15
-rw-r--r--kernel/cred.c11
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/dma/Kconfig7
-rw-r--r--kernel/dma/Makefile2
-rw-r--r--kernel/dma/contiguous.c4
-rw-r--r--kernel/dma/debug.c4
-rw-r--r--kernel/dma/direct.c28
-rw-r--r--kernel/dma/map_benchmark.c25
-rw-r--r--kernel/dma/mapping.c4
-rw-r--r--kernel/dma/swiotlb.c87
-rw-r--r--kernel/entry/common.c42
-rw-r--r--kernel/entry/kvm.c9
-rw-r--r--kernel/events/callchain.c4
-rw-r--r--kernel/events/core.c297
-rw-r--r--kernel/events/uprobes.c13
-rw-r--r--kernel/exit.c24
-rw-r--r--kernel/extable.c24
-rw-r--r--kernel/fork.c324
-rw-r--r--kernel/futex/core.c2
-rw-r--r--kernel/irq/chip.c29
-rw-r--r--kernel/irq/cpuhotplug.c4
-rw-r--r--kernel/irq/debugfs.c8
-rw-r--r--kernel/irq/irqdesc.c27
-rw-r--r--kernel/irq/irqdomain.c9
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/kallsyms.c4
-rw-r--r--kernel/kcov.c98
-rw-r--r--kernel/kprobes.c192
-rw-r--r--kernel/ksysfs.c3
-rw-r--r--kernel/kthread.c9
-rw-r--r--kernel/livepatch/core.c4
-rw-r--r--kernel/livepatch/patch.c19
-rw-r--r--kernel/livepatch/transition.c12
-rw-r--r--kernel/locking/lockdep.c47
-rw-r--r--kernel/locking/lockdep_internals.h6
-rw-r--r--kernel/locking/lockdep_proc.c51
-rw-r--r--kernel/locking/percpu-rwsem.c5
-rw-r--r--kernel/locking/rwsem.c2
-rw-r--r--kernel/module.c27
-rw-r--r--kernel/module_decompress.c2
-rw-r--r--kernel/padata.c2
-rw-r--r--kernel/panic.c37
-rw-r--r--kernel/power/hibernate.c7
-rw-r--r--kernel/power/main.c5
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/snapshot.c21
-rw-r--r--kernel/power/suspend.c2
-rw-r--r--kernel/power/suspend_test.c8
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/power/wakelock.c11
-rw-r--r--kernel/printk/printk.c85
-rw-r--r--kernel/printk/printk_ringbuffer.c52
-rw-r--r--kernel/printk/printk_ringbuffer.h2
-rw-r--r--kernel/printk/sysctl.c2
-rw-r--r--kernel/ptrace.c47
-rw-r--r--kernel/rcu/rcu_segcblist.h4
-rw-r--r--kernel/rcu/rcutorture.c41
-rw-r--r--kernel/rcu/tasks.h14
-rw-r--r--kernel/rcu/tree.c331
-rw-r--r--kernel/rcu/tree.h18
-rw-r--r--kernel/rcu/tree_exp.h5
-rw-r--r--kernel/rcu/tree_nocb.h18
-rw-r--r--kernel/rcu/tree_plugin.h37
-rw-r--r--kernel/rcu/tree_stall.h35
-rw-r--r--kernel/rcu/update.c7
-rw-r--r--kernel/resource.c41
-rw-r--r--kernel/rseq.c8
-rw-r--r--kernel/sched/Makefile28
-rw-r--r--kernel/sched/autogroup.c26
-rw-r--r--kernel/sched/autogroup.h6
-rw-r--r--kernel/sched/build_policy.c52
-rw-r--r--kernel/sched/build_utility.c109
-rw-r--r--kernel/sched/clock.c44
-rw-r--r--kernel/sched/completion.c2
-rw-r--r--kernel/sched/core.c532
-rw-r--r--kernel/sched/core_sched.c5
-rw-r--r--kernel/sched/cpuacct.c12
-rw-r--r--kernel/sched/cpudeadline.c3
-rw-r--r--kernel/sched/cpufreq.c3
-rw-r--r--kernel/sched/cpufreq_schedutil.c20
-rw-r--r--kernel/sched/cpupri.c1
-rw-r--r--kernel/sched/cputime.c1
-rw-r--r--kernel/sched/deadline.c155
-rw-r--r--kernel/sched/debug.c11
-rw-r--r--kernel/sched/fair.c232
-rw-r--r--kernel/sched/idle.c3
-rw-r--r--kernel/sched/isolation.c163
-rw-r--r--kernel/sched/loadavg.c1
-rw-r--r--kernel/sched/membarrier.c10
-rw-r--r--kernel/sched/pelt.c4
-rw-r--r--kernel/sched/pelt.h4
-rw-r--r--kernel/sched/psi.c206
-rw-r--r--kernel/sched/rt.c51
-rw-r--r--kernel/sched/sched.h354
-rw-r--r--kernel/sched/stats.c1
-rw-r--r--kernel/sched/stats.h4
-rw-r--r--kernel/sched/stop_task.c1
-rw-r--r--kernel/sched/swait.c1
-rw-r--r--kernel/sched/topology.c268
-rw-r--r--kernel/sched/wait.c1
-rw-r--r--kernel/sched/wait_bit.c2
-rw-r--r--kernel/scs.c12
-rw-r--r--kernel/seccomp.c11
-rw-r--r--kernel/signal.c67
-rw-r--r--kernel/softirq.c10
-rw-r--r--kernel/stackleak.c5
-rw-r--r--kernel/stacktrace.c3
-rw-r--r--kernel/static_call.c1
-rw-r--r--kernel/sys.c158
-rw-r--r--kernel/sysctl.c20
-rw-r--r--kernel/task_work.c4
-rw-r--r--kernel/taskstats.c5
-rw-r--r--kernel/time/Kconfig9
-rw-r--r--kernel/time/clocksource.c8
-rw-r--r--kernel/time/posix-cpu-timers.c13
-rw-r--r--kernel/time/tick-sched.c77
-rw-r--r--kernel/time/tick-sched.h4
-rw-r--r--kernel/torture.c6
-rw-r--r--kernel/trace/Kconfig49
-rw-r--r--kernel/trace/Makefile3
-rw-r--r--kernel/trace/blktrace.c27
-rw-r--r--kernel/trace/bpf_trace.c357
-rw-r--r--kernel/trace/fgraph.c21
-rw-r--r--kernel/trace/fprobe.c332
-rw-r--r--kernel/trace/ftrace.c117
-rw-r--r--kernel/trace/rethook.c317
-rw-r--r--kernel/trace/trace.c101
-rw-r--r--kernel/trace/trace.h3
-rw-r--r--kernel/trace/trace_eprobe.c16
-rw-r--r--kernel/trace/trace_events.c104
-rw-r--r--kernel/trace/trace_events_hist.c49
-rw-r--r--kernel/trace/trace_events_synth.c14
-rw-r--r--kernel/trace/trace_events_trigger.c73
-rw-r--r--kernel/trace/trace_events_user.c1628
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/trace/trace_osnoise.c88
-rw-r--r--kernel/trace/trace_probe.c10
-rw-r--r--kernel/trace/trace_probe.h1
-rw-r--r--kernel/trace/trace_sched_switch.c3
-rw-r--r--kernel/trace/trace_sched_wakeup.c1
-rw-r--r--kernel/trace/trace_selftest.c6
-rw-r--r--kernel/ucount.c5
-rw-r--r--kernel/user_namespace.c14
-rw-r--r--kernel/watch_queue.c26
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c62
193 files changed, 8653 insertions, 3471 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index ce77f0265660..c2f1fd95a821 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -96,8 +96,9 @@ config PREEMPTION
config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
select PREEMPT_BUILD
- default y
+ default y if HAVE_PREEMPT_DYNAMIC_CALL
help
This option allows to define the preemption model on the kernel
command line parameter and thus override the default preemption
diff --git a/kernel/Makefile b/kernel/Makefile
index 56f4ee97f328..471d71935e90 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,6 +108,7 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
+obj-$(CONFIG_RETHOOK) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
diff --git a/kernel/async.c b/kernel/async.c
index b8d7a663497f..b2c4ba5686ee 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* mark that this task has queued an async job, used by module init */
- current->flags |= PF_USED_ASYNC;
-
/* schedule for execution */
queue_work_node(node, system_unbound_wq, &entry->work);
diff --git a/kernel/audit.c b/kernel/audit.c
index e4bbe2c70c26..7690c29d4ee4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -541,20 +541,22 @@ static void kauditd_printk_skb(struct sk_buff *skb)
/**
* kauditd_rehold_skb - Handle a audit record send failure in the hold queue
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
-static void kauditd_rehold_skb(struct sk_buff *skb)
+static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
- /* put the record back in the queue at the same place */
- skb_queue_head(&audit_hold_queue, skb);
+ /* put the record back in the queue */
+ skb_queue_tail(&audit_hold_queue, skb);
}
/**
* kauditd_hold_skb - Queue an audit record, waiting for auditd
* @skb: audit record
+ * @error: error code
*
* Description:
* Queue the audit record, waiting for an instance of auditd. When this
@@ -564,19 +566,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
* and queue it, if we have room. If we want to hold on to the record, but we
* don't have room, record a record lost message.
*/
-static void kauditd_hold_skb(struct sk_buff *skb)
+static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
/* at this point it is uncertain if we will ever send this to auditd so
* try to send the message via printk before we go any further */
kauditd_printk_skb(skb);
/* can we just silently drop the message? */
- if (!audit_default) {
- kfree_skb(skb);
- return;
+ if (!audit_default)
+ goto drop;
+
+ /* the hold queue is only for when the daemon goes away completely,
+ * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
+ * record on the retry queue unless it's full, in which case drop it
+ */
+ if (error == -EAGAIN) {
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+ audit_log_lost("kauditd retry queue overflow");
+ goto drop;
}
- /* if we have room, queue the message */
+ /* if we have room in the hold queue, queue the message */
if (!audit_backlog_limit ||
skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_hold_queue, skb);
@@ -585,24 +599,32 @@ static void kauditd_hold_skb(struct sk_buff *skb)
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
+drop:
kfree_skb(skb);
}
/**
* kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* Not as serious as kauditd_hold_skb() as we still have a connected auditd,
* but for some reason we are having problems sending it audit records so
* queue the given record and attempt to resend.
*/
-static void kauditd_retry_skb(struct sk_buff *skb)
+static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
- /* NOTE: because records should only live in the retry queue for a
- * short period of time, before either being sent or moved to the hold
- * queue, we don't currently enforce a limit on this queue */
- skb_queue_tail(&audit_retry_queue, skb);
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+
+ /* we have to drop the record, send it via printk as a last effort */
+ kauditd_printk_skb(skb);
+ audit_log_lost("kauditd retry queue overflow");
+ kfree_skb(skb);
}
/**
@@ -640,7 +662,7 @@ static void auditd_reset(const struct auditd_connection *ac)
/* flush the retry queue to the hold queue, but don't touch the main
* queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
- kauditd_hold_skb(skb);
+ kauditd_hold_skb(skb, -ECONNREFUSED);
}
/**
@@ -714,16 +736,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
- void (*err_hook)(struct sk_buff *skb))
+ void (*err_hook)(struct sk_buff *skb, int error))
{
int rc = 0;
- struct sk_buff *skb;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *skb_tail;
unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
- while ((skb = skb_dequeue(queue))) {
+ skb_tail = skb_peek_tail(queue);
+ while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
@@ -731,7 +755,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
- (*err_hook)(skb);
+ (*err_hook)(skb, -ECONNREFUSED);
continue;
}
@@ -745,7 +769,7 @@ retry:
rc == -ECONNREFUSED || rc == -EPERM) {
sk = NULL;
if (err_hook)
- (*err_hook)(skb);
+ (*err_hook)(skb, rc);
if (rc == -EAGAIN)
rc = 0;
/* continue to drain the queue */
diff --git a/kernel/audit.h b/kernel/audit.h
index c4498090a5bd..58b66543b4d5 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -201,6 +201,10 @@ struct audit_context {
struct {
char *name;
} module;
+ struct {
+ struct audit_ntp_data ntp_data;
+ struct timespec64 tk_injoffset;
+ } time;
};
int fds[2];
struct audit_proctitle proctitle;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index fce5d43a933f..ea2ee1181921 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -185,7 +185,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
case AUDITSC_OPENAT2:
- return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags);
+ return mask & ACC_MODE((u32)ctx->openat2.flags);
default:
return 0;
}
@@ -1340,6 +1340,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
from_kuid(&init_user_ns, name->fcap.rootid));
}
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+ const struct audit_ntp_data *ntp = &context->time.ntp_data;
+ const struct timespec64 *tk = &context->time.tk_injoffset;
+ static const char * const ntp_name[] = {
+ "offset",
+ "freq",
+ "status",
+ "tai",
+ "tick",
+ "adjust",
+ };
+ int type;
+
+ if (context->type == AUDIT_TIME_ADJNTPVAL) {
+ for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+ if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+ if (!*ab) {
+ *ab = audit_log_start(context,
+ GFP_KERNEL,
+ AUDIT_TIME_ADJNTPVAL);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "op=%s old=%lli new=%lli",
+ ntp_name[type],
+ ntp->vals[type].oldval,
+ ntp->vals[type].newval);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+ }
+ }
+ if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+ if (!*ab) {
+ *ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_TIME_INJOFFSET);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "sec=%lli nsec=%li",
+ (long long)tk->tv_sec, tk->tv_nsec);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1454,6 +1501,11 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "(null)");
break;
+ case AUDIT_TIME_ADJNTPVAL:
+ case AUDIT_TIME_INJOFFSET:
+ /* this call deviates from the rest, eating the buffer */
+ audit_log_time(context, &ab);
+ break;
}
audit_log_end(ab);
}
@@ -2849,31 +2901,26 @@ void __audit_fanotify(unsigned int response)
void __audit_tk_injoffset(struct timespec64 offset)
{
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
- "sec=%lli nsec=%li",
- (long long)offset.tv_sec, offset.tv_nsec);
-}
-
-static void audit_log_ntp_val(const struct audit_ntp_data *ad,
- const char *op, enum audit_ntp_type type)
-{
- const struct audit_ntp_val *val = &ad->vals[type];
-
- if (val->newval == val->oldval)
- return;
+ struct audit_context *context = audit_context();
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
- "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+ /* only set type if not already set by NTP */
+ if (!context->type)
+ context->type = AUDIT_TIME_INJOFFSET;
+ memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
}
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
- audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
- audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
- audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
- audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
- audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
- audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+ struct audit_context *context = audit_context();
+ int type;
+
+ for (type = 0; type < AUDIT_NTP_NVALS; type++)
+ if (ad->vals[type].newval != ad->vals[type].oldval) {
+ /* unconditionally set type, overwriting TK */
+ context->type = AUDIT_TIME_ADJNTPVAL;
+ memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+ break;
+ }
}
void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index d24d518ddd63..d56ee177d5f8 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -30,6 +30,7 @@ config BPF_SYSCALL
select TASKS_TRACE_RCU
select BINARY_PRINTF
select NET_SOCK_MSG if NET
+ select PAGE_POOL if NET
default n
help
Enable the bpf() system call that allows to manipulate BPF programs
@@ -58,6 +59,10 @@ config BPF_JIT_ALWAYS_ON
Enables BPF JIT and removes BPF interpreter to avoid speculative
execution of BPF instructions by the interpreter.
+ When CONFIG_BPF_JIT_ALWAYS_ON is enabled, /proc/sys/net/core/bpf_jit_enable
+ is permanently set to 1 and setting any other value than that will
+ return failure.
+
config BPF_JIT_DEFAULT_ON
def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
depends on HAVE_EBPF_JIT && BPF_JIT
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c7a5be3bf8be..7f145aefbff8 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -837,13 +837,12 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog = bpf_prog_get(fd);
if (IS_ERR(prog))
return prog;
- if (!bpf_prog_array_compatible(array, prog)) {
+ if (!bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
@@ -1071,7 +1070,6 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
- spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e29d9e3d853e..96be8d518885 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -136,7 +136,7 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(f->f_inode,
(struct bpf_local_storage_map *)map,
- value, map_flags);
+ value, map_flags, GFP_ATOMIC);
fput(f);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -169,8 +169,9 @@ static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
-BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
@@ -196,7 +197,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index b7aef5b3416d..110029ede71e 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -5,6 +5,7 @@
#include <linux/anon_inodes.h>
#include <linux/filter.h>
#include <linux/bpf.h>
+#include <linux/rcupdate_trace.h>
struct bpf_iter_target_info {
struct list_head list;
@@ -684,11 +685,20 @@ int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
{
int ret;
- rcu_read_lock();
- migrate_disable();
- ret = bpf_prog_run(prog, ctx);
- migrate_enable();
- rcu_read_unlock();
+ if (prog->aux->sleepable) {
+ rcu_read_lock_trace();
+ migrate_disable();
+ might_fault();
+ ret = bpf_prog_run(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock_trace();
+ } else {
+ rcu_read_lock();
+ migrate_disable();
+ ret = bpf_prog_run(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock();
+ }
/* bpf program can only return 0 or 1:
* 0 : okay
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 71de2a89869c..01aa2b51ec4d 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -63,7 +63,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool charge_mem)
+ void *value, bool charge_mem, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
@@ -71,7 +71,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- GFP_ATOMIC | __GFP_NOWARN);
+ gfp_flags | __GFP_NOWARN);
if (selem) {
if (value)
memcpy(SDATA(selem)->data, value, smap->map.value_size);
@@ -136,7 +136,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
* will be done by the caller.
*
* Although the unlock will be done under
- * rcu_read_lock(), it is more intutivie to
+ * rcu_read_lock(), it is more intuitive to
* read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
@@ -282,7 +282,8 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem)
+ struct bpf_local_storage_elem *first_selem,
+ gfp_t gfp_flags)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -293,7 +294,7 @@ int bpf_local_storage_alloc(void *owner,
return err;
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- GFP_ATOMIC | __GFP_NOWARN);
+ gfp_flags | __GFP_NOWARN);
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -350,10 +351,10 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags)
+ void *value, u64 map_flags, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
- struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_elem *selem = NULL;
struct bpf_local_storage *local_storage;
unsigned long flags;
int err;
@@ -365,6 +366,9 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!map_value_has_spin_lock(&smap->map)))
return ERR_PTR(-EINVAL);
+ if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
+ return ERR_PTR(-EINVAL);
+
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
@@ -373,11 +377,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, true);
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem);
+ err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
if (err) {
kfree(selem);
mem_uncharge(smap, owner, smap->elem_size);
@@ -404,6 +408,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
}
}
+ if (gfp_flags == GFP_KERNEL) {
+ selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
+ if (!selem)
+ return ERR_PTR(-ENOMEM);
+ }
+
raw_spin_lock_irqsave(&local_storage->lock, flags);
/* Recheck local_storage->list under local_storage->lock */
@@ -429,19 +439,21 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
goto unlock;
}
- /* local_storage->lock is held. Hence, we are sure
- * we can unlink and uncharge the old_sdata successfully
- * later. Hence, instead of charging the new selem now
- * and then uncharge the old selem later (which may cause
- * a potential but unnecessary charge failure), avoid taking
- * a charge at all here (the "!old_sdata" check) and the
- * old_sdata will not be uncharged later during
- * bpf_selem_unlink_storage_nolock().
- */
- selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
- if (!selem) {
- err = -ENOMEM;
- goto unlock_err;
+ if (gfp_flags != GFP_KERNEL) {
+ /* local_storage->lock is held. Hence, we are sure
+ * we can unlink and uncharge the old_sdata successfully
+ * later. Hence, instead of charging the new selem now
+ * and then uncharge the old selem later (which may cause
+ * a potential but unnecessary charge failure), avoid taking
+ * a charge at all here (the "!old_sdata" check) and the
+ * old_sdata will not be uncharged later during
+ * bpf_selem_unlink_storage_nolock().
+ */
+ selem = bpf_selem_alloc(smap, owner, value, !old_sdata, gfp_flags);
+ if (!selem) {
+ err = -ENOMEM;
+ goto unlock_err;
+ }
}
/* First, link the new selem to the map */
@@ -463,6 +475,10 @@ unlock:
unlock_err:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ if (selem) {
+ mem_uncharge(smap, owner, smap->elem_size);
+ kfree(selem);
+ }
return ERR_PTR(err);
}
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 06062370c3b8..064eccba641d 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -99,6 +99,24 @@ static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
.allowed = bpf_ima_inode_hash_allowed,
};
+BPF_CALL_3(bpf_ima_file_hash, struct file *, file, void *, dst, u32, size)
+{
+ return ima_file_hash(file, dst, size);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_file_hash_btf_ids, struct, file)
+
+static const struct bpf_func_proto bpf_ima_file_hash_proto = {
+ .func = bpf_ima_file_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_file_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
static const struct bpf_func_proto *
bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -121,6 +139,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_bprm_opts_set_proto;
case BPF_FUNC_ima_inode_hash:
return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ case BPF_FUNC_ima_file_hash:
+ return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
default:
return tracing_prog_func_proto(func_id, prog);
}
@@ -167,6 +187,7 @@ BTF_ID(func, bpf_lsm_inode_setxattr)
BTF_ID(func, bpf_lsm_inode_symlink)
BTF_ID(func, bpf_lsm_inode_unlink)
BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernel_read_file)
BTF_ID(func, bpf_lsm_kernfs_init_security)
#ifdef CONFIG_KEYS
@@ -207,7 +228,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
-BTF_ID(func, bpf_lsm_task_getsecid_subj)
+BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 5da7bed0f5f6..6638a0ecc3d2 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -174,7 +174,8 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
- task, (struct bpf_local_storage_map *)map, value, map_flags);
+ task, (struct bpf_local_storage_map *)map, value, map_flags,
+ GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -226,8 +227,9 @@ out:
return err;
}
-BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags)
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags, gfp_t, gfp_flags)
{
struct bpf_local_storage_data *sdata;
@@ -250,7 +252,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST);
+ BPF_NOEXIST, gfp_flags);
unlock:
bpf_task_storage_unlock();
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e16dafeb2450..0918a39279f6 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018 Facebook */
#include <uapi/linux/btf.h>
@@ -198,6 +198,21 @@
DEFINE_IDR(btf_idr);
DEFINE_SPINLOCK(btf_idr_lock);
+enum btf_kfunc_hook {
+ BTF_KFUNC_HOOK_XDP,
+ BTF_KFUNC_HOOK_TC,
+ BTF_KFUNC_HOOK_STRUCT_OPS,
+ BTF_KFUNC_HOOK_MAX,
+};
+
+enum {
+ BTF_KFUNC_SET_MAX_CNT = 32,
+};
+
+struct btf_kfunc_set_tab {
+ struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
+};
+
struct btf {
void *data;
struct btf_type **types;
@@ -212,6 +227,7 @@ struct btf {
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
+ struct btf_kfunc_set_tab *kfunc_set_tab;
/* split BTF support */
struct btf *base_btf;
@@ -403,6 +419,9 @@ static struct btf_type btf_void;
static int btf_resolve(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id);
+static int btf_func_check(struct btf_verifier_env *env,
+ const struct btf_type *t);
+
static bool btf_type_is_modifier(const struct btf_type *t)
{
/* Some of them is not strictly a C modifier
@@ -506,6 +525,50 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
return -ENOENT;
}
+static s32 bpf_find_btf_id(const char *name, u32 kind, struct btf **btf_p)
+{
+ struct btf *btf;
+ s32 ret;
+ int id;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+ if (!btf)
+ return -EINVAL;
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * positive btf_id or negative error.
+ */
+ if (ret > 0) {
+ btf_get(btf);
+ *btf_p = btf;
+ return ret;
+ }
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, btf, id) {
+ if (!btf_is_module(btf))
+ continue;
+ /* linear search could be slow hence unlock/lock
+ * the IDR to avoiding holding it for too long
+ */
+ btf_get(btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(btf, name, kind);
+ if (ret > 0) {
+ *btf_p = btf;
+ return ret;
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ return ret;
+}
+
const struct btf_type *btf_type_skip_modifiers(const struct btf *btf,
u32 id, u32 *res_id)
{
@@ -579,6 +642,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
+ btf_type_is_func(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -1531,8 +1595,30 @@ static void btf_free_id(struct btf *btf)
spin_unlock_irqrestore(&btf_idr_lock, flags);
}
+static void btf_free_kfunc_set_tab(struct btf *btf)
+{
+ struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
+ int hook, type;
+
+ if (!tab)
+ return;
+ /* For module BTF, we directly assign the sets being registered, so
+ * there is nothing to free except kfunc_set_tab.
+ */
+ if (btf_is_module(btf))
+ goto free_tab;
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) {
+ for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++)
+ kfree(tab->sets[hook][type]);
+ }
+free_tab:
+ kfree(tab);
+ btf->kfunc_set_tab = NULL;
+}
+
static void btf_free(struct btf *btf)
{
+ btf_free_kfunc_set_tab(btf);
kvfree(btf->types);
kvfree(btf->resolved_sizes);
kvfree(btf->resolved_ids);
@@ -2505,7 +2591,7 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
*
* We now need to continue from the last-resolved-ptr to
* ensure the last-resolved-ptr will not referring back to
- * the currenct ptr (t).
+ * the current ptr (t).
*/
if (btf_type_is_modifier(next_type)) {
const struct btf_type *resolved_type;
@@ -3533,9 +3619,24 @@ static s32 btf_func_check_meta(struct btf_verifier_env *env,
return 0;
}
+static int btf_func_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ int err;
+
+ err = btf_func_check(env, t);
+ if (err)
+ return err;
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+ return 0;
+}
+
static struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
- .resolve = btf_df_resolve,
+ .resolve = btf_func_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
@@ -4156,7 +4257,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return !btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
- if (btf_type_is_decl_tag(t))
+ if (btf_type_is_decl_tag(t) || btf_type_is_func(t))
return btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
@@ -4246,12 +4347,6 @@ static int btf_check_all_types(struct btf_verifier_env *env)
if (err)
return err;
}
-
- if (btf_type_is_func(t)) {
- err = btf_func_check(env, t);
- if (err)
- return err;
- }
}
return 0;
@@ -4387,8 +4482,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
btf = env->btf;
btf_data_size = btf->data_size;
- if (btf_data_size <
- offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) {
+ if (btf_data_size < offsetofend(struct btf_header, hdr_len)) {
btf_verifier_log(env, "hdr_len not found");
return -EINVAL;
}
@@ -4848,6 +4942,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
+ const char *tag_value;
u32 nr_args, arg;
int i, ret;
@@ -5000,6 +5095,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
+
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ if (strcmp(tag_value, "user") == 0)
+ info->reg_type |= MEM_USER;
+ if (strcmp(tag_value, "percpu") == 0)
+ info->reg_type |= MEM_PERCPU;
+ }
+
/* skip modifiers */
while (btf_type_is_modifier(t)) {
info->btf_id = t->type;
@@ -5026,12 +5130,12 @@ enum bpf_struct_walk_result {
static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
- u32 *next_btf_id)
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
- const char *tname, *mname;
+ const char *tname, *mname, *tag_value;
u32 vlen, elem_id, mid;
again:
@@ -5215,7 +5319,8 @@ error:
}
if (btf_type_is_ptr(mtype)) {
- const struct btf_type *stype;
+ const struct btf_type *stype, *t;
+ enum bpf_type_flag tmp_flag = 0;
u32 id;
if (msize != size || off != moff) {
@@ -5224,9 +5329,23 @@ error:
mname, moff, tname, off, size);
return -EACCES;
}
+
+ /* check type tag */
+ t = btf_type_by_id(btf, mtype->type);
+ if (btf_type_is_type_tag(t)) {
+ tag_value = __btf_name_by_offset(btf, t->name_off);
+ /* check __user tag */
+ if (strcmp(tag_value, "user") == 0)
+ tmp_flag = MEM_USER;
+ /* check __percpu tag */
+ if (strcmp(tag_value, "percpu") == 0)
+ tmp_flag = MEM_PERCPU;
+ }
+
stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
+ *flag = tmp_flag;
return WALK_PTR;
}
}
@@ -5253,13 +5372,14 @@ error:
int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, int off, int size,
enum bpf_access_type atype __maybe_unused,
- u32 *next_btf_id)
+ u32 *next_btf_id, enum bpf_type_flag *flag)
{
+ enum bpf_type_flag tmp_flag = 0;
int err;
u32 id;
do {
- err = btf_struct_walk(log, btf, t, off, size, &id);
+ err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag);
switch (err) {
case WALK_PTR:
@@ -5267,6 +5387,7 @@ int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
* we're done.
*/
*next_btf_id = id;
+ *flag = tmp_flag;
return PTR_TO_BTF_ID;
case WALK_SCALAR:
return SCALAR_VALUE;
@@ -5311,6 +5432,7 @@ bool btf_struct_ids_match(struct bpf_verifier_log *log,
const struct btf *need_btf, u32 need_type_id)
{
const struct btf_type *type;
+ enum bpf_type_flag flag;
int err;
/* Are we already done? */
@@ -5321,7 +5443,7 @@ again:
type = btf_type_by_id(btf, id);
if (!type)
return false;
- err = btf_struct_walk(log, btf, type, off, 1, &id);
+ err = btf_struct_walk(log, btf, type, off, 1, &id, &flag);
if (err != WALK_STRUCT)
return false;
@@ -5385,7 +5507,7 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
}
args = (const struct btf_param *)(func + 1);
nargs = btf_type_vlen(func);
- if (nargs >= MAX_BPF_FUNC_ARGS) {
+ if (nargs > MAX_BPF_FUNC_ARGS) {
bpf_log(log,
"The function %s has %d arguments. Too many.\n",
tname, nargs);
@@ -5616,17 +5738,45 @@ static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
return true;
}
+static bool is_kfunc_arg_mem_size(const struct btf *btf,
+ const struct btf_param *arg,
+ const struct bpf_reg_state *reg)
+{
+ int len, sfx_len = sizeof("__sz") - 1;
+ const struct btf_type *t;
+ const char *param_name;
+
+ t = btf_type_skip_modifiers(btf, arg->type, NULL);
+ if (!btf_type_is_scalar(t) || reg->type != SCALAR_VALUE)
+ return false;
+
+ /* In the future, this can be ported to use BTF tagging */
+ param_name = btf_name_by_offset(btf, arg->name_off);
+ if (str_is_empty(param_name))
+ return false;
+ len = strlen(param_name);
+ if (len < sfx_len)
+ return false;
+ param_name += len - sfx_len;
+ if (strncmp(param_name, "__sz", sfx_len))
+ return false;
+
+ return true;
+}
+
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
+ u32 i, nargs, ref_id, ref_obj_id = 0;
bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
- u32 i, nargs, ref_id;
+ int ref_regno = 0, ret;
+ bool rel = false;
t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
@@ -5652,6 +5802,10 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* Only kfunc can be release func */
+ if (is_kfunc)
+ rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_RELEASE, func_id);
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
@@ -5675,6 +5829,11 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+
+ ret = check_func_arg_reg_off(env, reg, regno, ARG_DONTCARE, rel);
+ if (ret < 0)
+ return ret;
+
if (btf_get_prog_ctx_type(log, btf, t,
env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
@@ -5686,9 +5845,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
i, btf_type_str(t));
return -EINVAL;
}
- if (check_ptr_off_reg(env, reg, regno))
- return -EINVAL;
- } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) {
+ } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
+ (reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
const struct btf_type *reg_ref_t;
const struct btf *reg_btf;
const char *reg_ref_tname;
@@ -5704,9 +5862,23 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
+ /* Ensure only one argument is referenced
+ * PTR_TO_BTF_ID, check_func_arg_reg_off relies
+ * on only one referenced register being allowed
+ * for kfuncs.
+ */
+ if (reg->ref_obj_id) {
+ if (ref_obj_id) {
+ bpf_log(log, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+ regno, reg->ref_obj_id, ref_obj_id);
+ return -EFAULT;
+ }
+ ref_regno = regno;
+ ref_obj_id = reg->ref_obj_id;
+ }
} else {
reg_btf = btf_vmlinux;
- reg_ref_id = *reg2btf_ids[reg->type];
+ reg_ref_id = *reg2btf_ids[base_type(reg->type)];
}
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
@@ -5727,17 +5899,33 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
u32 type_size;
if (is_kfunc) {
+ bool arg_mem_size = i + 1 < nargs && is_kfunc_arg_mem_size(btf, &args[i + 1], &regs[regno + 1]);
+
/* Permit pointer to mem, but only when argument
* type is pointer to scalar, or struct composed
* (recursively) of scalars.
+ * When arg_mem_size is true, the pointer can be
+ * void *.
*/
if (!btf_type_is_scalar(ref_t) &&
- !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
+ !__btf_type_is_scalar_struct(log, btf, ref_t, 0) &&
+ (arg_mem_size ? !btf_type_is_void(ref_t) : 1)) {
bpf_log(log,
- "arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
- i, btf_type_str(ref_t), ref_tname);
+ "arg#%d pointer type %s %s must point to %sscalar, or struct with scalar\n",
+ i, btf_type_str(ref_t), ref_tname, arg_mem_size ? "void, " : "");
return -EINVAL;
}
+
+ /* Check for mem, len pair */
+ if (arg_mem_size) {
+ if (check_kfunc_mem_size_reg(env, &regs[regno + 1], regno + 1)) {
+ bpf_log(log, "arg#%d arg#%d memory, len pair leads to invalid memory access\n",
+ i, i + 1);
+ return -EINVAL;
+ }
+ i++;
+ continue;
+ }
}
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
@@ -5758,7 +5946,20 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
}
}
- return 0;
+ /* Either both are set, or neither */
+ WARN_ON_ONCE((ref_obj_id && !ref_regno) || (!ref_obj_id && ref_regno));
+ /* We already made sure ref_obj_id is set only for one argument. We do
+ * allow (!rel && ref_obj_id), so that passing such referenced
+ * PTR_TO_BTF_ID to other kfuncs works. Note that rel is only true when
+ * is_kfunc is true.
+ */
+ if (rel && !ref_obj_id) {
+ bpf_log(log, "release kernel function %s expects refcounted PTR_TO_BTF_ID\n",
+ func_name);
+ return -EINVAL;
+ }
+ /* returns argument register number > 0 in case of reference release kfunc */
+ return rel ? ref_regno : 0;
}
/* Compare BTF of a function with given bpf_reg_state.
@@ -6004,7 +6205,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
- /* If we encontered an error, return it. */
+ /* If we encountered an error, return it. */
if (ssnprintf.show.state.status)
return ssnprintf.show.state.status;
@@ -6200,12 +6401,17 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
}
+enum {
+ BTF_MODULE_F_LIVE = (1 << 0),
+};
+
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
struct btf_module {
struct list_head list;
struct module *module;
struct btf *btf;
struct bin_attribute *sysfs_attr;
+ int flags;
};
static LIST_HEAD(btf_modules);
@@ -6233,7 +6439,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
int err = 0;
if (mod->btf_data_size == 0 ||
- (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_LIVE &&
+ op != MODULE_STATE_GOING))
goto out;
switch (op) {
@@ -6248,7 +6455,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
pr_warn("failed to validate module [%s] BTF: %ld\n",
mod->name, PTR_ERR(btf));
kfree(btf_mod);
- err = PTR_ERR(btf);
+ if (!IS_ENABLED(CONFIG_MODULE_ALLOW_BTF_MISMATCH))
+ err = PTR_ERR(btf);
goto out;
}
err = btf_alloc_id(btf);
@@ -6292,6 +6500,17 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
}
break;
+ case MODULE_STATE_LIVE:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_mod->flags |= BTF_MODULE_F_LIVE;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
case MODULE_STATE_GOING:
mutex_lock(&btf_module_mutex);
list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
@@ -6338,7 +6557,12 @@ struct module *btf_try_get_module(const struct btf *btf)
if (btf_mod->btf != btf)
continue;
- if (try_module_get(btf_mod->module))
+ /* We must only consider module whose __init routine has
+ * finished, hence we must check for BTF_MODULE_F_LIVE flag,
+ * which is set from the notifier callback for
+ * MODULE_STATE_LIVE.
+ */
+ if ((btf_mod->flags & BTF_MODULE_F_LIVE) && try_module_get(btf_mod->module))
res = btf_mod->module;
break;
@@ -6349,9 +6573,43 @@ struct module *btf_try_get_module(const struct btf *btf)
return res;
}
+/* Returns struct btf corresponding to the struct module.
+ * This function can return NULL or ERR_PTR.
+ */
+static struct btf *btf_get_module_btf(const struct module *module)
+{
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+#endif
+ struct btf *btf = NULL;
+
+ if (!module) {
+ btf = bpf_get_btf_vmlinux();
+ if (!IS_ERR_OR_NULL(btf))
+ btf_get(btf);
+ return btf;
+ }
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ btf_get(btf_mod->btf);
+ btf = btf_mod->btf;
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return btf;
+}
+
BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
{
- struct btf *btf;
+ struct btf *btf = NULL;
+ int btf_obj_fd = 0;
long ret;
if (flags)
@@ -6360,44 +6618,17 @@ BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int
if (name_sz <= 1 || name[name_sz - 1])
return -EINVAL;
- btf = bpf_get_btf_vmlinux();
- if (IS_ERR(btf))
- return PTR_ERR(btf);
-
- ret = btf_find_by_name_kind(btf, name, kind);
- /* ret is never zero, since btf_find_by_name_kind returns
- * positive btf_id or negative error.
- */
- if (ret < 0) {
- struct btf *mod_btf;
- int id;
-
- /* If name is not found in vmlinux's BTF then search in module's BTFs */
- spin_lock_bh(&btf_idr_lock);
- idr_for_each_entry(&btf_idr, mod_btf, id) {
- if (!btf_is_module(mod_btf))
- continue;
- /* linear search could be slow hence unlock/lock
- * the IDR to avoiding holding it for too long
- */
- btf_get(mod_btf);
- spin_unlock_bh(&btf_idr_lock);
- ret = btf_find_by_name_kind(mod_btf, name, kind);
- if (ret > 0) {
- int btf_obj_fd;
-
- btf_obj_fd = __btf_new_fd(mod_btf);
- if (btf_obj_fd < 0) {
- btf_put(mod_btf);
- return btf_obj_fd;
- }
- return ret | (((u64)btf_obj_fd) << 32);
- }
- spin_lock_bh(&btf_idr_lock);
- btf_put(mod_btf);
+ ret = bpf_find_btf_id(name, kind, &btf);
+ if (ret > 0 && btf_is_module(btf)) {
+ btf_obj_fd = __btf_new_fd(btf);
+ if (btf_obj_fd < 0) {
+ btf_put(btf);
+ return btf_obj_fd;
}
- spin_unlock_bh(&btf_idr_lock);
+ return ret | (((u64)btf_obj_fd) << 32);
}
+ if (ret > 0)
+ btf_put(btf);
return ret;
}
@@ -6416,58 +6647,298 @@ BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
-/* BTF ID set registration API for modules */
+/* Kernel Function (kfunc) BTF ID set registration API */
-#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ enum btf_kfunc_type type,
+ struct btf_id_set *add_set, bool vmlinux_set)
+{
+ struct btf_kfunc_set_tab *tab;
+ struct btf_id_set *set;
+ u32 set_cnt;
+ int ret;
-void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
- struct kfunc_btf_id_set *s)
+ if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ if (!add_set->cnt)
+ return 0;
+
+ tab = btf->kfunc_set_tab;
+ if (!tab) {
+ tab = kzalloc(sizeof(*tab), GFP_KERNEL | __GFP_NOWARN);
+ if (!tab)
+ return -ENOMEM;
+ btf->kfunc_set_tab = tab;
+ }
+
+ set = tab->sets[hook][type];
+ /* Warn when register_btf_kfunc_id_set is called twice for the same hook
+ * for module sets.
+ */
+ if (WARN_ON_ONCE(set && !vmlinux_set)) {
+ ret = -EINVAL;
+ goto end;
+ }
+
+ /* We don't need to allocate, concatenate, and sort module sets, because
+ * only one is allowed per hook. Hence, we can directly assign the
+ * pointer and return.
+ */
+ if (!vmlinux_set) {
+ tab->sets[hook][type] = add_set;
+ return 0;
+ }
+
+ /* In case of vmlinux sets, there may be more than one set being
+ * registered per hook. To create a unified set, we allocate a new set
+ * and concatenate all individual sets being registered. While each set
+ * is individually sorted, they may become unsorted when concatenated,
+ * hence re-sorting the final set again is required to make binary
+ * searching the set using btf_id_set_contains function work.
+ */
+ set_cnt = set ? set->cnt : 0;
+
+ if (set_cnt > U32_MAX - add_set->cnt) {
+ ret = -EOVERFLOW;
+ goto end;
+ }
+
+ if (set_cnt + add_set->cnt > BTF_KFUNC_SET_MAX_CNT) {
+ ret = -E2BIG;
+ goto end;
+ }
+
+ /* Grow set */
+ set = krealloc(tab->sets[hook][type],
+ offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!set) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ /* For newly allocated set, initialize set->cnt to 0 */
+ if (!tab->sets[hook][type])
+ set->cnt = 0;
+ tab->sets[hook][type] = set;
+
+ /* Concatenate the two sets */
+ memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0]));
+ set->cnt += add_set->cnt;
+
+ sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL);
+
+ return 0;
+end:
+ btf_free_kfunc_set_tab(btf);
+ return ret;
+}
+
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ const struct btf_kfunc_id_set *kset)
{
- mutex_lock(&l->mutex);
- list_add(&s->list, &l->list);
- mutex_unlock(&l->mutex);
+ bool vmlinux_set = !btf_is_module(btf);
+ int type, ret = 0;
+
+ for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
+ if (!kset->sets[type])
+ continue;
+
+ ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set);
+ if (ret)
+ break;
+ }
+ return ret;
}
-EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
-void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
- struct kfunc_btf_id_set *s)
+static bool __btf_kfunc_id_set_contains(const struct btf *btf,
+ enum btf_kfunc_hook hook,
+ enum btf_kfunc_type type,
+ u32 kfunc_btf_id)
{
- mutex_lock(&l->mutex);
- list_del_init(&s->list);
- mutex_unlock(&l->mutex);
+ struct btf_id_set *set;
+
+ if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX)
+ return false;
+ if (!btf->kfunc_set_tab)
+ return false;
+ set = btf->kfunc_set_tab->sets[hook][type];
+ if (!set)
+ return false;
+ return btf_id_set_contains(set, kfunc_btf_id);
}
-EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
-bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
- struct module *owner)
+static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
{
- struct kfunc_btf_id_set *s;
+ switch (prog_type) {
+ case BPF_PROG_TYPE_XDP:
+ return BTF_KFUNC_HOOK_XDP;
+ case BPF_PROG_TYPE_SCHED_CLS:
+ return BTF_KFUNC_HOOK_TC;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ return BTF_KFUNC_HOOK_STRUCT_OPS;
+ default:
+ return BTF_KFUNC_HOOK_MAX;
+ }
+}
- mutex_lock(&klist->mutex);
- list_for_each_entry(s, &klist->list, list) {
- if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
- mutex_unlock(&klist->mutex);
- return true;
+/* Caution:
+ * Reference to the module (obtained using btf_try_get_module) corresponding to
+ * the struct btf *MUST* be held when calling this function from verifier
+ * context. This is usually true as we stash references in prog's kfunc_btf_tab;
+ * keeping the reference for the duration of the call provides the necessary
+ * protection for looking up a well-formed btf->kfunc_set_tab.
+ */
+bool btf_kfunc_id_set_contains(const struct btf *btf,
+ enum bpf_prog_type prog_type,
+ enum btf_kfunc_type type, u32 kfunc_btf_id)
+{
+ enum btf_kfunc_hook hook;
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id);
+}
+
+/* This function must be invoked only from initcalls/module init functions */
+int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
+ const struct btf_kfunc_id_set *kset)
+{
+ enum btf_kfunc_hook hook;
+ struct btf *btf;
+ int ret;
+
+ btf = btf_get_module_btf(kset->owner);
+ if (!btf) {
+ if (!kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) {
+ pr_err("missing vmlinux BTF, cannot register kfuncs\n");
+ return -ENOENT;
+ }
+ if (kset->owner && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) {
+ pr_err("missing module BTF, cannot register kfuncs\n");
+ return -ENOENT;
}
+ return 0;
}
- mutex_unlock(&klist->mutex);
- return false;
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ hook = bpf_prog_type_to_kfunc_hook(prog_type);
+ ret = btf_populate_kfunc_set(btf, hook, kset);
+ btf_put(btf);
+ return ret;
}
+EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set);
+
+#define MAX_TYPES_ARE_COMPAT_DEPTH 2
-#define DEFINE_KFUNC_BTF_ID_LIST(name) \
- struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
- __MUTEX_INITIALIZER(name.mutex) }; \
- EXPORT_SYMBOL_GPL(name)
+static
+int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id,
+ int level)
+{
+ const struct btf_type *local_type, *targ_type;
+ int depth = 32; /* max recursion depth */
-DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
-DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+ /* caller made sure that names match (ignoring flavor suffix) */
+ local_type = btf_type_by_id(local_btf, local_id);
+ targ_type = btf_type_by_id(targ_btf, targ_id);
+ if (btf_kind(local_type) != btf_kind(targ_type))
+ return 0;
-#endif
+recur:
+ depth--;
+ if (depth < 0)
+ return -EINVAL;
+
+ local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id);
+ targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id);
+ if (!local_type || !targ_type)
+ return -EINVAL;
+
+ if (btf_kind(local_type) != btf_kind(targ_type))
+ return 0;
+
+ switch (btf_kind(local_type)) {
+ case BTF_KIND_UNKN:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ case BTF_KIND_FWD:
+ return 1;
+ case BTF_KIND_INT:
+ /* just reject deprecated bitfield-like integers; all other
+ * integers are by default compatible between each other
+ */
+ return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0;
+ case BTF_KIND_PTR:
+ local_id = local_type->type;
+ targ_id = targ_type->type;
+ goto recur;
+ case BTF_KIND_ARRAY:
+ local_id = btf_array(local_type)->type;
+ targ_id = btf_array(targ_type)->type;
+ goto recur;
+ case BTF_KIND_FUNC_PROTO: {
+ struct btf_param *local_p = btf_params(local_type);
+ struct btf_param *targ_p = btf_params(targ_type);
+ __u16 local_vlen = btf_vlen(local_type);
+ __u16 targ_vlen = btf_vlen(targ_type);
+ int i, err;
+
+ if (local_vlen != targ_vlen)
+ return 0;
+
+ for (i = 0; i < local_vlen; i++, local_p++, targ_p++) {
+ if (level <= 0)
+ return -EINVAL;
+ btf_type_skip_modifiers(local_btf, local_p->type, &local_id);
+ btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id);
+ err = __bpf_core_types_are_compat(local_btf, local_id,
+ targ_btf, targ_id,
+ level - 1);
+ if (err <= 0)
+ return err;
+ }
+
+ /* tail recurse for return type check */
+ btf_type_skip_modifiers(local_btf, local_type->type, &local_id);
+ btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id);
+ goto recur;
+ }
+ default:
+ return 0;
+ }
+}
+
+/* Check local and target types for compatibility. This check is used for
+ * type-based CO-RE relocations and follow slightly different rules than
+ * field-based relocations. This function assumes that root types were already
+ * checked for name match. Beyond that initial root-level name check, names
+ * are completely ignored. Compatibility rules are as follows:
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ * kind should match for local and target types (i.e., STRUCT is not
+ * compatible with UNION);
+ * - for ENUMs, the size is ignored;
+ * - for INT, size and signedness are ignored;
+ * - for ARRAY, dimensionality is ignored, element types are checked for
+ * compatibility recursively;
+ * - CONST/VOLATILE/RESTRICT modifiers are ignored;
+ * - TYPEDEFs/PTRs are compatible if types they pointing to are compatible;
+ * - FUNC_PROTOs are compatible if they have compatible signature: same
+ * number of input args and compatible return and argument types.
+ * These rules are not set in stone and probably will be adjusted as we get
+ * more experience with using BPF CO-RE relocations.
+ */
int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
const struct btf *targ_btf, __u32 targ_id)
{
- return -EOPNOTSUPP;
+ return __bpf_core_types_are_compat(local_btf, local_id,
+ targ_btf, targ_id,
+ MAX_TYPES_ARE_COMPAT_DEPTH);
}
static bool bpf_core_is_flavor_sep(const char *s)
@@ -6710,6 +7181,8 @@ bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
main_btf = bpf_get_btf_vmlinux();
if (IS_ERR(main_btf))
return ERR_CAST(main_btf);
+ if (!main_btf)
+ return ERR_PTR(-EINVAL);
local_type = btf_type_by_id(local_btf, local_type_id);
if (!local_type)
@@ -6788,6 +7261,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
{
bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
struct bpf_core_cand_list cands = {};
+ struct bpf_core_relo_res targ_res;
struct bpf_core_spec *specs;
int err;
@@ -6827,13 +7301,19 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
cands.len = cc->cnt;
/* cand_cache_mutex needs to span the cache lookup and
* copy of btf pointer into bpf_core_cand_list,
- * since module can be unloaded while bpf_core_apply_relo_insn
+ * since module can be unloaded while bpf_core_calc_relo_insn
* is working with module's btf.
*/
}
- err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
- relo, relo_idx, ctx->btf, &cands, specs);
+ err = bpf_core_calc_relo_insn((void *)ctx->log, relo, relo_idx, ctx->btf, &cands, specs,
+ &targ_res);
+ if (err)
+ goto out;
+
+ err = bpf_core_patch_insn((void *)ctx->log, insn, relo->insn_off / 8, relo, relo_idx,
+ &targ_res);
+
out:
kfree(specs);
if (need_cands) {
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 514b4681a90a..128028efda64 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -1031,7 +1031,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
* @sk: The socket sending or receiving traffic
* @skb: The skb that is being sent or received
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* If no socket is passed, or the socket is not of type INET or INET6,
* this function does nothing and returns 0.
@@ -1044,7 +1044,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
* NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr
* NET_XMIT_CN (2) - continue with packet output and notify TCP
* to call cwr
- * -EPERM - drop packet
+ * -err - drop packet
*
* For ingress packets, this function will return -EPERM if any
* attached program was found and if it returned != 1 during execution.
@@ -1079,8 +1079,9 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
} else {
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
- __bpf_prog_run_save_cb);
- ret = (ret == 1 ? 0 : -EPERM);
+ __bpf_prog_run_save_cb, 0);
+ if (ret && !IS_ERR_VALUE((long)ret))
+ ret = -EFAULT;
}
bpf_restore_data_end(skb, saved_data_end);
__skb_pull(skb, offset);
@@ -1093,7 +1094,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
/**
* __cgroup_bpf_run_filter_sk() - Run a program on a sock
* @sk: sock structure to manipulate
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* socket is passed is expected to be of type INET or INET6.
*
@@ -1107,10 +1108,9 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk,
+ bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -1119,7 +1119,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* provided by user sockaddr
* @sk: sock struct that will use sockaddr
* @uaddr: sockaddr struct provided by user
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
* @flags: Pointer to u32 which contains higher bits of BPF program
* return value (OR'ed together).
@@ -1142,7 +1142,6 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
};
struct sockaddr_storage unspec;
struct cgroup *cgrp;
- int ret;
/* Check socket family since not all sockets represent network
* endpoint (e.g. AF_UNIX).
@@ -1156,10 +1155,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run, flags);
-
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0, flags);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
@@ -1169,7 +1166,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
* @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
* sk with connection information (IP addresses, etc.) May not contain
* cgroup info if it is a req sock.
- * @type: The type of program to be exectuted
+ * @type: The type of program to be executed
*
* socket passed is expected to be of type INET or INET6.
*
@@ -1184,11 +1181,9 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- int ret;
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
- bpf_prog_run);
- return ret == 1 ? 0 : -EPERM;
+ return BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
+ bpf_prog_run, 0);
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
@@ -1201,17 +1196,47 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
.major = major,
.minor = minor,
};
- int allow;
+ int ret;
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
- bpf_prog_run);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0);
rcu_read_unlock();
- return !allow;
+ return ret;
+}
+
+BPF_CALL_0(bpf_get_retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ return ctx->retval;
+}
+
+static const struct bpf_func_proto bpf_get_retval_proto = {
+ .func = bpf_get_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_set_retval, int, retval)
+{
+ struct bpf_cg_run_ctx *ctx =
+ container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx);
+
+ ctx->retval = retval;
+ return 0;
}
+static const struct bpf_func_proto bpf_set_retval_proto = {
+ .func = bpf_set_retval,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1224,6 +1249,10 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_cgroup_id_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_retval:
+ return &bpf_get_retval_proto;
+ case BPF_FUNC_set_retval:
+ return &bpf_set_retval_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -1337,7 +1366,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, 0);
rcu_read_unlock();
kfree(ctx.cur_val);
@@ -1350,24 +1380,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
kfree(ctx.new_val);
}
- return ret == 1 ? 0 : -EPERM;
+ return ret;
}
#ifdef CONFIG_NET
-static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum cgroup_bpf_attach_type attach_type)
-{
- struct bpf_prog_array *prog_array;
- bool empty;
-
- rcu_read_lock();
- prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
- empty = bpf_prog_array_is_empty(prog_array);
- rcu_read_unlock();
-
- return empty;
-}
-
static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
struct bpf_sockopt_buf *buf)
{
@@ -1426,19 +1442,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
};
int ret, max_optlen;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
- return 0;
-
/* Allocate a bit more than the initial user buffer for
* BPF program. The canonical use case is overriding
* TCP_CONGESTION(nv) to TCP_CONGESTION(cubic).
*/
max_optlen = max_t(int, 16, *optlen);
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1452,13 +1460,11 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
- &ctx, bpf_prog_run);
+ &ctx, bpf_prog_run, 0);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret)
goto out;
- }
if (ctx.optlen == -1) {
/* optlen set to -1, bypass kernel */
@@ -1518,19 +1524,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
+ .current_task = current,
};
int ret;
- /* Opportunistic check to see whether we have any BPF program
- * attached to the hook so we don't waste time allocating
- * memory and locking the socket.
- */
- if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
- return retval;
-
ctx.optlen = max_optlen;
-
max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
if (max_optlen < 0)
return max_optlen;
@@ -1562,27 +1560,17 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
lock_sock(sk);
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
+ &ctx, bpf_prog_run, retval);
release_sock(sk);
- if (!ret) {
- ret = -EPERM;
+ if (ret < 0)
goto out;
- }
if (ctx.optlen > max_optlen || ctx.optlen < 0) {
ret = -EFAULT;
goto out;
}
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval) {
- ret = -EFAULT;
- goto out;
- }
-
if (ctx.optlen != 0) {
if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
put_user(ctx.optlen, optlen)) {
@@ -1591,8 +1579,6 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
}
}
- ret = ctx.retval;
-
out:
sockopt_free_buf(&ctx, &buf);
return ret;
@@ -1607,10 +1593,10 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
.sk = sk,
.level = level,
.optname = optname,
- .retval = retval,
.optlen = *optlen,
.optval = optval,
.optval_end = optval + *optlen,
+ .current_task = current,
};
int ret;
@@ -1623,25 +1609,19 @@ int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
*/
ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
- &ctx, bpf_prog_run);
- if (!ret)
- return -EPERM;
+ &ctx, bpf_prog_run, retval);
+ if (ret < 0)
+ return ret;
if (ctx.optlen > *optlen)
return -EFAULT;
- /* BPF programs only allowed to set retval to 0, not some
- * arbitrary value.
- */
- if (ctx.retval != 0 && ctx.retval != retval)
- return -EFAULT;
-
/* BPF programs can shrink the buffer, export the modifications.
*/
if (ctx.optlen != 0)
*optlen = ctx.optlen;
- return ctx.retval;
+ return ret;
}
#endif
@@ -2057,10 +2037,39 @@ static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
break;
case offsetof(struct bpf_sockopt, retval):
- if (type == BPF_WRITE)
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
- else
- *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+ BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0);
+
+ if (type == BPF_WRITE) {
+ int treg = BPF_REG_9;
+
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ if (si->src_reg == treg || si->dst_reg == treg)
+ --treg;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ treg, treg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ treg, si->src_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg,
+ offsetof(struct bpf_sockopt_kern, tmp_reg));
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sockopt_kern, current_task));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct task_struct, bpf_ctx));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct bpf_cg_run_ctx, retval));
+ }
break;
case offsetof(struct bpf_sockopt, optval):
*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index de3e5bc6781f..13e9dbeeedf3 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -33,6 +33,7 @@
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
+#include <linux/nodemask.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@@ -105,6 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
+ fp->blinding_requested = bpf_jit_blinding_enabled(fp);
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
mutex_init(&fp->aux->used_maps_mutex);
@@ -537,13 +539,10 @@ long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
- const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
- unsigned long addr = (unsigned long)hdr;
-
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
prog->aux->ksym.start = (unsigned long) prog->bpf_func;
- prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len;
}
static void
@@ -808,6 +807,176 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
return slot;
}
+/*
+ * BPF program pack allocator.
+ *
+ * Most BPF programs are pretty small. Allocating a hole page for each
+ * program is sometime a waste. Many small bpf program also adds pressure
+ * to instruction TLB. To solve this issue, we introduce a BPF program pack
+ * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86)
+ * to host BPF programs.
+ */
+#define BPF_PROG_CHUNK_SHIFT 6
+#define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT)
+#define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1))
+
+struct bpf_prog_pack {
+ struct list_head list;
+ void *ptr;
+ unsigned long bitmap[];
+};
+
+#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
+
+static size_t bpf_prog_pack_size = -1;
+static size_t bpf_prog_pack_mask = -1;
+
+static int bpf_prog_chunk_count(void)
+{
+ WARN_ON_ONCE(bpf_prog_pack_size == -1);
+ return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE;
+}
+
+static DEFINE_MUTEX(pack_mutex);
+static LIST_HEAD(pack_list);
+
+/* PMD_SIZE is not available in some special config, e.g. ARCH=arm with
+ * CONFIG_MMU=n. Use PAGE_SIZE in these cases.
+ */
+#ifdef PMD_SIZE
+#define BPF_HPAGE_SIZE PMD_SIZE
+#define BPF_HPAGE_MASK PMD_MASK
+#else
+#define BPF_HPAGE_SIZE PAGE_SIZE
+#define BPF_HPAGE_MASK PAGE_MASK
+#endif
+
+static size_t select_bpf_prog_pack_size(void)
+{
+ size_t size;
+ void *ptr;
+
+ size = BPF_HPAGE_SIZE * num_online_nodes();
+ ptr = module_alloc(size);
+
+ /* Test whether we can get huge pages. If not just use PAGE_SIZE
+ * packs.
+ */
+ if (!ptr || !is_vm_area_hugepages(ptr)) {
+ size = PAGE_SIZE;
+ bpf_prog_pack_mask = PAGE_MASK;
+ } else {
+ bpf_prog_pack_mask = BPF_HPAGE_MASK;
+ }
+
+ vfree(ptr);
+ return size;
+}
+
+static struct bpf_prog_pack *alloc_new_pack(void)
+{
+ struct bpf_prog_pack *pack;
+
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())),
+ GFP_KERNEL);
+ if (!pack)
+ return NULL;
+ pack->ptr = module_alloc(bpf_prog_pack_size);
+ if (!pack->ptr) {
+ kfree(pack);
+ return NULL;
+ }
+ bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
+ list_add_tail(&pack->list, &pack_list);
+
+ set_vm_flush_reset_perms(pack->ptr);
+ set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+ set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+ return pack;
+}
+
+static void *bpf_prog_pack_alloc(u32 size)
+{
+ unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size);
+ struct bpf_prog_pack *pack;
+ unsigned long pos;
+ void *ptr = NULL;
+
+ mutex_lock(&pack_mutex);
+ if (bpf_prog_pack_size == -1)
+ bpf_prog_pack_size = select_bpf_prog_pack_size();
+
+ if (size > bpf_prog_pack_size) {
+ size = round_up(size, PAGE_SIZE);
+ ptr = module_alloc(size);
+ if (ptr) {
+ set_vm_flush_reset_perms(ptr);
+ set_memory_ro((unsigned long)ptr, size / PAGE_SIZE);
+ set_memory_x((unsigned long)ptr, size / PAGE_SIZE);
+ }
+ goto out;
+ }
+ list_for_each_entry(pack, &pack_list, list) {
+ pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+ nbits, 0);
+ if (pos < bpf_prog_chunk_count())
+ goto found_free_area;
+ }
+
+ pack = alloc_new_pack();
+ if (!pack)
+ goto out;
+
+ pos = 0;
+
+found_free_area:
+ bitmap_set(pack->bitmap, pos, nbits);
+ ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT);
+
+out:
+ mutex_unlock(&pack_mutex);
+ return ptr;
+}
+
+static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
+{
+ struct bpf_prog_pack *pack = NULL, *tmp;
+ unsigned int nbits;
+ unsigned long pos;
+ void *pack_ptr;
+
+ mutex_lock(&pack_mutex);
+ if (hdr->size > bpf_prog_pack_size) {
+ module_memfree(hdr);
+ goto out;
+ }
+
+ pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask);
+
+ list_for_each_entry(tmp, &pack_list, list) {
+ if (tmp->ptr == pack_ptr) {
+ pack = tmp;
+ break;
+ }
+ }
+
+ if (WARN_ONCE(!pack, "bpf_prog_pack bug\n"))
+ goto out;
+
+ nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
+ pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+
+ bitmap_clear(pack->bitmap, pos, nbits);
+ if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+ bpf_prog_chunk_count(), 0) == 0) {
+ list_del(&pack->list);
+ module_memfree(pack->ptr);
+ kfree(pack);
+ }
+out:
+ mutex_unlock(&pack_mutex);
+}
+
static atomic_long_t bpf_jit_current;
/* Can be overridden by an arch's JIT compiler if it has a custom,
@@ -833,12 +1002,11 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
-int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 size)
{
- if (atomic_long_add_return(pages, &bpf_jit_current) >
- (bpf_jit_limit >> PAGE_SHIFT)) {
+ if (atomic_long_add_return(size, &bpf_jit_current) > bpf_jit_limit) {
if (!bpf_capable()) {
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
return -EPERM;
}
}
@@ -846,9 +1014,9 @@ int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 size)
{
- atomic_long_sub(pages, &bpf_jit_current);
+ atomic_long_sub(size, &bpf_jit_current);
}
void *__weak bpf_jit_alloc_exec(unsigned long size)
@@ -867,7 +1035,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_binary_header *hdr;
- u32 size, hole, start, pages;
+ u32 size, hole, start;
WARN_ON_ONCE(!is_power_of_2(alignment) ||
alignment > BPF_IMAGE_ALIGNMENT);
@@ -877,20 +1045,19 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
* random section of illegal instructions.
*/
size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
- pages = size / PAGE_SIZE;
- if (bpf_jit_charge_modmem(pages))
+ if (bpf_jit_charge_modmem(size))
return NULL;
hdr = bpf_jit_alloc_exec(size);
if (!hdr) {
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
return NULL;
}
/* Fill space with illegal/arch-dep instructions. */
bpf_fill_ill_insns(hdr, size);
- hdr->pages = pages;
+ hdr->size = size;
hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
PAGE_SIZE - sizeof(*hdr));
start = (get_random_int() % hole) & ~(alignment - 1);
@@ -903,10 +1070,117 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
void bpf_jit_binary_free(struct bpf_binary_header *hdr)
{
- u32 pages = hdr->pages;
+ u32 size = hdr->size;
bpf_jit_free_exec(hdr);
- bpf_jit_uncharge_modmem(pages);
+ bpf_jit_uncharge_modmem(size);
+}
+
+/* Allocate jit binary from bpf_prog_pack allocator.
+ * Since the allocated memory is RO+X, the JIT engine cannot write directly
+ * to the memory. To solve this problem, a RW buffer is also allocated at
+ * as the same time. The JIT engine should calculate offsets based on the
+ * RO memory address, but write JITed program to the RW buffer. Once the
+ * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies
+ * the JITed program to the RO memory.
+ */
+struct bpf_binary_header *
+bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ struct bpf_binary_header **rw_header,
+ u8 **rw_image,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *ro_header;
+ u32 size, hole, start;
+
+ WARN_ON_ONCE(!is_power_of_2(alignment) ||
+ alignment > BPF_IMAGE_ALIGNMENT);
+
+ /* add 16 bytes for a random section of illegal instructions */
+ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE);
+
+ if (bpf_jit_charge_modmem(size))
+ return NULL;
+ ro_header = bpf_prog_pack_alloc(size);
+ if (!ro_header) {
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ *rw_header = kvmalloc(size, GFP_KERNEL);
+ if (!*rw_header) {
+ bpf_arch_text_copy(&ro_header->size, &size, sizeof(size));
+ bpf_prog_pack_free(ro_header);
+ bpf_jit_uncharge_modmem(size);
+ return NULL;
+ }
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(*rw_header, size);
+ (*rw_header)->size = size;
+
+ hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)),
+ BPF_PROG_CHUNK_SIZE - sizeof(*ro_header));
+ start = (get_random_int() % hole) & ~(alignment - 1);
+
+ *image_ptr = &ro_header->image[start];
+ *rw_image = &(*rw_header)->image[start];
+
+ return ro_header;
+}
+
+/* Copy JITed text from rw_header to its final location, the ro_header. */
+int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
+ struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ void *ptr;
+
+ ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size);
+
+ kvfree(rw_header);
+
+ if (IS_ERR(ptr)) {
+ bpf_prog_pack_free(ro_header);
+ return PTR_ERR(ptr);
+ }
+ prog->aux->use_bpf_prog_pack = true;
+ return 0;
+}
+
+/* bpf_jit_binary_pack_free is called in two different scenarios:
+ * 1) when the program is freed after;
+ * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize).
+ * For case 2), we need to free both the RO memory and the RW buffer.
+ *
+ * bpf_jit_binary_pack_free requires proper ro_header->size. However,
+ * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size
+ * must be set with either bpf_jit_binary_pack_finalize (normal path) or
+ * bpf_arch_text_copy (when jit fails).
+ */
+void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
+ struct bpf_binary_header *rw_header)
+{
+ u32 size = ro_header->size;
+
+ bpf_prog_pack_free(ro_header);
+ kvfree(rw_header);
+ bpf_jit_uncharge_modmem(size);
+}
+
+static inline struct bpf_binary_header *
+bpf_jit_binary_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ if (fp->aux->use_bpf_prog_pack)
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ else
+ addr = real_start & PAGE_MASK;
+
+ return (void *)addr;
}
/* This symbol is only overridden by archs that have different
@@ -918,7 +1192,10 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
- bpf_jit_binary_free(hdr);
+ if (fp->aux->use_bpf_prog_pack)
+ bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
+ else
+ bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
@@ -1146,7 +1423,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
struct bpf_insn *insn;
int i, rewritten;
- if (!bpf_jit_blinding_enabled(prog) || prog->blinded)
+ if (!prog->blinding_requested || prog->blinded)
return prog;
clone = bpf_prog_clone_create(prog, GFP_USER);
@@ -1829,28 +2106,30 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
}
#endif
-bool bpf_prog_array_compatible(struct bpf_array *array,
- const struct bpf_prog *fp)
+bool bpf_prog_map_compatible(struct bpf_map *map,
+ const struct bpf_prog *fp)
{
bool ret;
if (fp->kprobe_override)
return false;
- spin_lock(&array->aux->owner.lock);
-
- if (!array->aux->owner.type) {
+ spin_lock(&map->owner.lock);
+ if (!map->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->owner.type = fp->type;
- array->aux->owner.jited = fp->jited;
+ map->owner.type = fp->type;
+ map->owner.jited = fp->jited;
+ map->owner.xdp_has_frags = fp->aux->xdp_has_frags;
ret = true;
} else {
- ret = array->aux->owner.type == fp->type &&
- array->aux->owner.jited == fp->jited;
+ ret = map->owner.type == fp->type &&
+ map->owner.jited == fp->jited &&
+ map->owner.xdp_has_frags == fp->aux->xdp_has_frags;
}
- spin_unlock(&array->aux->owner.lock);
+ spin_unlock(&map->owner.lock);
+
return ret;
}
@@ -1862,13 +2141,11 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
mutex_lock(&aux->used_maps_mutex);
for (i = 0; i < aux->used_map_cnt; i++) {
struct bpf_map *map = aux->used_maps[i];
- struct bpf_array *array;
- if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
+ if (!map_type_contains_progs(map))
continue;
- array = container_of(map, struct bpf_array, map);
- if (!bpf_prog_array_compatible(array, fp)) {
+ if (!bpf_prog_map_compatible(map, fp)) {
ret = -EINVAL;
goto out;
}
@@ -1968,18 +2245,10 @@ static struct bpf_prog_dummy {
},
};
-/* to avoid allocating empty bpf_prog_array for cgroups that
- * don't have bpf program attached use one global 'empty_prog_array'
- * It will not be modified the caller of bpf_prog_array_alloc()
- * (since caller requested prog_cnt == 0)
- * that pointer should be 'freed' by bpf_prog_array_free()
- */
-static struct {
- struct bpf_prog_array hdr;
- struct bpf_prog *null_prog;
-} empty_prog_array = {
+struct bpf_empty_prog_array bpf_empty_prog_array = {
.null_prog = NULL,
};
+EXPORT_SYMBOL(bpf_empty_prog_array);
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
{
@@ -1989,12 +2258,12 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
(prog_cnt + 1),
flags);
- return &empty_prog_array.hdr;
+ return &bpf_empty_prog_array.hdr;
}
void bpf_prog_array_free(struct bpf_prog_array *progs)
{
- if (!progs || progs == &empty_prog_array.hdr)
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
return;
kfree_rcu(progs, rcu);
}
@@ -2453,6 +2722,11 @@ int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
return -ENOTSUPP;
}
+void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len)
+{
+ return ERR_PTR(-ENOTSUPP);
+}
+
DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
EXPORT_SYMBOL(bpf_stats_enabled_key);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index b3e6b9422238..650e5d21f90d 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -397,7 +397,8 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu,
+ struct bpf_map *map, int fd)
{
struct bpf_prog *prog;
@@ -405,7 +406,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP ||
+ !bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return -EINVAL;
}
@@ -457,7 +459,7 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
rcpu->map_id = map->id;
rcpu->value.qsize = value->qsize;
- if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, map, fd))
goto free_ptr_ring;
/* Setup kthread */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index fe019dbdb3f0..038f6d7a83e4 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -858,7 +858,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
BPF_PROG_TYPE_XDP, false);
if (IS_ERR(prog))
goto err_put_dev;
- if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
+ !bpf_prog_map_compatible(&dtab->map, prog))
goto err_put_prog;
}
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d29af9988f37..65877967f414 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1636,7 +1636,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
value_size = size * num_possible_cpus();
total = 0;
/* while experimenting with hash tables with sizes ranging from 10 to
- * 1000, it was observed that a bucket can have upto 5 entries.
+ * 1000, it was observed that a bucket can have up to 5 entries.
*/
bucket_size = 5;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 01cfdf40c838..315053ef6a75 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/bpf-cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
@@ -16,6 +17,7 @@
#include <linux/pid_namespace.h>
#include <linux/proc_ns.h>
#include <linux/security.h>
+#include <linux/btf_ids.h>
#include "../../lib/kstrtox.h"
@@ -223,13 +225,8 @@ BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size)
if (unlikely(!task))
goto err_clear;
- strncpy(buf, task->comm, size);
-
- /* Verifier guarantees that size > 0. For task->comm exceeding
- * size, guarantee that buf is %NUL-terminated. Unconditionally
- * done here to save the size test.
- */
- buf[size - 1] = 0;
+ /* Verifier guarantees that size > 0 */
+ strscpy(buf, task->comm, size);
return 0;
err_clear:
memset(buf, 0, size);
@@ -671,6 +668,39 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_5(bpf_copy_from_user_task, void *, dst, u32, size,
+ const void __user *, user_ptr, struct task_struct *, tsk, u64, flags)
+{
+ int ret;
+
+ /* flags is not used yet */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(!size))
+ return 0;
+
+ ret = access_process_vm(tsk, (unsigned long)user_ptr, dst, size, 0);
+ if (ret == size)
+ return 0;
+
+ memset(dst, 0, size);
+ /* Return -EFAULT for partial read */
+ return ret < 0 ? ret : -EFAULT;
+}
+
+const struct bpf_func_proto bpf_copy_from_user_task_proto = {
+ .func = bpf_copy_from_user_task,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg2_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_BTF_ID,
+ .arg4_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg5_type = ARG_ANYTHING
+};
+
BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
{
if (cpu >= nr_cpu_ids)
@@ -1058,7 +1088,7 @@ struct bpf_hrtimer {
struct bpf_timer_kern {
struct bpf_hrtimer *timer;
/* bpf_spin_lock is used here instead of spinlock_t to make
- * sure that it always fits into space resereved by struct bpf_timer
+ * sure that it always fits into space reserved by struct bpf_timer
* regardless of LOCKDEP and spinlock debug flags.
*/
struct bpf_spin_lock lock;
@@ -1075,6 +1105,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
void *key;
u32 idx;
+ BTF_TYPE_EMIT(struct bpf_timer);
callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5a8d9f7467bf..4f841e16779e 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -710,11 +710,10 @@ static DEFINE_MUTEX(bpf_preload_lock);
static int populate_bpffs(struct dentry *parent)
{
struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
- struct bpf_link *links[BPF_PRELOAD_LINKS] = {};
int err = 0, i;
/* grab the mutex to make sure the kernel interactions with bpf_preload
- * UMD are serialized
+ * are serialized
*/
mutex_lock(&bpf_preload_lock);
@@ -722,40 +721,22 @@ static int populate_bpffs(struct dentry *parent)
if (!bpf_preload_mod_get())
goto out;
- if (!bpf_preload_ops->info.tgid) {
- /* preload() will start UMD that will load BPF iterator programs */
- err = bpf_preload_ops->preload(objs);
- if (err)
+ err = bpf_preload_ops->preload(objs);
+ if (err)
+ goto out_put;
+ for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
+ bpf_link_inc(objs[i].link);
+ err = bpf_iter_link_pin_kernel(parent,
+ objs[i].link_name, objs[i].link);
+ if (err) {
+ bpf_link_put(objs[i].link);
goto out_put;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- links[i] = bpf_link_by_id(objs[i].link_id);
- if (IS_ERR(links[i])) {
- err = PTR_ERR(links[i]);
- goto out_put;
- }
}
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- err = bpf_iter_link_pin_kernel(parent,
- objs[i].link_name, links[i]);
- if (err)
- goto out_put;
- /* do not unlink successfully pinned links even
- * if later link fails to pin
- */
- links[i] = NULL;
- }
- /* finish() will tell UMD process to exit */
- err = bpf_preload_ops->finish();
- if (err)
- goto out_put;
}
out_put:
bpf_preload_mod_put();
out:
mutex_unlock(&bpf_preload_lock);
- for (i = 0; i < BPF_PRELOAD_LINKS && err; i++)
- if (!IS_ERR_OR_NULL(links[i]))
- bpf_link_put(links[i]);
return err;
}
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 23f7f9d08a62..497916060ac7 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf-cgroup.h>
#include <linux/bpf.h>
#include <linux/bpf_local_storage.h>
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index 26bced262473..c9d45c9d6918 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -18,10 +18,9 @@ menuconfig BPF_PRELOAD
if BPF_PRELOAD
config BPF_PRELOAD_UMD
- tristate "bpf_preload kernel module with user mode driver"
- depends on CC_CAN_LINK
- depends on m || CC_CAN_LINK_STATIC
+ tristate "bpf_preload kernel module"
default m
help
- This builds bpf_preload kernel module with embedded user mode driver.
+ This builds bpf_preload kernel module with embedded BPF programs for
+ introspection in bpffs.
endif
diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile
index 1400ac58178e..20f89cc0a0a6 100644
--- a/kernel/bpf/preload/Makefile
+++ b/kernel/bpf/preload/Makefile
@@ -1,42 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
-LIBBPF_OUT = $(abspath $(obj))/libbpf
-LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
-LIBBPF_DESTDIR = $(LIBBPF_OUT)
-LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
-
-# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
-# in tools/scripts/Makefile.include always succeeds when building the kernel
-# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
-$(LIBBPF_A): | $(LIBBPF_OUT)
- $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \
- DESTDIR=$(LIBBPF_DESTDIR) prefix= \
- $(LIBBPF_OUT)/libbpf.a install_headers
-
-libbpf_hdrs: $(LIBBPF_A)
-
-.PHONY: libbpf_hdrs
-
-$(LIBBPF_OUT):
- $(call msg,MKDIR,$@)
- $(Q)mkdir -p $@
-
-userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
- -I $(LIBBPF_INCLUDE) -Wno-unused-result
-
-userprogs := bpf_preload_umd
-
-clean-files := libbpf/
-
-$(obj)/iterators/iterators.o: | libbpf_hdrs
-
-bpf_preload_umd-objs := iterators/iterators.o
-bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
-
-$(obj)/bpf_preload_umd: $(LIBBPF_A)
-
-$(obj)/bpf_preload_umd_blob.o: $(obj)/bpf_preload_umd
+LIBBPF_INCLUDE = $(srctree)/tools/lib
obj-$(CONFIG_BPF_PRELOAD_UMD) += bpf_preload.o
-bpf_preload-objs += bpf_preload_kern.o bpf_preload_umd_blob.o
+CFLAGS_bpf_preload_kern.o += -I$(LIBBPF_INCLUDE)
+bpf_preload-objs += bpf_preload_kern.o
diff --git a/kernel/bpf/preload/bpf_preload.h b/kernel/bpf/preload/bpf_preload.h
index 2f9932276f2e..f065c91213a0 100644
--- a/kernel/bpf/preload/bpf_preload.h
+++ b/kernel/bpf/preload/bpf_preload.h
@@ -2,13 +2,13 @@
#ifndef _BPF_PRELOAD_H
#define _BPF_PRELOAD_H
-#include <linux/usermode_driver.h>
-#include "iterators/bpf_preload_common.h"
+struct bpf_preload_info {
+ char link_name[16];
+ struct bpf_link *link;
+};
struct bpf_preload_ops {
- struct umd_info info;
int (*preload)(struct bpf_preload_info *);
- int (*finish)(void);
struct module *owner;
};
extern struct bpf_preload_ops *bpf_preload_ops;
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 53736e52c1df..5106b5372f0c 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -2,101 +2,87 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/pid.h>
-#include <linux/fs.h>
-#include <linux/sched/signal.h>
#include "bpf_preload.h"
+#include "iterators/iterators.lskel.h"
-extern char bpf_preload_umd_start;
-extern char bpf_preload_umd_end;
+static struct bpf_link *maps_link, *progs_link;
+static struct iterators_bpf *skel;
-static int preload(struct bpf_preload_info *obj);
-static int finish(void);
+static void free_links_and_skel(void)
+{
+ if (!IS_ERR_OR_NULL(maps_link))
+ bpf_link_put(maps_link);
+ if (!IS_ERR_OR_NULL(progs_link))
+ bpf_link_put(progs_link);
+ iterators_bpf__destroy(skel);
+}
+
+static int preload(struct bpf_preload_info *obj)
+{
+ strlcpy(obj[0].link_name, "maps.debug", sizeof(obj[0].link_name));
+ obj[0].link = maps_link;
+ strlcpy(obj[1].link_name, "progs.debug", sizeof(obj[1].link_name));
+ obj[1].link = progs_link;
+ return 0;
+}
-static struct bpf_preload_ops umd_ops = {
- .info.driver_name = "bpf_preload",
+static struct bpf_preload_ops ops = {
.preload = preload,
- .finish = finish,
.owner = THIS_MODULE,
};
-static int preload(struct bpf_preload_info *obj)
+static int load_skel(void)
{
- int magic = BPF_PRELOAD_START;
- loff_t pos = 0;
- int i, err;
- ssize_t n;
+ int err;
- err = fork_usermode_driver(&umd_ops.info);
+ skel = iterators_bpf__open();
+ if (!skel)
+ return -ENOMEM;
+ err = iterators_bpf__load(skel);
if (err)
- return err;
-
- /* send the start magic to let UMD proceed with loading BPF progs */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- /* receive bpf_link IDs and names from UMD */
- pos = 0;
- for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
- n = kernel_read(umd_ops.info.pipe_from_umh,
- &obj[i], sizeof(*obj), &pos);
- if (n != sizeof(*obj))
- return -EPIPE;
+ goto out;
+ err = iterators_bpf__attach(skel);
+ if (err)
+ goto out;
+ maps_link = bpf_link_get_from_fd(skel->links.dump_bpf_map_fd);
+ if (IS_ERR(maps_link)) {
+ err = PTR_ERR(maps_link);
+ goto out;
}
- return 0;
-}
-
-static int finish(void)
-{
- int magic = BPF_PRELOAD_END;
- struct pid *tgid;
- loff_t pos = 0;
- ssize_t n;
-
- /* send the last magic to UMD. It will do a normal exit. */
- n = kernel_write(umd_ops.info.pipe_to_umh,
- &magic, sizeof(magic), &pos);
- if (n != sizeof(magic))
- return -EPIPE;
-
- tgid = umd_ops.info.tgid;
- if (tgid) {
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
+ progs_link = bpf_link_get_from_fd(skel->links.dump_bpf_prog_fd);
+ if (IS_ERR(progs_link)) {
+ err = PTR_ERR(progs_link);
+ goto out;
}
+ /* Avoid taking over stdin/stdout/stderr of init process. Zeroing out
+ * makes skel_closenz() a no-op later in iterators_bpf__destroy().
+ */
+ close_fd(skel->links.dump_bpf_map_fd);
+ skel->links.dump_bpf_map_fd = 0;
+ close_fd(skel->links.dump_bpf_prog_fd);
+ skel->links.dump_bpf_prog_fd = 0;
return 0;
+out:
+ free_links_and_skel();
+ return err;
}
-static int __init load_umd(void)
+static int __init load(void)
{
int err;
- err = umd_load_blob(&umd_ops.info, &bpf_preload_umd_start,
- &bpf_preload_umd_end - &bpf_preload_umd_start);
+ err = load_skel();
if (err)
return err;
- bpf_preload_ops = &umd_ops;
+ bpf_preload_ops = &ops;
return err;
}
-static void __exit fini_umd(void)
+static void __exit fini(void)
{
- struct pid *tgid;
-
bpf_preload_ops = NULL;
-
- /* kill UMD in case it's still there due to earlier error */
- tgid = umd_ops.info.tgid;
- if (tgid) {
- kill_pid(tgid, SIGKILL, 1);
-
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_cleanup_helper(&umd_ops.info);
- }
- umd_unload_blob(&umd_ops.info);
+ free_links_and_skel();
}
-late_initcall(load_umd);
-module_exit(fini_umd);
+late_initcall(load);
+module_exit(fini);
MODULE_LICENSE("GPL");
diff --git a/kernel/bpf/preload/bpf_preload_umd_blob.S b/kernel/bpf/preload/bpf_preload_umd_blob.S
deleted file mode 100644
index f1f40223b5c3..000000000000
--- a/kernel/bpf/preload/bpf_preload_umd_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
- .section .init.rodata, "a"
- .global bpf_preload_umd_start
-bpf_preload_umd_start:
- .incbin "kernel/bpf/preload/bpf_preload_umd"
- .global bpf_preload_umd_end
-bpf_preload_umd_end:
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index b8bd60511227..bfe24f8c5a20 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -35,15 +35,15 @@ endif
.PHONY: all clean
-all: iterators.skel.h
+all: iterators.lskel.h
clean:
$(call msg,CLEAN)
$(Q)rm -rf $(OUTPUT) iterators
-iterators.skel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
+iterators.lskel.h: $(OUTPUT)/iterators.bpf.o | $(BPFTOOL)
$(call msg,GEN-SKEL,$@)
- $(Q)$(BPFTOOL) gen skeleton $< > $@
+ $(Q)$(BPFTOOL) gen skeleton -L $< > $@
$(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
diff --git a/kernel/bpf/preload/iterators/bpf_preload_common.h b/kernel/bpf/preload/iterators/bpf_preload_common.h
deleted file mode 100644
index 8464d1a48c05..000000000000
--- a/kernel/bpf/preload/iterators/bpf_preload_common.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BPF_PRELOAD_COMMON_H
-#define _BPF_PRELOAD_COMMON_H
-
-#define BPF_PRELOAD_START 0x5555
-#define BPF_PRELOAD_END 0xAAAA
-
-struct bpf_preload_info {
- char link_name[16];
- int link_id;
-};
-
-#endif
diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c
deleted file mode 100644
index 5d872a705470..000000000000
--- a/kernel/bpf/preload/iterators/iterators.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (c) 2020 Facebook */
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/resource.h>
-#include <bpf/libbpf.h>
-#include <bpf/bpf.h>
-#include <sys/mount.h>
-#include "iterators.skel.h"
-#include "bpf_preload_common.h"
-
-int to_kernel = -1;
-int from_kernel = 0;
-
-static int send_link_to_kernel(struct bpf_link *link, const char *link_name)
-{
- struct bpf_preload_info obj = {};
- struct bpf_link_info info = {};
- __u32 info_len = sizeof(info);
- int err;
-
- err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &info, &info_len);
- if (err)
- return err;
- obj.link_id = info.id;
- if (strlen(link_name) >= sizeof(obj.link_name))
- return -E2BIG;
- strcpy(obj.link_name, link_name);
- if (write(to_kernel, &obj, sizeof(obj)) != sizeof(obj))
- return -EPIPE;
- return 0;
-}
-
-int main(int argc, char **argv)
-{
- struct rlimit rlim = { RLIM_INFINITY, RLIM_INFINITY };
- struct iterators_bpf *skel;
- int err, magic;
- int debug_fd;
-
- debug_fd = open("/dev/console", O_WRONLY | O_NOCTTY | O_CLOEXEC);
- if (debug_fd < 0)
- return 1;
- to_kernel = dup(1);
- close(1);
- dup(debug_fd);
- /* now stdin and stderr point to /dev/console */
-
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_START) {
- printf("bad start magic %d\n", magic);
- return 1;
- }
- setrlimit(RLIMIT_MEMLOCK, &rlim);
- /* libbpf opens BPF object and loads it into the kernel */
- skel = iterators_bpf__open_and_load();
- if (!skel) {
- /* iterators.skel.h is little endian.
- * libbpf doesn't support automatic little->big conversion
- * of BPF bytecode yet.
- * The program load will fail in such case.
- */
- printf("Failed load could be due to wrong endianness\n");
- return 1;
- }
- err = iterators_bpf__attach(skel);
- if (err)
- goto cleanup;
-
- /* send two bpf_link IDs with names to the kernel */
- err = send_link_to_kernel(skel->links.dump_bpf_map, "maps.debug");
- if (err)
- goto cleanup;
- err = send_link_to_kernel(skel->links.dump_bpf_prog, "progs.debug");
- if (err)
- goto cleanup;
-
- /* The kernel will proceed with pinnging the links in bpffs.
- * UMD will wait on read from pipe.
- */
- read(from_kernel, &magic, sizeof(magic));
- if (magic != BPF_PRELOAD_END) {
- printf("bad final magic %d\n", magic);
- err = -EINVAL;
- }
-cleanup:
- iterators_bpf__destroy(skel);
-
- return err != 0;
-}
diff --git a/kernel/bpf/preload/iterators/iterators.lskel.h b/kernel/bpf/preload/iterators/iterators.lskel.h
new file mode 100644
index 000000000000..70f236a82fe1
--- /dev/null
+++ b/kernel/bpf/preload/iterators/iterators.lskel.h
@@ -0,0 +1,425 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* THIS FILE IS AUTOGENERATED! */
+#ifndef __ITERATORS_BPF_SKEL_H__
+#define __ITERATORS_BPF_SKEL_H__
+
+#include <bpf/skel_internal.h>
+
+struct iterators_bpf {
+ struct bpf_loader_ctx ctx;
+ struct {
+ struct bpf_map_desc rodata;
+ } maps;
+ struct {
+ struct bpf_prog_desc dump_bpf_map;
+ struct bpf_prog_desc dump_bpf_prog;
+ } progs;
+ struct {
+ int dump_bpf_map_fd;
+ int dump_bpf_prog_fd;
+ } links;
+ struct iterators_bpf__rodata {
+ } *rodata;
+};
+
+static inline int
+iterators_bpf__dump_bpf_map__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_map.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_map_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__dump_bpf_prog__attach(struct iterators_bpf *skel)
+{
+ int prog_fd = skel->progs.dump_bpf_prog.prog_fd;
+ int fd = skel_link_create(prog_fd, 0, BPF_TRACE_ITER);
+
+ if (fd > 0)
+ skel->links.dump_bpf_prog_fd = fd;
+ return fd;
+}
+
+static inline int
+iterators_bpf__attach(struct iterators_bpf *skel)
+{
+ int ret = 0;
+
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_map__attach(skel);
+ ret = ret < 0 ? ret : iterators_bpf__dump_bpf_prog__attach(skel);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void
+iterators_bpf__detach(struct iterators_bpf *skel)
+{
+ skel_closenz(skel->links.dump_bpf_map_fd);
+ skel_closenz(skel->links.dump_bpf_prog_fd);
+}
+static void
+iterators_bpf__destroy(struct iterators_bpf *skel)
+{
+ if (!skel)
+ return;
+ iterators_bpf__detach(skel);
+ skel_closenz(skel->progs.dump_bpf_map.prog_fd);
+ skel_closenz(skel->progs.dump_bpf_prog.prog_fd);
+ skel_free_map_data(skel->rodata, skel->maps.rodata.initial_value, 4096);
+ skel_closenz(skel->maps.rodata.map_fd);
+ skel_free(skel);
+}
+static inline struct iterators_bpf *
+iterators_bpf__open(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = skel_alloc(sizeof(*skel));
+ if (!skel)
+ goto cleanup;
+ skel->ctx.sz = (void *)&skel->links - (void *)skel;
+ skel->rodata = skel_prep_map_data((void *)"\
+\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\
+\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\
+\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0", 4096, 98);
+ if (!skel->rodata)
+ goto cleanup;
+ skel->maps.rodata.initial_value = (__u64) (long) skel->rodata;
+ return skel;
+cleanup:
+ iterators_bpf__destroy(skel);
+ return NULL;
+}
+
+static inline int
+iterators_bpf__load(struct iterators_bpf *skel)
+{
+ struct bpf_load_and_run_opts opts = {};
+ int err;
+
+ opts.ctx = (struct bpf_loader_ctx *)skel;
+ opts.data_sz = 6056;
+ opts.data = (void *)"\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x9f\xeb\x01\0\
+\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\xf9\x04\0\0\0\0\0\0\0\0\0\x02\x02\0\
+\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\0\0\0\x04\
+\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\0\0\0\0\0\
+\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\0\0\0\x20\
+\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xa3\0\0\0\x03\0\0\x04\x18\0\0\0\xb1\0\
+\0\0\x09\0\0\0\0\0\0\0\xb5\0\0\0\x0b\0\0\0\x40\0\0\0\xc0\0\0\0\x0b\0\0\0\x80\0\
+\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xc8\0\0\0\0\0\0\x07\0\0\0\0\xd1\0\0\0\0\0\0\
+\x08\x0c\0\0\0\xd7\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\x94\x01\0\0\x03\0\0\x04\
+\x18\0\0\0\x9c\x01\0\0\x0e\0\0\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xa4\
+\x01\0\0\x0e\0\0\0\xa0\0\0\0\xb0\x01\0\0\0\0\0\x08\x0f\0\0\0\xb6\x01\0\0\0\0\0\
+\x01\x04\0\0\0\x20\0\0\0\xc3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\0\0\0\
+\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xc8\x01\0\0\0\0\0\x01\x04\0\0\0\
+\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x2c\x02\0\0\x02\0\0\x04\x10\0\0\0\x13\0\
+\0\0\x03\0\0\0\0\0\0\0\x3f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x18\0\
+\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x44\x02\0\0\x01\0\0\x0c\
+\x16\0\0\0\x90\x02\0\0\x01\0\0\x04\x08\0\0\0\x99\x02\0\0\x19\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x02\x1a\0\0\0\xea\x02\0\0\x06\0\0\x04\x38\0\0\0\x9c\x01\0\0\x0e\0\0\
+\0\0\0\0\0\x9f\x01\0\0\x11\0\0\0\x20\0\0\0\xf7\x02\0\0\x1b\0\0\0\xc0\0\0\0\x08\
+\x03\0\0\x15\0\0\0\0\x01\0\0\x11\x03\0\0\x1d\0\0\0\x40\x01\0\0\x1b\x03\0\0\x1e\
+\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\0\0\0\0\
+\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x65\x03\0\0\x02\0\0\x04\
+\x08\0\0\0\x73\x03\0\0\x0e\0\0\0\0\0\0\0\x7c\x03\0\0\x0e\0\0\0\x20\0\0\0\x1b\
+\x03\0\0\x03\0\0\x04\x18\0\0\0\x86\x03\0\0\x1b\0\0\0\0\0\0\0\x8e\x03\0\0\x21\0\
+\0\0\x40\0\0\0\x94\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\0\0\0\
+\0\0\0\0\0\x02\x24\0\0\0\x98\x03\0\0\x01\0\0\x04\x04\0\0\0\xa3\x03\0\0\x0e\0\0\
+\0\0\0\0\0\x0c\x04\0\0\x01\0\0\x04\x04\0\0\0\x15\x04\0\0\x0e\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x8b\x04\0\0\0\0\0\x0e\x25\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\x9f\x04\
+\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\
+\x20\0\0\0\xb5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\
+\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xca\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xe1\x04\0\0\0\0\0\x0e\x2d\0\0\
+\0\x01\0\0\0\xe9\x04\0\0\x04\0\0\x0f\x62\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\0\x28\
+\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\0\0\
+\x11\0\0\0\xf1\x04\0\0\x01\0\0\x0f\x04\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\x74\
+\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x30\
+\x3a\x30\0\x2f\x77\x2f\x6e\x65\x74\x2d\x6e\x65\x78\x74\x2f\x6b\x65\x72\x6e\x65\
+\x6c\x2f\x62\x70\x66\x2f\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\
+\x74\x6f\x72\x73\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\
+\x63\0\x09\x73\x74\x72\x75\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\
+\x73\x65\x71\x20\x3d\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\
+\x71\x3b\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\
+\x73\x65\x73\x73\x69\x6f\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\
+\x65\x71\x5f\x66\x69\x6c\x65\0\x5f\x5f\x75\x36\x34\0\x75\x6e\x73\x69\x67\x6e\
+\x65\x64\x20\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\0\x30\x3a\x31\0\x09\x73\x74\
+\x72\x75\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\
+\x20\x63\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\
+\x29\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x20\x63\
+\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\x3b\0\x30\
+\x3a\x32\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\
+\x29\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\
+\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\
+\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\
+\0\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
+\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
+\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
+\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
+\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
+\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
+\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
+\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
+\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
+\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
+\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
+\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
+\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
+\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
+\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
+\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
+\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
+\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
+\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
+\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
+\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
+\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
+\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
+\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
+\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
+\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
+\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
+\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
+\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
+\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
+\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
+\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
+\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
+\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
+\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
+\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
+\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
+\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
+\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
+\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
+\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x2d\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x02\0\0\
+\0\x04\0\0\0\x62\0\0\0\x01\0\0\0\x80\x04\0\0\0\0\0\0\0\0\0\0\x69\x74\x65\x72\
+\x61\x74\x6f\x72\x2e\x72\x6f\x64\x61\x74\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\x2f\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\
+\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\
+\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x0a\0\x20\x20\x69\
+\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\
+\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\
+\x25\x73\x20\x25\x73\x0a\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\
+\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\x1b\0\0\
+\0\0\0\x79\x11\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\
+\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\
+\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\
+\0\0\0\0\0\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\
+\x7b\x1a\xf8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\
+\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x23\0\0\0\xb7\x03\0\0\x0e\0\0\0\
+\xb7\x05\0\0\x18\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x07\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x3c\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x3c\x01\0\x02\0\0\0\x42\0\0\0\xee\0\0\0\x1d\x44\x01\0\x03\0\0\
+\0\x42\0\0\0\x0f\x01\0\0\x06\x4c\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\x40\
+\x01\0\x05\0\0\0\x42\0\0\0\x1a\x01\0\0\x1d\x40\x01\0\x06\0\0\0\x42\0\0\0\x43\
+\x01\0\0\x06\x58\x01\0\x08\0\0\0\x42\0\0\0\x56\x01\0\0\x03\x5c\x01\0\x0f\0\0\0\
+\x42\0\0\0\xdc\x01\0\0\x02\x64\x01\0\x1f\0\0\0\x42\0\0\0\x2a\x02\0\0\x01\x6c\
+\x01\0\0\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\
+\0\x10\0\0\0\x02\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x02\0\0\0\x3e\0\0\0\0\0\0\0\
+\x28\0\0\0\x08\0\0\0\x3f\x01\0\0\0\0\0\0\x78\0\0\0\x0d\0\0\0\x3e\0\0\0\0\0\0\0\
+\x88\0\0\0\x0d\0\0\0\xea\0\0\0\0\0\0\0\xa8\0\0\0\x0d\0\0\0\x3f\x01\0\0\0\0\0\0\
+\x1a\0\0\0\x21\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\0\0\0\
+\0\0\0\0\x1c\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\
+\0\0\0\0\0\x0a\0\0\0\x01\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x10\0\0\0\0\0\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x6d\
+\x61\x70\0\0\0\0\0\0\0\0\x47\x50\x4c\0\0\0\0\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\
+\0\0\0\0\x79\x12\x08\0\0\0\0\0\x15\x02\x3c\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x79\
+\x27\0\0\0\0\0\0\x79\x11\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\
+\0\x07\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\
+\x31\0\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\
+\x6a\xc8\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\
+\x04\0\0\0\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\
+\x78\x30\0\0\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\
+\0\x61\x11\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\
+\0\0\0\0\0\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\
+\xb7\x02\0\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\
+\0\0\0\0\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\
+\xb7\x02\0\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\
+\xff\0\0\0\0\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\
+\xbf\x69\0\0\0\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\
+\xff\0\0\0\0\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\
+\x1a\xe8\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\
+\xc8\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x51\0\0\0\xb7\x03\0\0\x11\0\0\0\
+\xb7\x05\0\0\x20\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\
+\0\0\0\0\x17\0\0\0\0\0\0\0\x42\0\0\0\x7b\0\0\0\x1e\x80\x01\0\x01\0\0\0\x42\0\0\
+\0\x7b\0\0\0\x24\x80\x01\0\x02\0\0\0\x42\0\0\0\x60\x02\0\0\x1f\x88\x01\0\x03\0\
+\0\0\x42\0\0\0\x84\x02\0\0\x06\x94\x01\0\x04\0\0\0\x42\0\0\0\x1a\x01\0\0\x17\
+\x84\x01\0\x05\0\0\0\x42\0\0\0\x9d\x02\0\0\x0e\xa0\x01\0\x06\0\0\0\x42\0\0\0\
+\x1a\x01\0\0\x1d\x84\x01\0\x07\0\0\0\x42\0\0\0\x43\x01\0\0\x06\xa4\x01\0\x09\0\
+\0\0\x42\0\0\0\xaf\x02\0\0\x03\xa8\x01\0\x11\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\
+\xb0\x01\0\x18\0\0\0\x42\0\0\0\x5a\x03\0\0\x06\x04\x01\0\x1b\0\0\0\x42\0\0\0\0\
+\0\0\0\0\0\0\0\x1c\0\0\0\x42\0\0\0\xab\x03\0\0\x0f\x10\x01\0\x1d\0\0\0\x42\0\0\
+\0\xc0\x03\0\0\x2d\x14\x01\0\x1f\0\0\0\x42\0\0\0\xf7\x03\0\0\x0d\x0c\x01\0\x21\
+\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x22\0\0\0\x42\0\0\0\xc0\x03\0\0\x02\x14\x01\0\
+\x25\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x28\0\0\0\x42\0\0\0\0\0\0\0\0\0\
+\0\0\x29\0\0\0\x42\0\0\0\x1e\x04\0\0\x0d\x18\x01\0\x2c\0\0\0\x42\0\0\0\x1e\x04\
+\0\0\x0d\x18\x01\0\x2d\0\0\0\x42\0\0\0\x4c\x04\0\0\x1b\x1c\x01\0\x2e\0\0\0\x42\
+\0\0\0\x4c\x04\0\0\x06\x1c\x01\0\x2f\0\0\0\x42\0\0\0\x6f\x04\0\0\x0d\x24\x01\0\
+\x31\0\0\0\x42\0\0\0\x1f\x03\0\0\x02\xb0\x01\0\x40\0\0\0\x42\0\0\0\x2a\x02\0\0\
+\x01\xc0\x01\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\
+\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xea\0\0\0\0\0\0\0\x20\0\0\0\x14\0\0\0\x3e\0\0\0\
+\0\0\0\0\x28\0\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x30\0\0\0\x08\0\0\0\x3f\x01\0\0\
+\0\0\0\0\x88\0\0\0\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x98\0\0\0\x1a\0\0\0\xea\0\0\0\0\
+\0\0\0\xb0\0\0\0\x1a\0\0\0\x52\x03\0\0\0\0\0\0\xb8\0\0\0\x1a\0\0\0\x56\x03\0\0\
+\0\0\0\0\xc8\0\0\0\x1f\0\0\0\x84\x03\0\0\0\0\0\0\xe0\0\0\0\x20\0\0\0\xea\0\0\0\
+\0\0\0\0\xf8\0\0\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x20\x01\0\0\x24\0\0\0\x3e\0\0\0\
+\0\0\0\0\x58\x01\0\0\x1a\0\0\0\xea\0\0\0\0\0\0\0\x68\x01\0\0\x20\0\0\0\x46\x04\
+\0\0\0\0\0\0\x90\x01\0\0\x1a\0\0\0\x3f\x01\0\0\0\0\0\0\xa0\x01\0\0\x1a\0\0\0\
+\x87\x04\0\0\0\0\0\0\xa8\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\x1a\0\0\0\x42\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\0\0\0\0\x1c\0\0\
+\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x10\0\0\0\0\0\0\0\0\0\0\0\x1a\0\
+\0\0\x01\0\0\0\0\0\0\0\x13\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x10\0\0\0\0\0\
+\0\0\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\0\0\
+\0\0\0\0";
+ opts.insns_sz = 2216;
+ opts.insns = (void *)"\
+\xbf\x16\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\x78\xff\xff\xff\xb7\x02\0\
+\0\x88\0\0\0\xb7\x03\0\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x05\0\x14\0\0\0\0\0\x61\
+\xa1\x78\xff\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x7c\xff\
+\0\0\0\0\xd5\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x80\xff\0\0\0\0\xd5\
+\x01\x01\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa1\x84\xff\0\0\0\0\xd5\x01\x01\0\0\
+\0\0\0\x85\0\0\0\xa8\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x01\0\0\0\0\
+\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xbf\x70\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x48\x0e\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x44\x0e\0\0\x63\x01\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\
+\0\0\0\0\x38\x0e\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x05\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x12\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x30\x0e\0\0\xb7\x03\0\0\x1c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\xd4\xff\0\0\0\0\x63\x7a\x78\xff\0\0\0\0\
+\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x0e\0\0\x63\x01\0\0\0\
+\0\0\0\x61\x60\x1c\0\0\0\0\0\x15\0\x03\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x5c\x0e\0\0\x63\x01\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\
+\0\x50\x0e\0\0\xb7\x03\0\0\x48\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\
+\xc5\x07\xc3\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x63\x71\0\0\0\0\0\
+\0\x79\x63\x20\0\0\0\0\0\x15\x03\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x98\
+\x0e\0\0\xb7\x02\0\0\x62\0\0\0\x61\x60\x04\0\0\0\0\0\x45\0\x02\0\x01\0\0\0\x85\
+\0\0\0\x94\0\0\0\x05\0\x01\0\0\0\0\0\x85\0\0\0\x71\0\0\0\x18\x62\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\
+\0\0\x10\x0f\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x98\x0e\0\0\
+\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x0f\0\0\x7b\x01\0\0\0\0\0\0\xb7\x01\0\0\x02\0\
+\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x08\x0f\0\0\xb7\x03\0\0\x20\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x9f\xff\0\0\0\0\x18\x62\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\x61\x20\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\x63\
+\x01\0\0\0\0\0\0\xb7\x01\0\0\x16\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\x28\x0f\0\0\
+\xb7\x03\0\0\x04\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\x92\xff\
+\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x0f\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\
+\x78\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x38\x0f\0\0\x18\
+\x61\0\0\0\0\0\0\0\0\0\0\x70\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\
+\0\0\0\x40\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb8\x11\0\0\x7b\x01\0\0\0\0\0\0\
+\x18\x60\0\0\0\0\0\0\0\0\0\0\x48\x10\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xc8\x11\0\
+\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xe8\x10\0\0\x18\x61\0\0\0\0\
+\0\0\0\0\0\0\xe8\x11\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xe0\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\x60\x08\0\0\
+\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x11\0\0\x63\x01\0\0\0\0\0\0\x61\x60\x0c\
+\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x84\x11\0\0\x63\x01\0\0\0\0\0\0\x79\x60\
+\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x88\x11\0\0\x7b\x01\0\0\0\0\0\0\x61\
+\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xb0\x11\0\0\x63\x01\0\0\0\0\0\
+\0\x18\x61\0\0\0\0\0\0\0\0\0\0\xf8\x11\0\0\xb7\x02\0\0\x11\0\0\0\xb7\x03\0\0\
+\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\0\0\0\0\0\xc5\x07\
+\x5c\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x68\x11\0\0\x63\x70\x6c\0\0\0\0\0\
+\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\0\0\0\x18\x62\0\0\
+\0\0\0\0\0\0\0\0\x68\x11\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\xa6\0\0\0\xbf\x07\
+\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd8\x11\0\0\x61\x01\0\0\0\0\0\0\xd5\
+\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\x07\x4a\xff\0\0\
+\0\0\x63\x7a\x80\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x10\x12\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\x10\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\
+\x18\x12\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x08\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\
+\x60\0\0\0\0\0\0\0\0\0\0\x28\x14\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x50\x17\0\0\
+\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x30\x14\0\0\x18\x61\0\0\0\0\0\
+\0\0\0\0\0\x60\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\xd0\x15\
+\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x80\x17\0\0\x7b\x01\0\0\0\0\0\0\x18\x60\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x78\x17\0\0\x7b\x01\0\0\0\0\
+\0\0\x61\x60\x08\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x18\x17\0\0\x63\x01\0\0\
+\0\0\0\0\x61\x60\x0c\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x1c\x17\0\0\x63\x01\
+\0\0\0\0\0\0\x79\x60\x10\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\x7b\
+\x01\0\0\0\0\0\0\x61\xa0\x78\xff\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x48\x17\0\
+\0\x63\x01\0\0\0\0\0\0\x18\x61\0\0\0\0\0\0\0\0\0\0\x90\x17\0\0\xb7\x02\0\0\x12\
+\0\0\0\xb7\x03\0\0\x0c\0\0\0\xb7\x04\0\0\0\0\0\0\x85\0\0\0\xa7\0\0\0\xbf\x07\0\
+\0\0\0\0\0\xc5\x07\x13\xff\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\x63\
+\x70\x6c\0\0\0\0\0\x77\x07\0\0\x20\0\0\0\x63\x70\x70\0\0\0\0\0\xb7\x01\0\0\x05\
+\0\0\0\x18\x62\0\0\0\0\0\0\0\0\0\0\0\x17\0\0\xb7\x03\0\0\x8c\0\0\0\x85\0\0\0\
+\xa6\0\0\0\xbf\x07\0\0\0\0\0\0\x18\x60\0\0\0\0\0\0\0\0\0\0\x70\x17\0\0\x61\x01\
+\0\0\0\0\0\0\xd5\x01\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\xc5\
+\x07\x01\xff\0\0\0\0\x63\x7a\x84\xff\0\0\0\0\x61\xa1\x78\xff\0\0\0\0\xd5\x01\
+\x02\0\0\0\0\0\xbf\x19\0\0\0\0\0\0\x85\0\0\0\xa8\0\0\0\x61\xa0\x80\xff\0\0\0\0\
+\x63\x06\x28\0\0\0\0\0\x61\xa0\x84\xff\0\0\0\0\x63\x06\x2c\0\0\0\0\0\x18\x61\0\
+\0\0\0\0\0\0\0\0\0\0\0\0\0\x61\x10\0\0\0\0\0\0\x63\x06\x18\0\0\0\0\0\xb7\0\0\0\
+\0\0\0\0\x95\0\0\0\0\0\0\0";
+ err = bpf_load_and_run(&opts);
+ if (err < 0)
+ return err;
+ skel->rodata = skel_finalize_map_data(&skel->maps.rodata.initial_value,
+ 4096, PROT_READ, skel->maps.rodata.map_fd);
+ if (!skel->rodata)
+ return -ENOMEM;
+ return 0;
+}
+
+static inline struct iterators_bpf *
+iterators_bpf__open_and_load(void)
+{
+ struct iterators_bpf *skel;
+
+ skel = iterators_bpf__open();
+ if (!skel)
+ return NULL;
+ if (iterators_bpf__load(skel)) {
+ iterators_bpf__destroy(skel);
+ return NULL;
+ }
+ return skel;
+}
+
+#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/preload/iterators/iterators.skel.h b/kernel/bpf/preload/iterators/iterators.skel.h
deleted file mode 100644
index cf9a6a94b3a4..000000000000
--- a/kernel/bpf/preload/iterators/iterators.skel.h
+++ /dev/null
@@ -1,412 +0,0 @@
-/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
-
-/* THIS FILE IS AUTOGENERATED! */
-#ifndef __ITERATORS_BPF_SKEL_H__
-#define __ITERATORS_BPF_SKEL_H__
-
-#include <stdlib.h>
-#include <bpf/libbpf.h>
-
-struct iterators_bpf {
- struct bpf_object_skeleton *skeleton;
- struct bpf_object *obj;
- struct {
- struct bpf_map *rodata;
- } maps;
- struct {
- struct bpf_program *dump_bpf_map;
- struct bpf_program *dump_bpf_prog;
- } progs;
- struct {
- struct bpf_link *dump_bpf_map;
- struct bpf_link *dump_bpf_prog;
- } links;
- struct iterators_bpf__rodata {
- char dump_bpf_map____fmt[35];
- char dump_bpf_map____fmt_1[14];
- char dump_bpf_prog____fmt[32];
- char dump_bpf_prog____fmt_2[17];
- } *rodata;
-};
-
-static void
-iterators_bpf__destroy(struct iterators_bpf *obj)
-{
- if (!obj)
- return;
- if (obj->skeleton)
- bpf_object__destroy_skeleton(obj->skeleton);
- free(obj);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj);
-
-static inline struct iterators_bpf *
-iterators_bpf__open_opts(const struct bpf_object_open_opts *opts)
-{
- struct iterators_bpf *obj;
-
- obj = (struct iterators_bpf *)calloc(1, sizeof(*obj));
- if (!obj)
- return NULL;
- if (iterators_bpf__create_skeleton(obj))
- goto err;
- if (bpf_object__open_skeleton(obj->skeleton, opts))
- goto err;
-
- return obj;
-err:
- iterators_bpf__destroy(obj);
- return NULL;
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open(void)
-{
- return iterators_bpf__open_opts(NULL);
-}
-
-static inline int
-iterators_bpf__load(struct iterators_bpf *obj)
-{
- return bpf_object__load_skeleton(obj->skeleton);
-}
-
-static inline struct iterators_bpf *
-iterators_bpf__open_and_load(void)
-{
- struct iterators_bpf *obj;
-
- obj = iterators_bpf__open();
- if (!obj)
- return NULL;
- if (iterators_bpf__load(obj)) {
- iterators_bpf__destroy(obj);
- return NULL;
- }
- return obj;
-}
-
-static inline int
-iterators_bpf__attach(struct iterators_bpf *obj)
-{
- return bpf_object__attach_skeleton(obj->skeleton);
-}
-
-static inline void
-iterators_bpf__detach(struct iterators_bpf *obj)
-{
- return bpf_object__detach_skeleton(obj->skeleton);
-}
-
-static inline int
-iterators_bpf__create_skeleton(struct iterators_bpf *obj)
-{
- struct bpf_object_skeleton *s;
-
- s = (struct bpf_object_skeleton *)calloc(1, sizeof(*s));
- if (!s)
- return -1;
- obj->skeleton = s;
-
- s->sz = sizeof(*s);
- s->name = "iterators_bpf";
- s->obj = &obj->obj;
-
- /* maps */
- s->map_cnt = 1;
- s->map_skel_sz = sizeof(*s->maps);
- s->maps = (struct bpf_map_skeleton *)calloc(s->map_cnt, s->map_skel_sz);
- if (!s->maps)
- goto err;
-
- s->maps[0].name = "iterator.rodata";
- s->maps[0].map = &obj->maps.rodata;
- s->maps[0].mmaped = (void **)&obj->rodata;
-
- /* programs */
- s->prog_cnt = 2;
- s->prog_skel_sz = sizeof(*s->progs);
- s->progs = (struct bpf_prog_skeleton *)calloc(s->prog_cnt, s->prog_skel_sz);
- if (!s->progs)
- goto err;
-
- s->progs[0].name = "dump_bpf_map";
- s->progs[0].prog = &obj->progs.dump_bpf_map;
- s->progs[0].link = &obj->links.dump_bpf_map;
-
- s->progs[1].name = "dump_bpf_prog";
- s->progs[1].prog = &obj->progs.dump_bpf_prog;
- s->progs[1].link = &obj->links.dump_bpf_prog;
-
- s->data_sz = 7176;
- s->data = (void *)"\
-\x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x48\x18\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0f\0\
-\x0e\0\x79\x12\0\0\0\0\0\0\x79\x26\0\0\0\0\0\0\x79\x17\x08\0\0\0\0\0\x15\x07\
-\x1a\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\
-\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\xb7\x03\0\0\x23\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x61\x71\0\
-\0\0\0\0\0\x7b\x1a\xe8\xff\0\0\0\0\xb7\x01\0\0\x04\0\0\0\xbf\x72\0\0\0\0\0\0\
-\x0f\x12\0\0\0\0\0\0\x7b\x2a\xf0\xff\0\0\0\0\x61\x71\x14\0\0\0\0\0\x7b\x1a\xf8\
-\xff\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xe8\xff\xff\xff\xbf\x61\0\0\0\0\0\
-\0\x18\x02\0\0\x23\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x0e\0\0\0\xb7\x05\0\0\x18\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x79\x12\0\0\0\0\
-\0\0\x79\x26\0\0\0\0\0\0\x79\x11\x08\0\0\0\0\0\x15\x01\x3b\0\0\0\0\0\x79\x17\0\
-\0\0\0\0\0\x79\x21\x10\0\0\0\0\0\x55\x01\x08\0\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\
-\x04\0\0\xd0\xff\xff\xff\xbf\x61\0\0\0\0\0\0\x18\x02\0\0\x31\0\0\0\0\0\0\0\0\0\
-\0\0\xb7\x03\0\0\x20\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x7e\0\0\0\x7b\x6a\xc8\
-\xff\0\0\0\0\x61\x71\0\0\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xb7\x03\0\0\x04\0\0\0\
-\xbf\x79\0\0\0\0\0\0\x0f\x39\0\0\0\0\0\0\x79\x71\x28\0\0\0\0\0\x79\x78\x30\0\0\
-\0\0\0\x15\x08\x18\0\0\0\0\0\xb7\x02\0\0\0\0\0\0\x0f\x21\0\0\0\0\0\0\x61\x11\
-\x04\0\0\0\0\0\x79\x83\x08\0\0\0\0\0\x67\x01\0\0\x03\0\0\0\x0f\x13\0\0\0\0\0\0\
-\x79\x86\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf8\xff\xff\xff\xb7\x02\0\
-\0\x08\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x01\0\0\0\0\0\0\x79\xa3\xf8\xff\0\0\0\0\
-\x0f\x13\0\0\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\x01\0\0\xf4\xff\xff\xff\xb7\x02\0\
-\0\x04\0\0\0\x85\0\0\0\x71\0\0\0\xb7\x03\0\0\x04\0\0\0\x61\xa1\xf4\xff\0\0\0\0\
-\x61\x82\x10\0\0\0\0\0\x3d\x21\x02\0\0\0\0\0\x0f\x16\0\0\0\0\0\0\xbf\x69\0\0\0\
-\0\0\0\x7b\x9a\xd8\xff\0\0\0\0\x79\x71\x18\0\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\
-\x79\x71\x20\0\0\0\0\0\x79\x11\0\0\0\0\0\0\x0f\x31\0\0\0\0\0\0\x7b\x1a\xe8\xff\
-\0\0\0\0\xbf\xa4\0\0\0\0\0\0\x07\x04\0\0\xd0\xff\xff\xff\x79\xa1\xc8\xff\0\0\0\
-\0\x18\x02\0\0\x51\0\0\0\0\0\0\0\0\0\0\0\xb7\x03\0\0\x11\0\0\0\xb7\x05\0\0\x20\
-\0\0\0\x85\0\0\0\x7e\0\0\0\xb7\0\0\0\0\0\0\0\x95\0\0\0\0\0\0\0\x20\x20\x69\x64\
-\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x6d\
-\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x0a\0\x25\x34\x75\x20\x25\x2d\x31\x36\
-\x73\x25\x36\x64\x0a\0\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\x74\x61\x63\x68\x65\x64\x0a\0\x25\x34\
-\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\x25\x73\x0a\0\x47\x50\x4c\0\x9f\
-\xeb\x01\0\x18\0\0\0\0\0\0\0\x1c\x04\0\0\x1c\x04\0\0\x09\x05\0\0\0\0\0\0\0\0\0\
-\x02\x02\0\0\0\x01\0\0\0\x02\0\0\x04\x10\0\0\0\x13\0\0\0\x03\0\0\0\0\0\0\0\x18\
-\0\0\0\x04\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\x08\0\0\0\0\0\0\0\0\0\0\x02\x0d\0\
-\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x01\0\0\0\x20\0\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\x01\x24\0\0\0\x01\0\0\x0c\x05\0\0\0\xaf\0\0\0\x03\0\0\x04\x18\0\
-\0\0\xbd\0\0\0\x09\0\0\0\0\0\0\0\xc1\0\0\0\x0b\0\0\0\x40\0\0\0\xcc\0\0\0\x0b\0\
-\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x0a\0\0\0\xd4\0\0\0\0\0\0\x07\0\0\0\0\xdd\0\0\
-\0\0\0\0\x08\x0c\0\0\0\xe3\0\0\0\0\0\0\x01\x08\0\0\0\x40\0\0\0\xa4\x01\0\0\x03\
-\0\0\x04\x18\0\0\0\xac\x01\0\0\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\
-\0\xb4\x01\0\0\x0e\0\0\0\xa0\0\0\0\xc0\x01\0\0\0\0\0\x08\x0f\0\0\0\xc6\x01\0\0\
-\0\0\0\x01\x04\0\0\0\x20\0\0\0\xd3\x01\0\0\0\0\0\x01\x01\0\0\0\x08\0\0\x01\0\0\
-\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x10\0\0\0\xd8\x01\0\0\0\0\0\x01\x04\
-\0\0\0\x20\0\0\0\0\0\0\0\0\0\0\x02\x14\0\0\0\x3c\x02\0\0\x02\0\0\x04\x10\0\0\0\
-\x13\0\0\0\x03\0\0\0\0\0\0\0\x4f\x02\0\0\x15\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\x02\
-\x18\0\0\0\0\0\0\0\x01\0\0\x0d\x06\0\0\0\x1c\0\0\0\x13\0\0\0\x54\x02\0\0\x01\0\
-\0\x0c\x16\0\0\0\xa0\x02\0\0\x01\0\0\x04\x08\0\0\0\xa9\x02\0\0\x19\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x02\x1a\0\0\0\xfa\x02\0\0\x06\0\0\x04\x38\0\0\0\xac\x01\0\0\
-\x0e\0\0\0\0\0\0\0\xaf\x01\0\0\x11\0\0\0\x20\0\0\0\x07\x03\0\0\x1b\0\0\0\xc0\0\
-\0\0\x18\x03\0\0\x15\0\0\0\0\x01\0\0\x21\x03\0\0\x1d\0\0\0\x40\x01\0\0\x2b\x03\
-\0\0\x1e\0\0\0\x80\x01\0\0\0\0\0\0\0\0\0\x02\x1c\0\0\0\0\0\0\0\0\0\0\x0a\x10\0\
-\0\0\0\0\0\0\0\0\0\x02\x1f\0\0\0\0\0\0\0\0\0\0\x02\x20\0\0\0\x75\x03\0\0\x02\0\
-\0\x04\x08\0\0\0\x83\x03\0\0\x0e\0\0\0\0\0\0\0\x8c\x03\0\0\x0e\0\0\0\x20\0\0\0\
-\x2b\x03\0\0\x03\0\0\x04\x18\0\0\0\x96\x03\0\0\x1b\0\0\0\0\0\0\0\x9e\x03\0\0\
-\x21\0\0\0\x40\0\0\0\xa4\x03\0\0\x23\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\x02\x22\0\0\
-\0\0\0\0\0\0\0\0\x02\x24\0\0\0\xa8\x03\0\0\x01\0\0\x04\x04\0\0\0\xb3\x03\0\0\
-\x0e\0\0\0\0\0\0\0\x1c\x04\0\0\x01\0\0\x04\x04\0\0\0\x25\x04\0\0\x0e\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x23\0\0\0\x9b\x04\0\0\0\0\0\
-\x0e\x25\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\x12\0\0\0\x0e\0\0\0\
-\xaf\x04\0\0\0\0\0\x0e\x27\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\0\0\0\x1c\0\0\0\
-\x12\0\0\0\x20\0\0\0\xc5\x04\0\0\0\0\0\x0e\x29\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\
-\0\0\0\0\x1c\0\0\0\x12\0\0\0\x11\0\0\0\xda\x04\0\0\0\0\0\x0e\x2b\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x03\0\0\0\0\x10\0\0\0\x12\0\0\0\x04\0\0\0\xf1\x04\0\0\0\0\0\x0e\
-\x2d\0\0\0\x01\0\0\0\xf9\x04\0\0\x04\0\0\x0f\0\0\0\0\x26\0\0\0\0\0\0\0\x23\0\0\
-\0\x28\0\0\0\x23\0\0\0\x0e\0\0\0\x2a\0\0\0\x31\0\0\0\x20\0\0\0\x2c\0\0\0\x51\0\
-\0\0\x11\0\0\0\x01\x05\0\0\x01\0\0\x0f\0\0\0\0\x2e\0\0\0\0\0\0\0\x04\0\0\0\0\
-\x62\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x6d\x65\
-\x74\x61\0\x6d\x61\x70\0\x63\x74\x78\0\x69\x6e\x74\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x6d\x61\x70\0\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\
-\x30\x3a\x30\0\x2f\x68\x6f\x6d\x65\x2f\x61\x6c\x72\x75\x61\x2f\x62\x75\x69\x6c\
-\x64\x2f\x6c\x69\x6e\x75\x78\x2f\x6b\x65\x72\x6e\x65\x6c\x2f\x62\x70\x66\x2f\
-\x70\x72\x65\x6c\x6f\x61\x64\x2f\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2f\x69\
-\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x73\x65\x71\x5f\x66\x69\x6c\x65\x20\x2a\x73\x65\x71\x20\x3d\x20\
-\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x3b\0\x62\x70\x66\x5f\
-\x69\x74\x65\x72\x5f\x6d\x65\x74\x61\0\x73\x65\x71\0\x73\x65\x73\x73\x69\x6f\
-\x6e\x5f\x69\x64\0\x73\x65\x71\x5f\x6e\x75\x6d\0\x73\x65\x71\x5f\x66\x69\x6c\
-\x65\0\x5f\x5f\x75\x36\x34\0\x6c\x6f\x6e\x67\x20\x6c\x6f\x6e\x67\x20\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x30\x3a\x31\0\x09\x73\x74\x72\x75\
-\x63\x74\x20\x62\x70\x66\x5f\x6d\x61\x70\x20\x2a\x6d\x61\x70\x20\x3d\x20\x63\
-\x74\x78\x2d\x3e\x6d\x61\x70\x3b\0\x09\x69\x66\x20\x28\x21\x6d\x61\x70\x29\0\
-\x30\x3a\x32\0\x09\x5f\x5f\x75\x36\x34\x20\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\
-\x20\x63\x74\x78\x2d\x3e\x6d\x65\x74\x61\x2d\x3e\x73\x65\x71\x5f\x6e\x75\x6d\
-\x3b\0\x09\x69\x66\x20\x28\x73\x65\x71\x5f\x6e\x75\x6d\x20\x3d\x3d\x20\x30\x29\
-\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\
-\x71\x2c\x20\x22\x20\x20\x69\x64\x20\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\
-\x20\x20\x20\x20\x20\x20\x20\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x5c\
-\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x6d\x61\x70\0\x69\x64\0\x6e\x61\x6d\x65\0\
-\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\0\x5f\x5f\x75\x33\x32\0\x75\x6e\
-\x73\x69\x67\x6e\x65\x64\x20\x69\x6e\x74\0\x63\x68\x61\x72\0\x5f\x5f\x41\x52\
-\x52\x41\x59\x5f\x53\x49\x5a\x45\x5f\x54\x59\x50\x45\x5f\x5f\0\x09\x42\x50\x46\
-\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x25\
-\x34\x75\x20\x25\x2d\x31\x36\x73\x25\x36\x64\x5c\x6e\x22\x2c\x20\x6d\x61\x70\
-\x2d\x3e\x69\x64\x2c\x20\x6d\x61\x70\x2d\x3e\x6e\x61\x6d\x65\x2c\x20\x6d\x61\
-\x70\x2d\x3e\x6d\x61\x78\x5f\x65\x6e\x74\x72\x69\x65\x73\x29\x3b\0\x7d\0\x62\
-\x70\x66\x5f\x69\x74\x65\x72\x5f\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x70\x72\
-\x6f\x67\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x09\x73\x74\x72\x75\x63\x74\x20\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x20\x2a\x70\x72\x6f\x67\x20\x3d\x20\x63\x74\x78\
-\x2d\x3e\x70\x72\x6f\x67\x3b\0\x09\x69\x66\x20\x28\x21\x70\x72\x6f\x67\x29\0\
-\x62\x70\x66\x5f\x70\x72\x6f\x67\0\x61\x75\x78\0\x09\x61\x75\x78\x20\x3d\x20\
-\x70\x72\x6f\x67\x2d\x3e\x61\x75\x78\x3b\0\x09\x09\x42\x50\x46\x5f\x53\x45\x51\
-\x5f\x50\x52\x49\x4e\x54\x46\x28\x73\x65\x71\x2c\x20\x22\x20\x20\x69\x64\x20\
-\x6e\x61\x6d\x65\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x61\x74\
-\x74\x61\x63\x68\x65\x64\x5c\x6e\x22\x29\x3b\0\x62\x70\x66\x5f\x70\x72\x6f\x67\
-\x5f\x61\x75\x78\0\x61\x74\x74\x61\x63\x68\x5f\x66\x75\x6e\x63\x5f\x6e\x61\x6d\
-\x65\0\x64\x73\x74\x5f\x70\x72\x6f\x67\0\x66\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\
-\x62\x74\x66\0\x09\x42\x50\x46\x5f\x53\x45\x51\x5f\x50\x52\x49\x4e\x54\x46\x28\
-\x73\x65\x71\x2c\x20\x22\x25\x34\x75\x20\x25\x2d\x31\x36\x73\x20\x25\x73\x20\
-\x25\x73\x5c\x6e\x22\x2c\x20\x61\x75\x78\x2d\x3e\x69\x64\x2c\0\x30\x3a\x34\0\
-\x30\x3a\x35\0\x09\x69\x66\x20\x28\x21\x62\x74\x66\x29\0\x62\x70\x66\x5f\x66\
-\x75\x6e\x63\x5f\x69\x6e\x66\x6f\0\x69\x6e\x73\x6e\x5f\x6f\x66\x66\0\x74\x79\
-\x70\x65\x5f\x69\x64\0\x30\0\x73\x74\x72\x69\x6e\x67\x73\0\x74\x79\x70\x65\x73\
-\0\x68\x64\x72\0\x62\x74\x66\x5f\x68\x65\x61\x64\x65\x72\0\x73\x74\x72\x5f\x6c\
-\x65\x6e\0\x09\x74\x79\x70\x65\x73\x20\x3d\x20\x62\x74\x66\x2d\x3e\x74\x79\x70\
-\x65\x73\x3b\0\x09\x62\x70\x66\x5f\x70\x72\x6f\x62\x65\x5f\x72\x65\x61\x64\x5f\
-\x6b\x65\x72\x6e\x65\x6c\x28\x26\x74\x2c\x20\x73\x69\x7a\x65\x6f\x66\x28\x74\
-\x29\x2c\x20\x74\x79\x70\x65\x73\x20\x2b\x20\x62\x74\x66\x5f\x69\x64\x29\x3b\0\
-\x09\x73\x74\x72\x20\x3d\x20\x62\x74\x66\x2d\x3e\x73\x74\x72\x69\x6e\x67\x73\
-\x3b\0\x62\x74\x66\x5f\x74\x79\x70\x65\0\x6e\x61\x6d\x65\x5f\x6f\x66\x66\0\x09\
-\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\x3d\x20\x42\x50\x46\x5f\x43\x4f\x52\x45\
-\x5f\x52\x45\x41\x44\x28\x74\x2c\x20\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x29\x3b\0\
-\x30\x3a\x32\x3a\x30\0\x09\x69\x66\x20\x28\x6e\x61\x6d\x65\x5f\x6f\x66\x66\x20\
-\x3e\x3d\x20\x62\x74\x66\x2d\x3e\x68\x64\x72\x2e\x73\x74\x72\x5f\x6c\x65\x6e\
-\x29\0\x09\x72\x65\x74\x75\x72\x6e\x20\x73\x74\x72\x20\x2b\x20\x6e\x61\x6d\x65\
-\x5f\x6f\x66\x66\x3b\0\x30\x3a\x33\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\
-\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\x64\x75\x6d\x70\x5f\x62\x70\x66\
-\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\x75\x6d\x70\x5f\x62\x70\
-\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x4c\x49\x43\x45\
-\x4e\x53\x45\0\x2e\x72\x6f\x64\x61\x74\x61\0\x6c\x69\x63\x65\x6e\x73\x65\0\x9f\
-\xeb\x01\0\x20\0\0\0\0\0\0\0\x24\0\0\0\x24\0\0\0\x44\x02\0\0\x68\x02\0\0\xa4\
-\x01\0\0\x08\0\0\0\x31\0\0\0\x01\0\0\0\0\0\0\0\x07\0\0\0\x62\x02\0\0\x01\0\0\0\
-\0\0\0\0\x17\0\0\0\x10\0\0\0\x31\0\0\0\x09\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\
-\x1e\x40\x01\0\x08\0\0\0\x42\0\0\0\x87\0\0\0\x24\x40\x01\0\x10\0\0\0\x42\0\0\0\
-\xfe\0\0\0\x1d\x48\x01\0\x18\0\0\0\x42\0\0\0\x1f\x01\0\0\x06\x50\x01\0\x20\0\0\
-\0\x42\0\0\0\x2e\x01\0\0\x1d\x44\x01\0\x28\0\0\0\x42\0\0\0\x53\x01\0\0\x06\x5c\
-\x01\0\x38\0\0\0\x42\0\0\0\x66\x01\0\0\x03\x60\x01\0\x70\0\0\0\x42\0\0\0\xec\
-\x01\0\0\x02\x68\x01\0\xf0\0\0\0\x42\0\0\0\x3a\x02\0\0\x01\x70\x01\0\x62\x02\0\
-\0\x1a\0\0\0\0\0\0\0\x42\0\0\0\x87\0\0\0\x1e\x84\x01\0\x08\0\0\0\x42\0\0\0\x87\
-\0\0\0\x24\x84\x01\0\x10\0\0\0\x42\0\0\0\x70\x02\0\0\x1f\x8c\x01\0\x18\0\0\0\
-\x42\0\0\0\x94\x02\0\0\x06\x98\x01\0\x20\0\0\0\x42\0\0\0\xad\x02\0\0\x0e\xa4\
-\x01\0\x28\0\0\0\x42\0\0\0\x2e\x01\0\0\x1d\x88\x01\0\x30\0\0\0\x42\0\0\0\x53\
-\x01\0\0\x06\xa8\x01\0\x40\0\0\0\x42\0\0\0\xbf\x02\0\0\x03\xac\x01\0\x80\0\0\0\
-\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xb8\0\0\0\x42\0\0\0\x6a\x03\0\0\x06\x08\
-\x01\0\xd0\0\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\x42\0\0\0\xbb\x03\0\0\x0f\
-\x14\x01\0\xe0\0\0\0\x42\0\0\0\xd0\x03\0\0\x2d\x18\x01\0\xf0\0\0\0\x42\0\0\0\
-\x07\x04\0\0\x0d\x10\x01\0\0\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x08\x01\0\0\x42\
-\0\0\0\xd0\x03\0\0\x02\x18\x01\0\x20\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\
-\0\x38\x01\0\0\x42\0\0\0\0\0\0\0\0\0\0\0\x40\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\
-\x1c\x01\0\x58\x01\0\0\x42\0\0\0\x2e\x04\0\0\x0d\x1c\x01\0\x60\x01\0\0\x42\0\0\
-\0\x5c\x04\0\0\x1b\x20\x01\0\x68\x01\0\0\x42\0\0\0\x5c\x04\0\0\x06\x20\x01\0\
-\x70\x01\0\0\x42\0\0\0\x7f\x04\0\0\x0d\x28\x01\0\x78\x01\0\0\x42\0\0\0\0\0\0\0\
-\0\0\0\0\x80\x01\0\0\x42\0\0\0\x2f\x03\0\0\x02\xb4\x01\0\xf8\x01\0\0\x42\0\0\0\
-\x3a\x02\0\0\x01\xc4\x01\0\x10\0\0\0\x31\0\0\0\x07\0\0\0\0\0\0\0\x02\0\0\0\x3e\
-\0\0\0\0\0\0\0\x08\0\0\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x02\0\0\0\xfa\0\
-\0\0\0\0\0\0\x20\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x70\0\0\0\x0d\0\0\0\x3e\0\
-\0\0\0\0\0\0\x80\0\0\0\x0d\0\0\0\xfa\0\0\0\0\0\0\0\xa0\0\0\0\x0d\0\0\0\x2a\x01\
-\0\0\0\0\0\0\x62\x02\0\0\x12\0\0\0\0\0\0\0\x14\0\0\0\x3e\0\0\0\0\0\0\0\x08\0\0\
-\0\x08\0\0\0\x3e\0\0\0\0\0\0\0\x10\0\0\0\x14\0\0\0\xfa\0\0\0\0\0\0\0\x20\0\0\0\
-\x18\0\0\0\x3e\0\0\0\0\0\0\0\x28\0\0\0\x08\0\0\0\x2a\x01\0\0\0\0\0\0\x80\0\0\0\
-\x1a\0\0\0\x3e\0\0\0\0\0\0\0\x90\0\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\xa8\0\0\0\
-\x1a\0\0\0\x62\x03\0\0\0\0\0\0\xb0\0\0\0\x1a\0\0\0\x66\x03\0\0\0\0\0\0\xc0\0\0\
-\0\x1f\0\0\0\x94\x03\0\0\0\0\0\0\xd8\0\0\0\x20\0\0\0\xfa\0\0\0\0\0\0\0\xf0\0\0\
-\0\x20\0\0\0\x3e\0\0\0\0\0\0\0\x18\x01\0\0\x24\0\0\0\x3e\0\0\0\0\0\0\0\x50\x01\
-\0\0\x1a\0\0\0\xfa\0\0\0\0\0\0\0\x60\x01\0\0\x20\0\0\0\x56\x04\0\0\0\0\0\0\x88\
-\x01\0\0\x1a\0\0\0\x2a\x01\0\0\0\0\0\0\x98\x01\0\0\x1a\0\0\0\x97\x04\0\0\0\0\0\
-\0\xa0\x01\0\0\x18\0\0\0\x3e\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\x91\0\0\0\x04\0\xf1\xff\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xe6\0\0\
-\0\0\0\x02\0\x70\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd8\0\0\0\0\0\x02\0\xf0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\xdf\0\0\0\0\0\x03\0\x78\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\xd1\0\0\0\0\0\x03\0\x80\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xca\0\0\0\0\0\x03\0\
-\xf8\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x14\0\0\0\x01\0\x04\0\0\0\0\0\0\0\0\0\x23\
-\0\0\0\0\0\0\0\x04\x01\0\0\x01\0\x04\0\x23\0\0\0\0\0\0\0\x0e\0\0\0\0\0\0\0\x28\
-\0\0\0\x01\0\x04\0\x31\0\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\xed\0\0\0\x01\0\x04\0\
-\x51\0\0\0\0\0\0\0\x11\0\0\0\0\0\0\0\0\0\0\0\x03\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x03\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x03\0\
-\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xc2\0\0\0\x11\0\x05\0\0\0\0\0\0\0\0\0\
-\x04\0\0\0\0\0\0\0\x3d\0\0\0\x12\0\x02\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\x5b\
-\0\0\0\x12\0\x03\0\0\0\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\x48\0\0\0\0\0\0\0\x01\0\
-\0\0\x0d\0\0\0\xc8\0\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\x50\0\0\0\0\0\0\0\x01\0\0\
-\0\x0d\0\0\0\xd0\x01\0\0\0\0\0\0\x01\0\0\0\x0d\0\0\0\xf0\x03\0\0\0\0\0\0\x0a\0\
-\0\0\x0d\0\0\0\xfc\x03\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x08\x04\0\0\0\0\0\0\x0a\
-\0\0\0\x0d\0\0\0\x14\x04\0\0\0\0\0\0\x0a\0\0\0\x0d\0\0\0\x2c\x04\0\0\0\0\0\0\0\
-\0\0\0\x0e\0\0\0\x2c\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x3c\0\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x50\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x60\0\0\0\0\0\0\0\0\0\0\0\x0b\0\
-\0\0\x70\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\
-\x90\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xb0\0\
-\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xd0\0\0\0\0\
-\0\0\0\0\0\0\0\x0b\0\0\0\xe8\0\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\0\0\0\0\0\0\0\
-\0\0\0\0\x0c\0\0\0\x08\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x01\0\0\0\0\0\0\0\
-\0\0\0\x0c\0\0\0\x28\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x01\0\0\0\0\0\0\0\0\
-\0\0\x0c\0\0\0\x48\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x01\0\0\0\0\0\0\0\0\0\
-\0\x0c\0\0\0\x68\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x88\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x98\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xa8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xb8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xc8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xd8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xe8\x01\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xf8\x01\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x18\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x28\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x38\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x48\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x58\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x68\x02\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x78\x02\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x94\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xa4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xb4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xc4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xd4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\xe4\x02\0\0\0\0\0\0\0\0\0\0\
-\x0b\0\0\0\xf4\x02\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x0c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x2c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x3c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x5c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x6c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x7c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x8c\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x9c\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xac\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xbc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xcc\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xdc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\xec\x03\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\xfc\x03\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x0c\x04\0\0\0\0\0\0\0\0\0\0\
-\x0c\0\0\0\x1c\x04\0\0\0\0\0\0\0\0\0\0\x0c\0\0\0\x4d\x4e\x40\x41\x42\x43\x4c\0\
-\x2e\x74\x65\x78\x74\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\x2e\x65\x78\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\0\x64\
-\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\0\
-\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\0\x2e\x72\x65\x6c\x69\x74\x65\
-\x72\x2f\x62\x70\x66\x5f\x6d\x61\x70\0\x64\x75\x6d\x70\x5f\x62\x70\x66\x5f\x70\
-\x72\x6f\x67\0\x2e\x72\x65\x6c\x69\x74\x65\x72\x2f\x62\x70\x66\x5f\x70\x72\x6f\
-\x67\0\x2e\x6c\x6c\x76\x6d\x5f\x61\x64\x64\x72\x73\x69\x67\0\x6c\x69\x63\x65\
-\x6e\x73\x65\0\x69\x74\x65\x72\x61\x74\x6f\x72\x73\x2e\x62\x70\x66\x2e\x63\0\
-\x2e\x73\x74\x72\x74\x61\x62\0\x2e\x73\x79\x6d\x74\x61\x62\0\x2e\x72\x6f\x64\
-\x61\x74\x61\0\x2e\x72\x65\x6c\x2e\x42\x54\x46\0\x4c\x49\x43\x45\x4e\x53\x45\0\
-\x4c\x42\x42\x31\x5f\x37\0\x4c\x42\x42\x31\x5f\x36\0\x4c\x42\x42\x30\x5f\x34\0\
-\x4c\x42\x42\x31\x5f\x33\0\x4c\x42\x42\x30\x5f\x33\0\x64\x75\x6d\x70\x5f\x62\
-\x70\x66\x5f\x70\x72\x6f\x67\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x32\0\x64\x75\x6d\
-\x70\x5f\x62\x70\x66\x5f\x6d\x61\x70\x2e\x5f\x5f\x5f\x66\x6d\x74\x2e\x31\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\x01\0\0\
-\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x4e\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\x6d\0\0\0\x01\0\0\0\x06\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x40\x01\0\0\0\0\0\0\x08\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x08\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\xb1\0\0\0\x01\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x48\x03\0\
-\0\0\0\0\0\x62\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\x89\0\0\0\x01\0\0\0\x03\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xaa\x03\0\0\0\0\0\0\x04\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xbd\0\0\0\x01\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xae\x03\0\0\0\0\0\0\x3d\x09\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x0b\0\0\0\x01\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\0\xeb\x0c\0\0\0\0\0\0\x2c\x04\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa9\0\0\0\x02\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\x18\x11\0\0\0\0\0\0\x98\x01\0\0\0\0\0\0\x0e\0\0\0\x0e\0\0\0\x08\0\0\
-\0\0\0\0\0\x18\0\0\0\0\0\0\0\x4a\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\0\xb0\x12\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x02\0\0\0\x08\0\0\0\0\0\0\0\
-\x10\0\0\0\0\0\0\0\x69\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xd0\x12\
-\0\0\0\0\0\0\x20\0\0\0\0\0\0\0\x08\0\0\0\x03\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\
-\0\0\0\0\xb9\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xf0\x12\0\0\0\0\0\
-\0\x50\0\0\0\0\0\0\0\x08\0\0\0\x06\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\
-\x07\0\0\0\x09\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x40\x13\0\0\0\0\0\0\xe0\
-\x03\0\0\0\0\0\0\x08\0\0\0\x07\0\0\0\x08\0\0\0\0\0\0\0\x10\0\0\0\0\0\0\0\x7b\0\
-\0\0\x03\x4c\xff\x6f\0\0\0\x80\0\0\0\0\0\0\0\0\0\0\0\0\x20\x17\0\0\0\0\0\0\x07\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\xa1\0\0\0\x03\
-\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\x27\x17\0\0\0\0\0\0\x1a\x01\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
-
- return 0;
-err:
- bpf_object__destroy_skeleton(s);
- return -1;
-}
-
-#endif /* __ITERATORS_BPF_SKEL_H__ */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 556a769b5b80..8251243022a2 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -143,7 +143,7 @@ static void reuseport_array_free(struct bpf_map *map)
/*
* Once reaching here, all sk->sk_user_data is not
- * referenceing this "array". "array" can be freed now.
+ * referencing this "array". "array" can be freed now.
*/
bpf_map_area_free(array);
}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 638d7fd7b375..710ba9de12ce 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -104,7 +104,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
}
rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
- VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
+ VM_MAP | VM_USERMAP, PAGE_KERNEL);
if (rb) {
kmemleak_not_leak(pages);
rb->pages = pages;
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 49e567209c6b..34725bfa1e97 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -132,7 +132,8 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
int i;
struct mmap_unlock_irq_work *work = NULL;
bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma, *prev_vma = NULL;
+ const char *prev_build_id;
/* If the irq_work is in use, fall back to report ips. Same
* fallback is used for kernel stack (!user) on a stackmap with
@@ -150,6 +151,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
for (i = 0; i < trace_nr; i++) {
+ if (range_in_vma(prev_vma, ips[i], ips[i])) {
+ vma = prev_vma;
+ memcpy(id_offs[i].build_id, prev_build_id,
+ BUILD_ID_SIZE_MAX);
+ goto build_id_valid;
+ }
vma = find_vma(current->mm, ips[i]);
if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) {
/* per entry fall back to ips */
@@ -158,15 +165,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX);
continue;
}
+build_id_valid:
id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+ prev_vma = vma;
+ prev_build_id = id_offs[i].build_id;
}
bpf_mmap_unlock_mm(work, current->mm);
}
static struct perf_callchain_entry *
-get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
+get_callchain_entry_for_task(struct task_struct *task, u32 max_depth)
{
#ifdef CONFIG_STACKTRACE
struct perf_callchain_entry *entry;
@@ -177,9 +187,8 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
if (!entry)
return NULL;
- entry->nr = init_nr +
- stack_trace_save_tsk(task, (unsigned long *)(entry->ip + init_nr),
- sysctl_perf_event_max_stack - init_nr, 0);
+ entry->nr = stack_trace_save_tsk(task, (unsigned long *)entry->ip,
+ max_depth, 0);
/* stack_trace_save_tsk() works on unsigned long array, while
* perf_callchain_entry uses u64 array. For 32-bit systems, it is
@@ -191,7 +200,7 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
int i;
/* copy data from the end to avoid using extra buffer */
- for (i = entry->nr - 1; i >= (int)init_nr; i--)
+ for (i = entry->nr - 1; i >= 0; i--)
to[i] = (u64)(from[i]);
}
@@ -208,27 +217,19 @@ static long __bpf_get_stackid(struct bpf_map *map,
{
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
- u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
u64 *ips;
bool hash_matches;
- /* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
- */
- trace_nr = trace->nr - init_nr;
-
- if (trace_nr <= skip)
+ if (trace->nr <= skip)
/* skipping more than usable stack trace */
return -EFAULT;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_len = trace_nr * sizeof(u64);
- ips = trace->ip + skip + init_nr;
+ ips = trace->ip + skip;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
id = hash & (smap->n_buckets - 1);
bucket = READ_ONCE(smap->buckets[id]);
@@ -285,8 +286,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
u64, flags)
{
u32 max_depth = map->value_size / stack_map_data_size(map);
- /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
- u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
struct perf_callchain_entry *trace;
bool kernel = !user;
@@ -295,8 +295,12 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
return -EINVAL;
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack, false, false);
+ max_depth += skip;
+ if (max_depth > sysctl_perf_event_max_stack)
+ max_depth = sysctl_perf_event_max_stack;
+
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+ false, false);
if (unlikely(!trace))
/* couldn't fetch the stack trace */
@@ -387,7 +391,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
struct perf_callchain_entry *trace_in,
void *buf, u32 size, u64 flags)
{
- u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ u32 trace_nr, copy_len, elem_size, num_elem, max_depth;
bool user_build_id = flags & BPF_F_USER_BUILD_ID;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
bool user = flags & BPF_F_USER_STACK;
@@ -412,30 +416,28 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
goto err_fault;
num_elem = size / elem_size;
- if (sysctl_perf_event_max_stack < num_elem)
- init_nr = 0;
- else
- init_nr = sysctl_perf_event_max_stack - num_elem;
+ max_depth = num_elem + skip;
+ if (sysctl_perf_event_max_stack < max_depth)
+ max_depth = sysctl_perf_event_max_stack;
if (trace_in)
trace = trace_in;
else if (kernel && task)
- trace = get_callchain_entry_for_task(task, init_nr);
+ trace = get_callchain_entry_for_task(task, max_depth);
else
- trace = get_perf_callchain(regs, init_nr, kernel, user,
- sysctl_perf_event_max_stack,
+ trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
goto err_fault;
- trace_nr = trace->nr - init_nr;
- if (trace_nr < skip)
+ if (trace->nr < skip)
goto err_fault;
- trace_nr -= skip;
+ trace_nr = trace->nr - skip;
trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
copy_len = trace_nr * elem_size;
- ips = trace->ip + skip + init_nr;
+
+ ips = trace->ip + skip;
if (user && user_build_id)
stack_map_get_build_id_offset(buf, ips, trace_nr, user);
else
@@ -472,13 +474,14 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf,
u32, size, u64, flags)
{
struct pt_regs *regs;
- long res;
+ long res = -EINVAL;
if (!try_get_task_stack(task))
return -EFAULT;
regs = task_pt_regs(task);
- res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
+ if (regs)
+ res = __bpf_get_stack(regs, task, NULL, buf, size, flags);
put_task_stack(task);
return res;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fa4505f9b611..cdaa1152436a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -32,6 +32,7 @@
#include <linux/bpf-netns.h>
#include <linux/rcupdate_trace.h>
#include <linux/memcontrol.h>
+#include <linux/trace_events.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -556,16 +557,14 @@ static unsigned long bpf_map_memory_footprint(const struct bpf_map *map)
static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
{
- const struct bpf_map *map = filp->private_data;
- const struct bpf_array *array;
+ struct bpf_map *map = filp->private_data;
u32 type = 0, jited = 0;
- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
- array = container_of(map, struct bpf_array, map);
- spin_lock(&array->aux->owner.lock);
- type = array->aux->owner.type;
- jited = array->aux->owner.jited;
- spin_unlock(&array->aux->owner.lock);
+ if (map_type_contains_progs(map)) {
+ spin_lock(&map->owner.lock);
+ type = map->owner.type;
+ jited = map->owner.jited;
+ spin_unlock(&map->owner.lock);
}
seq_printf(m,
@@ -874,6 +873,7 @@ static int map_create(union bpf_attr *attr)
atomic64_set(&map->refcnt, 1);
atomic64_set(&map->usercnt, 1);
mutex_init(&map->freeze_mutex);
+ spin_lock_init(&map->owner.lock);
map->spin_lock_off = -EINVAL;
map->timer_off = -EINVAL;
@@ -986,6 +986,7 @@ struct bpf_map *bpf_map_get(u32 ufd)
return map;
}
+EXPORT_SYMBOL(bpf_map_get);
struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
@@ -1352,14 +1353,16 @@ int generic_map_delete_batch(struct bpf_map *map,
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
- maybe_wait_bpf_programs(map);
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
err = -EFAULT;
kvfree(key);
+
+ maybe_wait_bpf_programs(map);
return err;
}
@@ -1412,6 +1415,7 @@ int generic_map_update_batch(struct bpf_map *map,
if (err)
break;
+ cond_resched();
}
if (copy_to_user(&uattr->batch.count, &cp, sizeof(cp)))
@@ -1509,6 +1513,7 @@ int generic_map_lookup_batch(struct bpf_map *map,
swap(prev_key, key);
retry = MAP_LOOKUP_RETRIES;
cp++;
+ cond_resched();
}
if (err == -EFAULT)
@@ -2217,7 +2222,8 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
BPF_F_ANY_ALIGNMENT |
BPF_F_TEST_STATE_FREQ |
BPF_F_SLEEPABLE |
- BPF_F_TEST_RND_HI32))
+ BPF_F_TEST_RND_HI32 |
+ BPF_F_XDP_HAS_FRAGS))
return -EINVAL;
if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
@@ -2303,6 +2309,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
prog->aux->dst_prog = dst_prog;
prog->aux->offload_requested = !!attr->prog_ifindex;
prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE;
+ prog->aux->xdp_has_frags = attr->prog_flags & BPF_F_XDP_HAS_FRAGS;
err = security_bpf_prog_alloc(prog->aux);
if (err)
@@ -2488,6 +2495,7 @@ void bpf_link_put(struct bpf_link *link)
bpf_link_free(link);
}
}
+EXPORT_SYMBOL(bpf_link_put);
static int bpf_link_release(struct inode *inode, struct file *filp)
{
@@ -2559,7 +2567,7 @@ static int bpf_link_alloc_id(struct bpf_link *link)
* pre-allocated resources are to be freed with bpf_cleanup() call. All the
* transient state is passed around in struct bpf_link_primer.
* This is preferred way to create and initialize bpf_link, especially when
- * there are complicated and expensive operations inbetween creating bpf_link
+ * there are complicated and expensive operations in between creating bpf_link
* itself and attaching it to BPF hook. By using bpf_link_prime() and
* bpf_link_settle() kernel code using bpf_link doesn't have to perform
* expensive (and potentially failing) roll back operations in a rare case
@@ -2630,6 +2638,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)
return link;
}
+EXPORT_SYMBOL(bpf_link_get_from_fd);
struct bpf_tracing_link {
struct bpf_link link;
@@ -3014,6 +3023,11 @@ out_put_file:
fput(perf_file);
return err;
}
+#else
+static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
#endif /* CONFIG_PERF_EVENTS */
#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
@@ -3318,12 +3332,17 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_FLOW_DISSECTOR:
case BPF_SK_LOOKUP:
return netns_bpf_prog_query(attr, uattr);
+ case BPF_SK_SKB_STREAM_PARSER:
+ case BPF_SK_SKB_STREAM_VERDICT:
+ case BPF_SK_MSG_VERDICT:
+ case BPF_SK_SKB_VERDICT:
+ return sock_map_bpf_prog_query(attr, uattr);
default:
return -EINVAL;
}
}
-#define BPF_PROG_TEST_RUN_LAST_FIELD test.cpu
+#define BPF_PROG_TEST_RUN_LAST_FIELD test.batch_size
static int bpf_prog_test_run(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -4242,7 +4261,7 @@ static int tracing_bpf_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
return -EINVAL;
}
-#define BPF_LINK_CREATE_LAST_FIELD link_create.iter_info_len
+#define BPF_LINK_CREATE_LAST_FIELD link_create.kprobe_multi.cookies
static int link_create(union bpf_attr *attr, bpfptr_t uattr)
{
enum bpf_prog_type ptype;
@@ -4266,7 +4285,6 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = tracing_bpf_link_attach(attr, uattr, prog);
goto out;
case BPF_PROG_TYPE_PERF_EVENT:
- case BPF_PROG_TYPE_KPROBE:
case BPF_PROG_TYPE_TRACEPOINT:
if (attr->link_create.attach_type != BPF_PERF_EVENT) {
ret = -EINVAL;
@@ -4274,6 +4292,14 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
}
ptype = prog->type;
break;
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type != BPF_PERF_EVENT &&
+ attr->link_create.attach_type != BPF_TRACE_KPROBE_MULTI) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ptype = prog->type;
+ break;
default:
ptype = attach_type_to_prog_type(attr->link_create.attach_type);
if (ptype == BPF_PROG_TYPE_UNSPEC || ptype != prog->type) {
@@ -4305,13 +4331,16 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_xdp_link_attach(attr, prog);
break;
#endif
-#ifdef CONFIG_PERF_EVENTS
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_TRACEPOINT:
- case BPF_PROG_TYPE_KPROBE:
ret = bpf_perf_link_attach(attr, prog);
break;
-#endif
+ case BPF_PROG_TYPE_KPROBE:
+ if (attr->link_create.attach_type == BPF_PERF_EVENT)
+ ret = bpf_perf_link_attach(attr, prog);
+ else
+ ret = bpf_kprobe_multi_link_attach(attr, prog);
+ break;
default:
ret = -EINVAL;
}
@@ -4750,23 +4779,52 @@ static bool syscall_prog_is_valid_access(int off, int size,
return true;
}
-BPF_CALL_3(bpf_sys_bpf, int, cmd, void *, attr, u32, attr_size)
+BPF_CALL_3(bpf_sys_bpf, int, cmd, union bpf_attr *, attr, u32, attr_size)
{
+ struct bpf_prog * __maybe_unused prog;
+
switch (cmd) {
case BPF_MAP_CREATE:
case BPF_MAP_UPDATE_ELEM:
case BPF_MAP_FREEZE:
case BPF_PROG_LOAD:
case BPF_BTF_LOAD:
+ case BPF_LINK_CREATE:
+ case BPF_RAW_TRACEPOINT_OPEN:
break;
- /* case BPF_PROG_TEST_RUN:
- * is not part of this list to prevent recursive test_run
- */
+#ifdef CONFIG_BPF_JIT /* __bpf_prog_enter_sleepable used by trampoline and JIT */
+ case BPF_PROG_TEST_RUN:
+ if (attr->test.data_in || attr->test.data_out ||
+ attr->test.ctx_out || attr->test.duration ||
+ attr->test.repeat || attr->test.flags)
+ return -EINVAL;
+
+ prog = bpf_prog_get_type(attr->test.prog_fd, BPF_PROG_TYPE_SYSCALL);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (attr->test.ctx_size_in < prog->aux->max_ctx_offset ||
+ attr->test.ctx_size_in > U16_MAX) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ if (!__bpf_prog_enter_sleepable(prog)) {
+ /* recursion detected */
+ bpf_prog_put(prog);
+ return -EBUSY;
+ }
+ attr->test.retval = bpf_prog_run(prog, (void *) (long) attr->test.ctx_in);
+ __bpf_prog_exit_sleepable(prog, 0 /* bpf_prog_run does runtime stats */);
+ bpf_prog_put(prog);
+ return 0;
+#endif
default:
return -EINVAL;
}
return __sys_bpf(cmd, KERNEL_BPFPTR(attr), attr_size);
}
+EXPORT_SYMBOL(bpf_sys_bpf);
static const struct bpf_func_proto bpf_sys_bpf_proto = {
.func = bpf_sys_bpf,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 4b6974a195c1..ada97751ae1b 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -45,7 +45,7 @@ void *bpf_jit_alloc_exec_page(void)
set_vm_flush_reset_perms(image);
/* Keep image as writeable. The alternative is to keep flipping ro/rw
- * everytime new program is attached or detached.
+ * every time new program is attached or detached.
*/
set_memory_x((long)image, 1);
return image;
@@ -117,18 +117,6 @@ static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
tr->mod = NULL;
}
-static int is_ftrace_location(void *ip)
-{
- long addr;
-
- addr = ftrace_location((long)ip);
- if (!addr)
- return 0;
- if (WARN_ON_ONCE(addr != (long)ip))
- return -EFAULT;
- return 1;
-}
-
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
void *ip = tr->func.addr;
@@ -160,12 +148,12 @@ static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_ad
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
void *ip = tr->func.addr;
+ unsigned long faddr;
int ret;
- ret = is_ftrace_location(ip);
- if (ret < 0)
- return ret;
- tr->func.ftrace_managed = ret;
+ faddr = ftrace_location((unsigned long)ip);
+ if (faddr)
+ tr->func.ftrace_managed = true;
if (bpf_trampoline_module_get(tr))
return -ENOENT;
@@ -213,7 +201,7 @@ static void __bpf_tramp_image_put_deferred(struct work_struct *work)
im = container_of(work, struct bpf_tramp_image, work);
bpf_image_ksym_del(&im->ksym);
bpf_jit_free_exec(im->image);
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
percpu_ref_exit(&im->pcref);
kfree_rcu(im, rcu);
}
@@ -310,7 +298,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
if (!im)
goto out;
- err = bpf_jit_charge_modmem(1);
+ err = bpf_jit_charge_modmem(PAGE_SIZE);
if (err)
goto out_free_im;
@@ -332,7 +320,7 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
out_free_image:
bpf_jit_free_exec(im->image);
out_uncharge:
- bpf_jit_uncharge_modmem(1);
+ bpf_jit_uncharge_modmem(PAGE_SIZE);
out_free_im:
kfree(im);
out:
@@ -550,11 +538,12 @@ static __always_inline u64 notrace bpf_prog_start_time(void)
static void notrace inc_misses_counter(struct bpf_prog *prog)
{
struct bpf_prog_stats *stats;
+ unsigned int flags;
stats = this_cpu_ptr(prog->stats);
- u64_stats_update_begin(&stats->syncp);
+ flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->misses);
- u64_stats_update_end(&stats->syncp);
+ u64_stats_update_end_irqrestore(&stats->syncp, flags);
}
/* The logic is similar to bpf_prog_run(), but with an explicit
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a39eedecc93a..d175b70067b3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -452,7 +452,8 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
{
return base_type(type) == PTR_TO_SOCKET ||
base_type(type) == PTR_TO_TCP_SOCK ||
- base_type(type) == PTR_TO_MEM;
+ base_type(type) == PTR_TO_MEM ||
+ base_type(type) == PTR_TO_BTF_ID;
}
static bool type_is_rdonly_mem(u32 type)
@@ -535,10 +536,10 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
static const char *reg_type_str(struct bpf_verifier_env *env,
enum bpf_reg_type type)
{
- char postfix[16] = {0}, prefix[16] = {0};
+ char postfix[16] = {0}, prefix[32] = {0};
static const char * const str[] = {
[NOT_INIT] = "?",
- [SCALAR_VALUE] = "inv",
+ [SCALAR_VALUE] = "scalar",
[PTR_TO_CTX] = "ctx",
[CONST_PTR_TO_MAP] = "map_ptr",
[PTR_TO_MAP_VALUE] = "map_value",
@@ -553,7 +554,6 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
[PTR_TO_TP_BUFFER] = "tp_buffer",
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
- [PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
[PTR_TO_MEM] = "mem",
[PTR_TO_BUF] = "buf",
[PTR_TO_FUNC] = "func",
@@ -561,17 +561,20 @@ static const char *reg_type_str(struct bpf_verifier_env *env,
};
if (type & PTR_MAYBE_NULL) {
- if (base_type(type) == PTR_TO_BTF_ID ||
- base_type(type) == PTR_TO_PERCPU_BTF_ID)
+ if (base_type(type) == PTR_TO_BTF_ID)
strncpy(postfix, "or_null_", 16);
else
strncpy(postfix, "_or_null", 16);
}
if (type & MEM_RDONLY)
- strncpy(prefix, "rdonly_", 16);
+ strncpy(prefix, "rdonly_", 32);
if (type & MEM_ALLOC)
- strncpy(prefix, "alloc_", 16);
+ strncpy(prefix, "alloc_", 32);
+ if (type & MEM_USER)
+ strncpy(prefix, "user_", 32);
+ if (type & MEM_PERCPU)
+ strncpy(prefix, "percpu_", 32);
snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
prefix, str[base_type(type)], postfix);
@@ -682,74 +685,79 @@ static void print_verifier_state(struct bpf_verifier_env *env,
continue;
verbose(env, " R%d", i);
print_liveness(env, reg->live);
- verbose(env, "=%s", reg_type_str(env, t));
+ verbose(env, "=");
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
tnum_is_const(reg->var_off)) {
/* reg->off should be 0 for SCALAR_VALUE */
+ verbose(env, "%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
- if (base_type(t) == PTR_TO_BTF_ID ||
- base_type(t) == PTR_TO_PERCPU_BTF_ID)
+ const char *sep = "";
+
+ verbose(env, "%s", reg_type_str(env, t));
+ if (base_type(t) == PTR_TO_BTF_ID)
verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
- verbose(env, "(id=%d", reg->id);
- if (reg_type_may_be_refcounted_or_null(t))
- verbose(env, ",ref_obj_id=%d", reg->ref_obj_id);
+ verbose(env, "(");
+/*
+ * _a stands for append, was shortened to avoid multiline statements below.
+ * This macro is used to output a comma separated list of attributes.
+ */
+#define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, __VA_ARGS__); sep = ","; })
+
+ if (reg->id)
+ verbose_a("id=%d", reg->id);
+ if (reg_type_may_be_refcounted_or_null(t) && reg->ref_obj_id)
+ verbose_a("ref_obj_id=%d", reg->ref_obj_id);
if (t != SCALAR_VALUE)
- verbose(env, ",off=%d", reg->off);
+ verbose_a("off=%d", reg->off);
if (type_is_pkt_pointer(t))
- verbose(env, ",r=%d", reg->range);
+ verbose_a("r=%d", reg->range);
else if (base_type(t) == CONST_PTR_TO_MAP ||
base_type(t) == PTR_TO_MAP_KEY ||
base_type(t) == PTR_TO_MAP_VALUE)
- verbose(env, ",ks=%d,vs=%d",
- reg->map_ptr->key_size,
- reg->map_ptr->value_size);
+ verbose_a("ks=%d,vs=%d",
+ reg->map_ptr->key_size,
+ reg->map_ptr->value_size);
if (tnum_is_const(reg->var_off)) {
/* Typically an immediate SCALAR_VALUE, but
* could be a pointer whose offset is too big
* for reg->off
*/
- verbose(env, ",imm=%llx", reg->var_off.value);
+ verbose_a("imm=%llx", reg->var_off.value);
} else {
if (reg->smin_value != reg->umin_value &&
reg->smin_value != S64_MIN)
- verbose(env, ",smin_value=%lld",
- (long long)reg->smin_value);
+ verbose_a("smin=%lld", (long long)reg->smin_value);
if (reg->smax_value != reg->umax_value &&
reg->smax_value != S64_MAX)
- verbose(env, ",smax_value=%lld",
- (long long)reg->smax_value);
+ verbose_a("smax=%lld", (long long)reg->smax_value);
if (reg->umin_value != 0)
- verbose(env, ",umin_value=%llu",
- (unsigned long long)reg->umin_value);
+ verbose_a("umin=%llu", (unsigned long long)reg->umin_value);
if (reg->umax_value != U64_MAX)
- verbose(env, ",umax_value=%llu",
- (unsigned long long)reg->umax_value);
+ verbose_a("umax=%llu", (unsigned long long)reg->umax_value);
if (!tnum_is_unknown(reg->var_off)) {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, ",var_off=%s", tn_buf);
+ verbose_a("var_off=%s", tn_buf);
}
if (reg->s32_min_value != reg->smin_value &&
reg->s32_min_value != S32_MIN)
- verbose(env, ",s32_min_value=%d",
- (int)(reg->s32_min_value));
+ verbose_a("s32_min=%d", (int)(reg->s32_min_value));
if (reg->s32_max_value != reg->smax_value &&
reg->s32_max_value != S32_MAX)
- verbose(env, ",s32_max_value=%d",
- (int)(reg->s32_max_value));
+ verbose_a("s32_max=%d", (int)(reg->s32_max_value));
if (reg->u32_min_value != reg->umin_value &&
reg->u32_min_value != U32_MIN)
- verbose(env, ",u32_min_value=%d",
- (int)(reg->u32_min_value));
+ verbose_a("u32_min=%d", (int)(reg->u32_min_value));
if (reg->u32_max_value != reg->umax_value &&
reg->u32_max_value != U32_MAX)
- verbose(env, ",u32_max_value=%d",
- (int)(reg->u32_max_value));
+ verbose_a("u32_max=%d", (int)(reg->u32_max_value));
}
+#undef verbose_a
+
verbose(env, ")");
}
}
@@ -774,7 +782,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
if (is_spilled_reg(&state->stack[i])) {
reg = &state->stack[i].spilled_ptr;
t = reg->type;
- verbose(env, "=%s", reg_type_str(env, t));
+ verbose(env, "=%s", t == SCALAR_VALUE ? "" : reg_type_str(env, t));
if (t == SCALAR_VALUE && reg->precise)
verbose(env, "P");
if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
@@ -1546,14 +1554,15 @@ static void mark_reg_not_init(struct bpf_verifier_env *env,
static void mark_btf_ld_reg(struct bpf_verifier_env *env,
struct bpf_reg_state *regs, u32 regno,
enum bpf_reg_type reg_type,
- struct btf *btf, u32 btf_id)
+ struct btf *btf, u32 btf_id,
+ enum bpf_type_flag flag)
{
if (reg_type == SCALAR_VALUE) {
mark_reg_unknown(env, regs, regno);
return;
}
mark_reg_known_zero(env, regs, regno);
- regs[regno].type = PTR_TO_BTF_ID;
+ regs[regno].type = PTR_TO_BTF_ID | flag;
regs[regno].btf = btf;
regs[regno].btf_id = btf_id;
}
@@ -1743,7 +1752,7 @@ find_kfunc_desc(const struct bpf_prog *prog, u32 func_id, u16 offset)
}
static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
- s16 offset, struct module **btf_modp)
+ s16 offset)
{
struct bpf_kfunc_btf kf_btf = { .offset = offset };
struct bpf_kfunc_btf_tab *tab;
@@ -1797,8 +1806,6 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env,
sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]),
kfunc_btf_cmp_by_off, NULL);
}
- if (btf_modp)
- *btf_modp = b->module;
return b->btf;
}
@@ -1815,8 +1822,7 @@ void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab)
}
static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
- u32 func_id, s16 offset,
- struct module **btf_modp)
+ u32 func_id, s16 offset)
{
if (offset) {
if (offset < 0) {
@@ -1827,7 +1833,7 @@ static struct btf *find_kfunc_desc_btf(struct bpf_verifier_env *env,
return ERR_PTR(-EINVAL);
}
- return __find_kfunc_desc_btf(env, offset, btf_modp);
+ return __find_kfunc_desc_btf(env, offset);
}
return btf_vmlinux ?: ERR_PTR(-ENOENT);
}
@@ -1841,6 +1847,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
struct bpf_kfunc_desc *desc;
const char *func_name;
struct btf *desc_btf;
+ unsigned long call_imm;
unsigned long addr;
int err;
@@ -1890,7 +1897,7 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
prog_aux->kfunc_btf_tab = btf_tab;
}
- desc_btf = find_kfunc_desc_btf(env, func_id, offset, NULL);
+ desc_btf = find_kfunc_desc_btf(env, func_id, offset);
if (IS_ERR(desc_btf)) {
verbose(env, "failed to find BTF for kernel function\n");
return PTR_ERR(desc_btf);
@@ -1925,9 +1932,17 @@ static int add_kfunc_call(struct bpf_verifier_env *env, u32 func_id, s16 offset)
return -EINVAL;
}
+ call_imm = BPF_CALL_IMM(addr);
+ /* Check whether or not the relative offset overflows desc->imm */
+ if ((unsigned long)(s32)call_imm != call_imm) {
+ verbose(env, "address of kernel function %s is out of range\n",
+ func_name);
+ return -EINVAL;
+ }
+
desc = &tab->descs[tab->nr_descs++];
desc->func_id = func_id;
- desc->imm = BPF_CALL_IMM(addr);
+ desc->imm = call_imm;
desc->offset = offset;
err = btf_distill_func_proto(&env->log, desc_btf,
func_proto, func_name,
@@ -2351,7 +2366,7 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn)
if (insn->src_reg != BPF_PSEUDO_KFUNC_CALL)
return NULL;
- desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off, NULL);
+ desc_btf = find_kfunc_desc_btf(data, insn->imm, insn->off);
if (IS_ERR(desc_btf))
return "<error>";
@@ -2767,7 +2782,6 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_XDP_SOCK:
case PTR_TO_BTF_ID:
case PTR_TO_BUF:
- case PTR_TO_PERCPU_BTF_ID:
case PTR_TO_MEM:
case PTR_TO_FUNC:
case PTR_TO_MAP_KEY:
@@ -3498,11 +3512,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
#define MAX_PACKET_OFF 0xffff
-static enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog)
-{
- return prog->aux->dst_prog ? prog->aux->dst_prog->type : prog->type;
-}
-
static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
const struct bpf_call_arg_meta *meta,
enum bpf_access_type t)
@@ -3979,6 +3988,12 @@ static int __check_ptr_off_reg(struct bpf_verifier_env *env,
* is only allowed in its original, unmodified form.
*/
+ if (reg->off < 0) {
+ verbose(env, "negative offset %s ptr R%d off=%d disallowed\n",
+ reg_type_str(env, reg->type), regno, reg->off);
+ return -EACCES;
+ }
+
if (!fixed_off_ok && reg->off) {
verbose(env, "dereference of modified %s ptr R%d off=%d disallowed\n",
reg_type_str(env, reg->type), regno, reg->off);
@@ -4047,9 +4062,9 @@ static int check_buffer_access(struct bpf_verifier_env *env,
const struct bpf_reg_state *reg,
int regno, int off, int size,
bool zero_size_allowed,
- const char *buf_info,
u32 *max_access)
{
+ const char *buf_info = type_is_rdonly_mem(reg->type) ? "rdonly" : "rdwr";
int err;
err = __check_buffer_access(env, buf_info, reg, regno, off, size);
@@ -4159,6 +4174,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
struct bpf_reg_state *reg = regs + regno;
const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
const char *tname = btf_name_by_offset(reg->btf, t->name_off);
+ enum bpf_type_flag flag = 0;
u32 btf_id;
int ret;
@@ -4178,9 +4194,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
return -EACCES;
}
+ if (reg->type & MEM_USER) {
+ verbose(env,
+ "R%d is ptr_%s access user memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
+ if (reg->type & MEM_PERCPU) {
+ verbose(env,
+ "R%d is ptr_%s access percpu memory: off=%d\n",
+ regno, tname, off);
+ return -EACCES;
+ }
+
if (env->ops->btf_struct_access) {
ret = env->ops->btf_struct_access(&env->log, reg->btf, t,
- off, size, atype, &btf_id);
+ off, size, atype, &btf_id, &flag);
} else {
if (atype != BPF_READ) {
verbose(env, "only read is supported\n");
@@ -4188,14 +4218,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
}
ret = btf_struct_access(&env->log, reg->btf, t, off, size,
- atype, &btf_id);
+ atype, &btf_id, &flag);
}
if (ret < 0)
return ret;
if (atype == BPF_READ && value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id);
+ mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag);
return 0;
}
@@ -4208,6 +4238,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
{
struct bpf_reg_state *reg = regs + regno;
struct bpf_map *map = reg->map_ptr;
+ enum bpf_type_flag flag = 0;
const struct btf_type *t;
const char *tname;
u32 btf_id;
@@ -4245,12 +4276,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env,
return -EACCES;
}
- ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id);
+ ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag);
if (ret < 0)
return ret;
if (value_regno >= 0)
- mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id);
+ mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id, flag);
return 0;
}
@@ -4451,7 +4482,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
if (err < 0)
return err;
- err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf, &btf_id);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type, &btf,
+ &btf_id);
if (err)
verbose_linfo(env, insn_idx, "; ");
if (!err && t == BPF_READ && value_regno >= 0) {
@@ -4535,7 +4567,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_tp_buffer_access(env, reg, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown(env, regs, value_regno);
- } else if (reg->type == PTR_TO_BTF_ID) {
+ } else if (base_type(reg->type) == PTR_TO_BTF_ID &&
+ !type_may_be_null(reg->type)) {
err = check_ptr_to_btf_access(env, regs, regno, off, size, t,
value_regno);
} else if (reg->type == CONST_PTR_TO_MAP) {
@@ -4543,7 +4576,6 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
value_regno);
} else if (base_type(reg->type) == PTR_TO_BUF) {
bool rdonly_mem = type_is_rdonly_mem(reg->type);
- const char *buf_info;
u32 *max_access;
if (rdonly_mem) {
@@ -4552,15 +4584,13 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
regno, reg_type_str(env, reg->type));
return -EACCES;
}
- buf_info = "rdonly";
max_access = &env->prog->aux->max_rdonly_access;
} else {
- buf_info = "rdwr";
max_access = &env->prog->aux->max_rdwr_access;
}
err = check_buffer_access(env, reg, regno, off, size, false,
- buf_info, max_access);
+ max_access);
if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
mark_reg_unknown(env, regs, value_regno);
@@ -4781,7 +4811,7 @@ static int check_stack_range_initialized(
}
if (is_spilled_reg(&state->stack[spi]) &&
- state->stack[spi].spilled_ptr.type == PTR_TO_BTF_ID)
+ base_type(state->stack[spi].spilled_ptr.type) == PTR_TO_BTF_ID)
goto mark;
if (is_spilled_reg(&state->stack[spi]) &&
@@ -4823,7 +4853,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
- const char *buf_info;
u32 *max_access;
switch (base_type(reg->type)) {
@@ -4850,15 +4879,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
if (meta && meta->raw_mode)
return -EACCES;
- buf_info = "rdonly";
max_access = &env->prog->aux->max_rdonly_access;
} else {
- buf_info = "rdwr";
max_access = &env->prog->aux->max_rdwr_access;
}
return check_buffer_access(env, reg, regno, reg->off,
access_size, zero_size_allowed,
- buf_info, max_access);
+ max_access);
case PTR_TO_STACK:
return check_stack_range_initialized(
env,
@@ -4877,6 +4904,62 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
}
}
+static int check_mem_size_reg(struct bpf_verifier_env *env,
+ struct bpf_reg_state *reg, u32 regno,
+ bool zero_size_allowed,
+ struct bpf_call_arg_meta *meta)
+{
+ int err;
+
+ /* This is used to refine r0 return value bounds for helpers
+ * that enforce this value as an upper bound on return values.
+ * See do_refine_retval_range() for helpers that can refine
+ * the return value. C type of helper is u32 so we pull register
+ * bound from umax_value however, if negative verifier errors
+ * out. Only upper bounds can be learned because retval is an
+ * int type and negative retvals are allowed.
+ */
+ if (meta)
+ meta->msize_max_value = reg->umax_value;
+
+ /* The register is SCALAR_VALUE; the access check
+ * happens using its boundaries.
+ */
+ if (!tnum_is_const(reg->var_off))
+ /* For unprivileged variable accesses, disable raw
+ * mode so that the program is required to
+ * initialize all the memory that the helper could
+ * just partially fill up.
+ */
+ meta = NULL;
+
+ if (reg->smin_value < 0) {
+ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
+ regno);
+ return -EACCES;
+ }
+
+ if (reg->umin_value == 0) {
+ err = check_helper_mem_access(env, regno - 1, 0,
+ zero_size_allowed,
+ meta);
+ if (err)
+ return err;
+ }
+
+ if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
+ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+ regno);
+ return -EACCES;
+ }
+ err = check_helper_mem_access(env, regno - 1,
+ reg->umax_value,
+ zero_size_allowed, meta);
+ if (!err)
+ err = mark_chain_precision(env, regno);
+ return err;
+}
+
int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
u32 regno, u32 mem_size)
{
@@ -4900,6 +4983,28 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return check_helper_mem_access(env, regno, mem_size, true, NULL);
}
+int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ u32 regno)
+{
+ struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1];
+ bool may_be_null = type_may_be_null(mem_reg->type);
+ struct bpf_reg_state saved_reg;
+ int err;
+
+ WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5);
+
+ if (may_be_null) {
+ saved_reg = *mem_reg;
+ mark_ptr_not_null_reg(mem_reg);
+ }
+
+ err = check_mem_size_reg(env, reg, regno, true, NULL);
+
+ if (may_be_null)
+ *mem_reg = saved_reg;
+ return err;
+}
+
/* Implementation details:
* bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
* Two bpf_map_lookups (even with the same key) will have different reg->id.
@@ -5159,7 +5264,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM | ME
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
-static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
+static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } };
static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } };
@@ -5260,6 +5365,60 @@ found:
return 0;
}
+int check_func_arg_reg_off(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg, int regno,
+ enum bpf_arg_type arg_type,
+ bool is_release_func)
+{
+ bool fixed_off_ok = false, release_reg;
+ enum bpf_reg_type type = reg->type;
+
+ switch ((u32)type) {
+ case SCALAR_VALUE:
+ /* Pointer types where reg offset is explicitly allowed: */
+ case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
+ case PTR_TO_MAP_KEY:
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MEM:
+ case PTR_TO_MEM | MEM_RDONLY:
+ case PTR_TO_MEM | MEM_ALLOC:
+ case PTR_TO_BUF:
+ case PTR_TO_BUF | MEM_RDONLY:
+ case PTR_TO_STACK:
+ /* Some of the argument types nevertheless require a
+ * zero register offset.
+ */
+ if (arg_type != ARG_PTR_TO_ALLOC_MEM)
+ return 0;
+ break;
+ /* All the rest must be rejected, except PTR_TO_BTF_ID which allows
+ * fixed offset.
+ */
+ case PTR_TO_BTF_ID:
+ /* When referenced PTR_TO_BTF_ID is passed to release function,
+ * it's fixed offset must be 0. We rely on the property that
+ * only one referenced register can be passed to BPF helpers and
+ * kfuncs. In the other cases, fixed offset can be non-zero.
+ */
+ release_reg = is_release_func && reg->ref_obj_id;
+ if (release_reg && reg->off) {
+ verbose(env, "R%d must have zero offset when passed to release func\n",
+ regno);
+ return -EINVAL;
+ }
+ /* For release_reg == true, fixed_off_ok must be false, but we
+ * already checked and rejected reg->off != 0 above, so set to
+ * true to allow fixed offset for all other cases.
+ */
+ fixed_off_ok = true;
+ break;
+ default:
+ break;
+ }
+ return __check_ptr_off_reg(env, reg, regno, fixed_off_ok);
+}
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn)
@@ -5309,36 +5468,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
if (err)
return err;
- switch ((u32)type) {
- case SCALAR_VALUE:
- /* Pointer types where reg offset is explicitly allowed: */
- case PTR_TO_PACKET:
- case PTR_TO_PACKET_META:
- case PTR_TO_MAP_KEY:
- case PTR_TO_MAP_VALUE:
- case PTR_TO_MEM:
- case PTR_TO_MEM | MEM_RDONLY:
- case PTR_TO_MEM | MEM_ALLOC:
- case PTR_TO_BUF:
- case PTR_TO_BUF | MEM_RDONLY:
- case PTR_TO_STACK:
- /* Some of the argument types nevertheless require a
- * zero register offset.
- */
- if (arg_type == ARG_PTR_TO_ALLOC_MEM)
- goto force_off_check;
- break;
- /* All the rest must be rejected: */
- default:
-force_off_check:
- err = __check_ptr_off_reg(env, reg, regno,
- type == PTR_TO_BTF_ID);
- if (err < 0)
- return err;
- break;
- }
+ err = check_func_arg_reg_off(env, reg, regno, arg_type, is_release_function(meta->func_id));
+ if (err)
+ return err;
skip_type_check:
+ /* check_func_arg_reg_off relies on only one referenced register being
+ * allowed for BPF helpers.
+ */
if (reg->ref_obj_id) {
if (meta->ref_obj_id) {
verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
@@ -5439,51 +5576,7 @@ skip_type_check:
} else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
- /* This is used to refine r0 return value bounds for helpers
- * that enforce this value as an upper bound on return values.
- * See do_refine_retval_range() for helpers that can refine
- * the return value. C type of helper is u32 so we pull register
- * bound from umax_value however, if negative verifier errors
- * out. Only upper bounds can be learned because retval is an
- * int type and negative retvals are allowed.
- */
- meta->msize_max_value = reg->umax_value;
-
- /* The register is SCALAR_VALUE; the access check
- * happens using its boundaries.
- */
- if (!tnum_is_const(reg->var_off))
- /* For unprivileged variable accesses, disable raw
- * mode so that the program is required to
- * initialize all the memory that the helper could
- * just partially fill up.
- */
- meta = NULL;
-
- if (reg->smin_value < 0) {
- verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
- regno);
- return -EACCES;
- }
-
- if (reg->umin_value == 0) {
- err = check_helper_mem_access(env, regno - 1, 0,
- zero_size_allowed,
- meta);
- if (err)
- return err;
- }
-
- if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
- verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
- regno);
- return -EACCES;
- }
- err = check_helper_mem_access(env, regno - 1,
- reg->umax_value,
- zero_size_allowed, meta);
- if (!err)
- err = mark_chain_precision(env, regno);
+ err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
} else if (arg_type_is_alloc_size(arg_type)) {
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
@@ -6842,22 +6935,23 @@ static void mark_btf_func_reg_size(struct bpf_verifier_env *env, u32 regno,
}
}
-static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx_p)
{
const struct btf_type *t, *func, *func_proto, *ptr_type;
struct bpf_reg_state *regs = cur_regs(env);
const char *func_name, *ptr_type_name;
u32 i, nargs, func_id, ptr_type_id;
- struct module *btf_mod = NULL;
+ int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
struct btf *desc_btf;
- int err;
+ bool acq;
/* skip for now, but return error when we find this in fixup_kfunc_call */
if (!insn->imm)
return 0;
- desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off, &btf_mod);
+ desc_btf = find_kfunc_desc_btf(env, insn->imm, insn->off);
if (IS_ERR(desc_btf))
return PTR_ERR(desc_btf);
@@ -6866,23 +6960,43 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
func_name = btf_name_by_offset(desc_btf, func->name_off);
func_proto = btf_type_by_id(desc_btf, func->type);
- if (!env->ops->check_kfunc_call ||
- !env->ops->check_kfunc_call(func_id, btf_mod)) {
+ if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_CHECK, func_id)) {
verbose(env, "calling kernel function %s is not allowed\n",
func_name);
return -EACCES;
}
+ acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_ACQUIRE, func_id);
+
/* Check the arguments */
err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
- if (err)
+ if (err < 0)
return err;
+ /* In case of release function, we get register number of refcounted
+ * PTR_TO_BTF_ID back from btf_check_kfunc_arg_match, do the release now
+ */
+ if (err) {
+ err = release_reference(env, regs[err].ref_obj_id);
+ if (err) {
+ verbose(env, "kfunc %s#%d reference has not been acquired before\n",
+ func_name, func_id);
+ return err;
+ }
+ }
for (i = 0; i < CALLER_SAVED_REGS; i++)
mark_reg_not_init(env, regs, caller_saved[i]);
/* Check return type */
t = btf_type_skip_modifiers(desc_btf, func_proto->type, NULL);
+
+ if (acq && !btf_type_is_ptr(t)) {
+ verbose(env, "acquire kernel function does not return PTR_TO_BTF_ID\n");
+ return -EINVAL;
+ }
+
if (btf_type_is_scalar(t)) {
mark_reg_unknown(env, regs, BPF_REG_0);
mark_btf_func_reg_size(env, BPF_REG_0, t->size);
@@ -6901,7 +7015,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn)
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
+ if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
+ BTF_KFUNC_TYPE_RET_NULL, func_id)) {
+ regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
+ /* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
+ regs[BPF_REG_0].id = ++env->id_gen;
+ }
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
+ if (acq) {
+ int id = acquire_reference_state(env, insn_idx);
+
+ if (id < 0)
+ return id;
+ regs[BPF_REG_0].id = id;
+ regs[BPF_REG_0].ref_obj_id = id;
+ }
} /* else { add_kfunc_call() ensures it is btf_type_is_void(t) } */
nargs = btf_type_vlen(func_proto);
@@ -9548,7 +9676,6 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
dst_reg->mem_size = aux->btf_var.mem_size;
break;
case PTR_TO_BTF_ID:
- case PTR_TO_PERCPU_BTF_ID:
dst_reg->btf = aux->btf_var.btf;
dst_reg->btf_id = aux->btf_var.btf_id;
break;
@@ -10273,8 +10400,7 @@ static void adjust_btf_func(struct bpf_verifier_env *env)
aux->func_info[i].insn_off = env->subprog_info[i].start;
}
-#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \
- sizeof(((struct bpf_line_info *)(0))->line_col))
+#define MIN_BPF_LINEINFO_SIZE offsetofend(struct bpf_line_info, line_col)
#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE
static int check_btf_line(struct bpf_verifier_env *env,
@@ -11549,7 +11675,7 @@ static int do_check(struct bpf_verifier_env *env)
if (insn->src_reg == BPF_PSEUDO_CALL)
err = check_func_call(env, insn, &env->insn_idx);
else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
- err = check_kfunc_call(env, insn);
+ err = check_kfunc_call(env, insn, &env->insn_idx);
else
err = check_helper_call(env, insn, &env->insn_idx);
if (err)
@@ -11748,7 +11874,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
type = t->type;
t = btf_type_skip_modifiers(btf, type, NULL);
if (percpu) {
- aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
+ aux->btf_var.reg_type = PTR_TO_BTF_ID | MEM_PERCPU;
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
} else if (!btf_type_is_struct(t)) {
@@ -12897,6 +13023,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
+ func[i]->blinding_requested = prog->blinding_requested;
func[i]->aux->kfunc_tab = prog->aux->kfunc_tab;
func[i]->aux->kfunc_btf_tab = prog->aux->kfunc_btf_tab;
func[i]->aux->linfo = prog->aux->linfo;
@@ -12992,6 +13119,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
prog->jited = 1;
prog->bpf_func = func[0]->bpf_func;
+ prog->jited_len = func[0]->jited_len;
prog->aux->func = func;
prog->aux->func_cnt = env->subprog_cnt;
bpf_prog_jit_attempt_done(prog);
@@ -13019,6 +13147,7 @@ out_free:
out_undo_insn:
/* cleanup main prog to be interpreted */
prog->jit_requested = 0;
+ prog->blinding_requested = 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (!bpf_pseudo_call(insn))
continue;
@@ -13112,7 +13241,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
{
struct bpf_prog *prog = env->prog;
enum bpf_attach_type eatype = prog->expected_attach_type;
- bool expect_blinding = bpf_jit_blinding_enabled(prog);
enum bpf_prog_type prog_type = resolve_prog_type(prog);
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
@@ -13276,7 +13404,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
insn->code = BPF_JMP | BPF_TAIL_CALL;
aux = &env->insn_aux_data[i + delta];
- if (env->bpf_capable && !expect_blinding &&
+ if (env->bpf_capable && !prog->blinding_requested &&
prog->jit_requested &&
!bpf_map_key_poisoned(aux) &&
!bpf_map_ptr_poisoned(aux) &&
@@ -13364,6 +13492,26 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
+ if (insn->imm == BPF_FUNC_task_storage_get ||
+ insn->imm == BPF_FUNC_sk_storage_get ||
+ insn->imm == BPF_FUNC_inode_storage_get) {
+ if (env->prog->aux->sleepable)
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
+ else
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
+ insn_buf[1] = *insn;
+ cnt = 2;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ goto patch_call_imm;
+ }
+
/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
* and other inlining handlers are currently limited to 64 bit
* only.
diff --git a/kernel/capability.c b/kernel/capability.c
index 46a361dde042..765194f5d678 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -360,6 +360,7 @@ bool has_capability_noaudit(struct task_struct *t, int cap)
{
return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
+EXPORT_SYMBOL(has_capability_noaudit);
static bool ns_capable_common(struct user_namespace *ns,
int cap,
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 41e0837a5a0b..afc6c0e9c966 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -546,9 +546,19 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
+ struct cgroup_file_ctx *ctx;
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ ctx = of->priv;
+ if ((ctx->ns->user_ns != &init_user_ns) ||
+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
@@ -954,6 +964,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return invalfc(fc, "release_agent respecified");
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
+ return invalfc(fc, "Setting release_agent not allowed");
ctx->release_agent = param->string;
param->string = NULL;
break;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b31e1465868a..adb820e98f24 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1302,7 +1302,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct cgroup *root_cgrp = kf_root->kn->priv;
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
return root_cgrp->root;
}
@@ -2025,7 +2025,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
- root_cgrp->kn = root->kf_root->kn;
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
@@ -3643,6 +3643,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
@@ -3650,8 +3656,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return PTR_ERR(new);
}
- psi_trigger_replace(&ctx->psi.trigger, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3690,7 +3695,7 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
- psi_trigger_replace(&ctx->psi.trigger, NULL);
+ psi_trigger_destroy(ctx->psi.trigger);
}
bool cgroup_psi_enabled(void)
@@ -6161,6 +6166,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
if (ret)
goto err;
+ /*
+ * Spawning a task directly into a cgroup works by passing a file
+ * descriptor to the target cgroup directory. This can even be an O_PATH
+ * file descriptor. But it can never be a cgroup.procs file descriptor.
+ * This was done on purpose so spawning into a cgroup could be
+ * conceptualized as an atomic
+ *
+ * fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ * write(fd, <child-pid>, ...);
+ *
+ * sequence, i.e. it's a shorthand for the caller opening and writing
+ * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ * to always use the caller's credentials.
+ */
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
!(kargs->flags & CLONE_THREAD),
current->nsproxy->cgroup_ns);
@@ -6224,6 +6243,7 @@ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This prepares a new css_set for the child process which the child will
* be attached to in cgroup_post_fork().
@@ -6286,6 +6306,7 @@ void cgroup_cancel_fork(struct task_struct *child,
/**
* cgroup_post_fork - finalize cgroup setup for the child process
* @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* Attach the child process to its css_set calling the subsystem fork()
* callbacks.
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index dc653ab26e50..9390bfd9f1cd 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/*
* There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
* of the situation.
*/
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
@@ -591,6 +591,35 @@ static inline void free_cpuset(struct cpuset *cs)
}
/*
+ * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
*
@@ -614,20 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
- int ret;
-
- /* The checks don't apply to root cpuset */
- if (cur == &top_cpuset)
- return 0;
+ int ret = 0;
rcu_read_lock();
- par = parent_cs(cur);
- /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode())
+ ret = validate_change_legacy(cur, trial);
+ if (ret)
+ goto out;
+
+ /* Remaining checks don't apply to root cpuset */
+ if (cur == &top_cpuset)
goto out;
+ par = parent_cs(cur);
+
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
@@ -803,7 +833,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
goto done;
}
@@ -833,7 +863,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_FLAG_DOMAIN))))
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
if (root_load_balance &&
@@ -922,7 +952,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -1151,7 +1181,7 @@ enum subparts_cmd {
* effective_cpus. The function will return 0 if all the CPUs listed in
* cpus_allowed can be granted or an error code will be returned.
*
- * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * For partcmd_disable, the cpuset is being transformed from a partition
* root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
* into parent's effective_cpus. 0 should always be returned.
@@ -1175,9 +1205,7 @@ enum subparts_cmd {
*
* Because of the implicit cpu exclusive nature of a partition root,
* cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
+ * permitted when checked by validate_change().
*/
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
struct cpumask *newmask,
@@ -1522,10 +1550,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
/*
* Check all its siblings and call update_cpumasks_hier()
* if their use_parent_ecpus flag is set in order for them
* to use the right effective_cpus value.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1533,8 +1566,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
if (!sibling->use_parent_ecpus)
continue;
+ if (!css_tryget_online(&sibling->css))
+ continue;
+ rcu_read_unlock();
update_cpumasks_hier(sibling, tmp);
+ rcu_read_lock();
+ css_put(&sibling->css);
}
rcu_read_unlock();
}
@@ -1607,8 +1645,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Make sure that subparts_cpus is a subset of cpus_allowed.
*/
if (cs->nr_subparts_cpus) {
- cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
}
spin_unlock_irq(&callback_lock);
@@ -1990,7 +2027,7 @@ out:
}
/*
- * update_prstate - update partititon_root_state
+ * update_prstate - update partition_root_state
* cs: the cpuset to update
* new_prs: new partition root state
*
@@ -2252,6 +2289,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2305,6 +2343,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
@@ -2840,7 +2879,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -3037,7 +3076,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
+ * as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
@@ -3485,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
return cs;
}
-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
+/*
+ * __cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -3657,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void)
int cpuset_memory_pressure_enabled __read_mostly;
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
@@ -3673,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
- **/
+ */
void __cpuset_memory_pressure_bump(void)
{
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..617861a54793 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 9d331ba44870..24b5c2ab5598 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -153,8 +153,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
@@ -166,7 +175,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
@@ -315,7 +324,7 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_base_stat cur, delta;
+ struct cgroup_base_stat delta;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
@@ -325,11 +334,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
+ delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
- delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index eb0029c9a6a6..e400fbbc8aba 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -1,5 +1,5 @@
# KEEP ALPHABETICALLY SORTED
-# CONFIG_AIO is not set
+# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
# CONFIG_INPUT_MOUSE is not set
# CONFIG_LEGACY_PTYS is not set
diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config
index e9ffb0cc1eec..e8db8d938661 100644
--- a/kernel/configs/debug.config
+++ b/kernel/configs/debug.config
@@ -17,6 +17,7 @@ CONFIG_SYMBOLIC_ERRNAME=y
# Compile-time checks and compiler options
#
CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_FRAME_WARN=2048
CONFIG_SECTION_MISMATCH_WARN_ONLY=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 407a2568f35e..5797c2a7a93f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -34,6 +34,7 @@
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
+#include <linux/random.h>
#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
@@ -1488,8 +1489,8 @@ int freeze_secondary_cpus(int primary)
cpu_maps_update_begin();
if (primary == -1) {
primary = cpumask_first(cpu_online_mask);
- if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
- primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+ if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
+ primary = housekeeping_any_cpu(HK_TYPE_TIMER);
} else {
if (!cpu_online(primary))
primary = cpumask_first(cpu_online_mask);
@@ -1659,6 +1660,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = perf_event_init_cpu,
.teardown.single = perf_event_exit_cpu,
},
+ [CPUHP_RANDOM_PREPARE] = {
+ .name = "random:prepare",
+ .startup.single = random_prepare_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_WORKQUEUE_PREP] = {
.name = "workqueue:prepare",
.startup.single = workqueue_prepare_cpu,
@@ -1782,6 +1788,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = workqueue_online_cpu,
.teardown.single = workqueue_offline_cpu,
},
+ [CPUHP_AP_RANDOM_ONLINE] = {
+ .name = "random:online",
+ .startup.single = random_online_cpu,
+ .teardown.single = NULL,
+ },
[CPUHP_AP_RCUTREE_ONLINE] = {
.name = "RCU/tree:online",
.startup.single = rcutree_online_cpu,
diff --git a/kernel/cred.c b/kernel/cred.c
index 473d17c431f3..e10c15f51c1f 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -665,21 +665,16 @@ EXPORT_SYMBOL(cred_fscmp);
int set_cred_ucounts(struct cred *new)
{
- struct task_struct *task = current;
- const struct cred *old = task->real_cred;
struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
- if (new->user == old->user && new->user_ns == old->user_ns)
- return 0;
-
/*
* This optimization is needed because alloc_ucounts() uses locks
* for table lookups.
*/
- if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->uid))
return 0;
- if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid)))
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->uid)))
return -EAGAIN;
new->ucounts = new_ucounts;
@@ -875,7 +870,7 @@ static void dump_invalid_creds(const struct cred *cred, const char *label,
/*
* report use of invalid credentials
*/
-void __invalid_creds(const struct cred *cred, const char *file, unsigned line)
+void __noreturn __invalid_creds(const struct cred *cred, const char *file, unsigned line)
{
printk(KERN_ERR "CRED: Invalid credentials\n");
printk(KERN_ERR "CRED: At %s:%u\n", file, line);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index df2bface866e..85cb51c4a17e 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -291,7 +291,7 @@ int kdb_getarea_size(void *res, unsigned long addr, size_t size)
*/
int kdb_putarea_size(unsigned long addr, void *res, size_t size)
{
- int ret = copy_from_kernel_nofault((char *)addr, (char *)res, size);
+ int ret = copy_to_kernel_nofault((char *)addr, (char *)res, size);
if (ret) {
if (!KDB_STATE(SUPPRESS)) {
kdb_func_printf("Bad address 0x%lx\n", addr);
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 1b02179758cb..56866aaa2ae1 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -110,15 +110,10 @@ config DMA_GLOBAL_POOL
select DMA_DECLARE_COHERENT
bool
-config DMA_REMAP
- bool
- depends on MMU
- select DMA_NONCOHERENT_MMAP
-
config DMA_DIRECT_REMAP
bool
- select DMA_REMAP
select DMA_COHERENT_POOL
+ select DMA_NONCOHERENT_MMAP
config DMA_CMA
bool "DMA Contiguous Memory Allocator"
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index 0dd65ec1d234..21926e46ef4f 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -8,5 +8,5 @@ obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
obj-$(CONFIG_DMA_API_DEBUG) += debug.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o
-obj-$(CONFIG_DMA_REMAP) += remap.o
+obj-$(CONFIG_MMU) += remap.o
obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c
index 3d63d91cba5c..6ea80ae42622 100644
--- a/kernel/dma/contiguous.c
+++ b/kernel/dma/contiguous.c
@@ -399,8 +399,6 @@ static const struct reserved_mem_ops rmem_cma_ops = {
static int __init rmem_cma_setup(struct reserved_mem *rmem)
{
- phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
- phys_addr_t mask = align - 1;
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
@@ -416,7 +414,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
- if ((rmem->base & mask) || (rmem->size & mask)) {
+ if (!IS_ALIGNED(rmem->base | rmem->size, CMA_MIN_ALIGNMENT_BYTES)) {
pr_err("Reserved memory: incorrect alignment of CMA region\n");
return -EINVAL;
}
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 7a14ca29c377..f8ff598596b8 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -927,7 +927,7 @@ static __init int dma_debug_cmdline(char *str)
global_disable = true;
}
- return 0;
+ return 1;
}
static __init int dma_debug_entries_cmdline(char *str)
@@ -936,7 +936,7 @@ static __init int dma_debug_entries_cmdline(char *str)
return -EINVAL;
if (!get_option(&str, &nr_prealloc_entries))
nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
- return 0;
+ return 1;
}
__setup("dma_debug=", dma_debug_cmdline);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 50f48e9e4598..9743c6ccce1a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -265,28 +265,28 @@ void *dma_direct_alloc(struct device *dev, size_t size,
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
+
+ /*
+ * dma_alloc_contiguous can return highmem pages depending on a
+ * combination the cma= arguments and per-arch setup. These need to be
+ * remapped to return a kernel virtual address.
+ */
if (PageHighMem(page)) {
- /*
- * Depending on the cma= arguments and per-arch setup,
- * dma_alloc_contiguous could return highmem pages.
- * Without remapping there is no way to return them here, so
- * log an error and fail.
- */
- if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
- dev_info(dev, "Rejecting highmem page from CMA.\n");
- goto out_free_pages;
- }
remap = true;
set_uncached = false;
}
if (remap) {
+ pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs);
+
+ if (force_dma_unencrypted(dev))
+ prot = pgprot_decrypted(prot);
+
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
/* create a coherent mapping */
- ret = dma_common_contiguous_remap(page, size,
- dma_pgprot(dev, PAGE_KERNEL, attrs),
+ ret = dma_common_contiguous_remap(page, size, prot,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
@@ -349,7 +349,7 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
- if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
+ if (is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
@@ -539,6 +539,8 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
int ret = -ENXIO;
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
+ if (force_dma_unencrypted(dev))
+ vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
index 9b9af1bd6be3..0520a8f4fb1d 100644
--- a/kernel/dma/map_benchmark.c
+++ b/kernel/dma/map_benchmark.c
@@ -11,6 +11,7 @@
#include <linux/dma-mapping.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
+#include <linux/map_benchmark.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/pci.h>
@@ -18,30 +19,6 @@
#include <linux/slab.h>
#include <linux/timekeeping.h>
-#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark)
-#define DMA_MAP_MAX_THREADS 1024
-#define DMA_MAP_MAX_SECONDS 300
-#define DMA_MAP_MAX_TRANS_DELAY (10 * NSEC_PER_MSEC)
-
-#define DMA_MAP_BIDIRECTIONAL 0
-#define DMA_MAP_TO_DEVICE 1
-#define DMA_MAP_FROM_DEVICE 2
-
-struct map_benchmark {
- __u64 avg_map_100ns; /* average map latency in 100ns */
- __u64 map_stddev; /* standard deviation of map latency */
- __u64 avg_unmap_100ns; /* as above */
- __u64 unmap_stddev;
- __u32 threads; /* how many threads will do map/unmap in parallel */
- __u32 seconds; /* how long the test will last */
- __s32 node; /* which numa node this benchmark will run on */
- __u32 dma_bits; /* DMA addressing capability */
- __u32 dma_dir; /* DMA data direction */
- __u32 dma_trans_ns; /* time for DMA transmission in ns */
- __u32 granule; /* how many PAGE_SIZE will do map/unmap once a time */
- __u8 expansion[76]; /* For future use */
-};
-
struct map_benchmark_data {
struct map_benchmark bparam;
struct device *dev;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9478eccd1c8e..db7244291b74 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -407,8 +407,6 @@ EXPORT_SYMBOL(dma_get_sgtable_attrs);
*/
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
- if (force_dma_unencrypted(dev))
- prot = pgprot_decrypted(prot);
if (dev_is_dma_coherent(dev))
return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
@@ -745,7 +743,6 @@ int dma_set_mask(struct device *dev, u64 mask)
}
EXPORT_SYMBOL(dma_set_mask);
-#ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK
int dma_set_coherent_mask(struct device *dev, u64 mask)
{
/*
@@ -761,7 +758,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask)
return 0;
}
EXPORT_SYMBOL(dma_set_coherent_mask);
-#endif
size_t dma_max_mapping_size(struct device *dev)
{
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f1e7ea160b43..73a41cec9e38 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -21,40 +21,33 @@
#define pr_fmt(fmt) "software IO TLB: " fmt
#include <linux/cache.h>
+#include <linux/cc_platform.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
-#include <linux/mm.h>
#include <linux/export.h>
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/iommu-helper.h>
+#include <linux/init.h>
+#include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/pfn.h>
+#include <linux/scatterlist.h>
+#include <linux/set_memory.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/swiotlb.h>
-#include <linux/pfn.h>
#include <linux/types.h>
-#include <linux/ctype.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/scatterlist.h>
-#include <linux/cc_platform.h>
-#include <linux/set_memory.h>
-#ifdef CONFIG_DEBUG_FS
-#include <linux/debugfs.h>
-#endif
#ifdef CONFIG_DMA_RESTRICTED_POOL
-#include <linux/io.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/of_reserved_mem.h>
#include <linux/slab.h>
#endif
-#include <asm/io.h>
-#include <asm/dma.h>
-
-#include <linux/io.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/iommu-helper.h>
-
#define CREATE_TRACE_POINTS
#include <trace/events/swiotlb.h>
@@ -207,8 +200,6 @@ void __init swiotlb_update_mem_attributes(void)
mem->vaddr = swiotlb_mem_remap(mem, bytes);
if (!mem->vaddr)
mem->vaddr = vaddr;
-
- memset(mem->vaddr, 0, bytes);
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
@@ -627,9 +618,14 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
tlb_addr = slot_addr(mem->start, index) + offset;
- if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
+ /*
+ * When dir == DMA_FROM_DEVICE we could omit the copy from the orig
+ * to the tlb buffer, if we knew for sure the device will
+ * overwirte the entire current content. But we don't. Thus
+ * unconditional bounce may prevent leaking swiotlb content (i.e.
+ * kernel memory) to user-space.
+ */
+ swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
}
@@ -758,47 +754,29 @@ bool is_swiotlb_active(struct device *dev)
}
EXPORT_SYMBOL_GPL(is_swiotlb_active);
-#ifdef CONFIG_DEBUG_FS
-static struct dentry *debugfs_dir;
-
-static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem)
+static void swiotlb_create_debugfs_files(struct io_tlb_mem *mem,
+ const char *dirname)
{
+ mem->debugfs = debugfs_create_dir(dirname, io_tlb_default_mem.debugfs);
+ if (!mem->nslabs)
+ return;
+
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, &mem->nslabs);
debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, &mem->used);
}
-static int __init swiotlb_create_default_debugfs(void)
+static int __init __maybe_unused swiotlb_create_default_debugfs(void)
{
- struct io_tlb_mem *mem = &io_tlb_default_mem;
-
- debugfs_dir = debugfs_create_dir("swiotlb", NULL);
- if (mem->nslabs) {
- mem->debugfs = debugfs_dir;
- swiotlb_create_debugfs_files(mem);
- }
+ swiotlb_create_debugfs_files(&io_tlb_default_mem, "swiotlb");
return 0;
}
+#ifdef CONFIG_DEBUG_FS
late_initcall(swiotlb_create_default_debugfs);
-
#endif
#ifdef CONFIG_DMA_RESTRICTED_POOL
-#ifdef CONFIG_DEBUG_FS
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
- struct io_tlb_mem *mem = rmem->priv;
-
- mem->debugfs = debugfs_create_dir(rmem->name, debugfs_dir);
- swiotlb_create_debugfs_files(mem);
-}
-#else
-static void rmem_swiotlb_debugfs_init(struct reserved_mem *rmem)
-{
-}
-#endif
-
struct page *swiotlb_alloc(struct device *dev, size_t size)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
@@ -845,8 +823,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
if (!mem)
return -ENOMEM;
- mem->slots = kzalloc(array_size(sizeof(*mem->slots), nslabs),
- GFP_KERNEL);
+ mem->slots = kcalloc(nslabs, sizeof(*mem->slots), GFP_KERNEL);
if (!mem->slots) {
kfree(mem);
return -ENOMEM;
@@ -860,7 +837,7 @@ static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
rmem->priv = mem;
- rmem_swiotlb_debugfs_init(rmem);
+ swiotlb_create_debugfs_files(mem, rmem->name);
}
dev->dma_io_tlb_mem = mem;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bad713684c2e..e57a224d6b79 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -2,7 +2,9 @@
#include <linux/context_tracking.h>
#include <linux/entry-common.h>
+#include <linux/resume_user_mode.h>
#include <linux/highmem.h>
+#include <linux/jump_label.h>
#include <linux/livepatch.h>
#include <linux/audit.h>
#include <linux/tick.h>
@@ -58,7 +60,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Handle ptrace */
if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
- ret = arch_syscall_enter_tracehook(regs);
+ ret = ptrace_report_syscall_entry(regs);
if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
return -1L;
}
@@ -138,15 +140,7 @@ void noinstr exit_to_user_mode(void)
}
/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
-
-static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
-{
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
-
- arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
-}
+void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
@@ -169,10 +163,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
klp_update_patch_state(current);
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
- handle_signal_work(regs, ti_work);
+ arch_do_signal_or_restart(regs);
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(regs);
+ resume_user_mode_work(regs);
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
@@ -252,7 +246,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
step = report_single_step(work);
if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
- arch_syscall_exit_tracehook(regs, step);
+ ptrace_report_syscall_exit(regs, step);
}
/*
@@ -380,7 +374,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
return ret;
}
-void irqentry_exit_cond_resched(void)
+void raw_irqentry_exit_cond_resched(void)
{
if (!preempt_count()) {
/* Sanity check RCU and thread stack */
@@ -392,7 +386,17 @@ void irqentry_exit_cond_resched(void)
}
}
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+DEFINE_STATIC_CALL(irqentry_exit_cond_resched, raw_irqentry_exit_cond_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+DEFINE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
+void dynamic_irqentry_exit_cond_resched(void)
+{
+ if (!static_key_unlikely(&sk_dynamic_irqentry_exit_cond_resched))
+ return;
+ raw_irqentry_exit_cond_resched();
+}
+#endif
#endif
noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
@@ -420,13 +424,9 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
}
instrumentation_begin();
- if (IS_ENABLED(CONFIG_PREEMPTION)) {
-#ifdef CONFIG_PREEMPT_DYNAMIC
- static_call(irqentry_exit_cond_resched)();
-#else
+ if (IS_ENABLED(CONFIG_PREEMPTION))
irqentry_exit_cond_resched();
-#endif
- }
+
/* Covers both tracing and lockdep */
trace_hardirqs_on();
instrumentation_end();
diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c
index 96d476e06c77..9d09f489b60e 100644
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -8,8 +8,11 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
do {
int ret;
- if (ti_work & _TIF_NOTIFY_SIGNAL)
- tracehook_notify_signal();
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) {
+ clear_notify_signal();
+ if (task_work_pending(current))
+ task_work_run();
+ }
if (ti_work & _TIF_SIGPENDING) {
kvm_handle_signal_exit(vcpu);
@@ -20,7 +23,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
schedule();
if (ti_work & _TIF_NOTIFY_RESUME)
- tracehook_notify_resume(NULL);
+ resume_user_mode_work(NULL);
ret = arch_xfer_to_guest_mode_handle_work(vcpu, ti_work);
if (ret)
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 58cbe357fb2b..1273be84392c 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -209,17 +209,13 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
}
if (regs) {
- mm_segment_t fs;
-
if (crosstask)
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
- fs = force_uaccess_begin();
perf_callchain_user(&ctx, regs);
- force_uaccess_end(fs);
}
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fc18664f49b0..cfde994ce61c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
WRITE_ONCE(event->state, state);
}
+/*
+ * UP store-release, load-acquire
+ */
+
+#define __store_release(ptr, val) \
+do { \
+ barrier(); \
+ WRITE_ONCE(*(ptr), (val)); \
+} while (0)
+
+#define __load_acquire(ptr) \
+({ \
+ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \
+ barrier(); \
+ ___p; \
+})
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
return t->time;
}
-static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
- struct perf_cgroup_info *info;
- u64 now;
-
- now = perf_clock();
+ struct perf_cgroup_info *t;
- info = this_cpu_ptr(cgrp->info);
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ if (!__load_acquire(&t->active))
+ return t->time;
+ now += READ_ONCE(t->timeoffset);
+ return now;
+}
- info->time += now - info->timestamp;
+static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+ if (adv)
+ info->time += now - info->timestamp;
info->timestamp = now;
+ /*
+ * see update_context_time()
+ */
+ WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
struct perf_cgroup *cgrp = cpuctx->cgrp;
struct cgroup_subsys_state *css;
+ struct perf_cgroup_info *info;
if (cgrp) {
+ u64 now = perf_clock();
+
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
- __update_cgrp_time(cgrp);
+ info = this_cpu_ptr(cgrp->info);
+
+ __update_cgrp_time(info, now, true);
+ if (final)
+ __store_release(&info->active, 0);
}
}
}
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
+ struct perf_cgroup_info *info;
struct perf_cgroup *cgrp;
/*
@@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
/*
* Do not update time when cgroup is not active
*/
- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
- __update_cgrp_time(event->cgrp);
+ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
+ info = this_cpu_ptr(event->cgrp->info);
+ __update_cgrp_time(info, perf_clock(), true);
+ }
}
static inline void
@@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
for (css = &cgrp->css; css; css = css->parent) {
cgrp = container_of(css, struct perf_cgroup, css);
info = this_cpu_ptr(cgrp->info);
- info->timestamp = ctx->timestamp;
+ __update_cgrp_time(info, ctx->timestamp, false);
+ __store_release(&info->active, 1);
}
}
@@ -802,7 +839,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
*/
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx, *tmp;
struct list_head *list;
unsigned long flags;
@@ -813,7 +850,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
local_irq_save(flags);
list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+ list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
@@ -982,14 +1019,6 @@ out:
}
static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
-{
- struct perf_cgroup_info *t;
- t = per_cpu_ptr(event->cgrp->info, event->cpu);
- event->shadow_ctx_time = now - t->timestamp;
-}
-
-static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_cpu_context *cpuctx;
@@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
+ bool final)
{
}
@@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
{
}
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
+ return 0;
}
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
return 0;
}
@@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx)
/*
* Update the record of the current time in a context.
*/
-static void update_context_time(struct perf_event_context *ctx)
+static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
u64 now = perf_clock();
- ctx->time += now - ctx->timestamp;
+ if (adv)
+ ctx->time += now - ctx->timestamp;
ctx->timestamp = now;
+
+ /*
+ * The above: time' = time + (now - timestamp), can be re-arranged
+ * into: time` = now + (time - timestamp), which gives a single value
+ * offset to compute future time without locks on.
+ *
+ * See perf_event_time_now(), which can be used from NMI context where
+ * it's (obviously) not possible to acquire ctx->lock in order to read
+ * both the above values in a consistent manner.
+ */
+ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+}
+
+static void update_context_time(struct perf_event_context *ctx)
+{
+ __update_context_time(ctx, true);
}
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+ if (unlikely(!ctx))
+ return 0;
+
if (is_cgroup_event(event))
return perf_cgroup_event_time(event);
- return ctx ? ctx->time : 0;
+ return ctx->time;
+}
+
+static u64 perf_event_time_now(struct perf_event *event, u64 now)
+{
+ struct perf_event_context *ctx = event->ctx;
+
+ if (unlikely(!ctx))
+ return 0;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time_now(event, now);
+
+ if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+ return ctx->time;
+
+ now += READ_ONCE(ctx->timeoffset);
+ return now;
}
static enum event_type_t get_event_type(struct perf_event *event)
@@ -2350,7 +2417,7 @@ __perf_remove_from_context(struct perf_event *event,
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, false);
}
event_sched_out(event, cpuctx, ctx);
@@ -2361,6 +2428,9 @@ __perf_remove_from_context(struct perf_event *event,
list_del_event(event, ctx);
if (!ctx->nr_events && ctx->is_active) {
+ if (ctx == &cpuctx->ctx)
+ update_cgrp_time_from_cpuctx(cpuctx, true);
+
ctx->is_active = 0;
ctx->rotate_necessary = 0;
if (ctx->task) {
@@ -2392,7 +2462,11 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
* event_function_call() user.
*/
raw_spin_lock_irq(&ctx->lock);
- if (!ctx->is_active) {
+ /*
+ * Cgroup events are per-cpu events, and must IPI because of
+ * cgrp_cpuctx_list.
+ */
+ if (!ctx->is_active && !is_cgroup_event(event)) {
__perf_remove_from_context(event, __get_cpu_context(ctx),
ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
@@ -2482,40 +2556,6 @@ void perf_event_disable_inatomic(struct perf_event *event)
irq_work_queue(&event->pending);
}
-static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- /*
- * use the correct time source for the time snapshot
- *
- * We could get by without this by leveraging the
- * fact that to get to this function, the caller
- * has most likely already called update_context_time()
- * and update_cgrp_time_xx() and thus both timestamp
- * are identical (or very close). Given that tstamp is,
- * already adjusted for cgroup, we could say that:
- * tstamp - ctx->timestamp
- * is equivalent to
- * tstamp - cgrp->timestamp.
- *
- * Then, in perf_output_read(), the calculation would
- * work with no changes because:
- * - event is guaranteed scheduled in
- * - no scheduled out in between
- * - thus the timestamp would be the same
- *
- * But this is a bit hairy.
- *
- * So instead, we have an explicit cgroup call to remain
- * within the time source all along. We believe it
- * is cleaner and simpler to understand.
- */
- if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, event->tstamp);
- else
- event->shadow_ctx_time = event->tstamp - ctx->timestamp;
-}
-
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
@@ -2556,8 +2596,6 @@ event_sched_in(struct perf_event *event,
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx);
-
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
@@ -2861,11 +2899,14 @@ perf_install_in_context(struct perf_event_context *ctx,
* perf_event_attr::disabled events will not run and can be initialized
* without IPI. Except when this is the first event for the context, in
* that case we need the magic of the IPI to set ctx->is_active.
+ * Similarly, cgroup events for the context also needs the IPI to
+ * manipulate the cgrp_cpuctx_list.
*
* The IOC_ENABLE that is sure to follow the creation of a disabled
* event will issue the IPI and reprogram the hardware.
*/
- if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+ if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
+ ctx->nr_events && !is_cgroup_event(event)) {
raw_spin_lock_irq(&ctx->lock);
if (ctx->task == TASK_TOMBSTONE) {
raw_spin_unlock_irq(&ctx->lock);
@@ -3197,6 +3238,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
return err;
}
+/*
+ * Copy event-type-independent attributes that may be modified.
+ */
+static void perf_event_modify_copy_attr(struct perf_event_attr *to,
+ const struct perf_event_attr *from)
+{
+ to->sig_data = from->sig_data;
+}
+
static int perf_event_modify_attr(struct perf_event *event,
struct perf_event_attr *attr)
{
@@ -3219,10 +3269,17 @@ static int perf_event_modify_attr(struct perf_event *event,
WARN_ON_ONCE(event->ctx->parent_ctx);
mutex_lock(&event->child_mutex);
+ /*
+ * Event-type-independent attributes must be copied before event-type
+ * modification, which will validate that final attributes match the
+ * source attributes after all relevant attributes have been copied.
+ */
+ perf_event_modify_copy_attr(&event->attr, attr);
err = func(event, attr);
if (err)
goto out;
list_for_each_entry(child, &event->child_list, child_list) {
+ perf_event_modify_copy_attr(&child->attr, attr);
err = func(child, attr);
if (err)
goto out;
@@ -3251,16 +3308,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
return;
}
- ctx->is_active &= ~event_type;
- if (!(ctx->is_active & EVENT_ALL))
- ctx->is_active = 0;
-
- if (ctx->task) {
- WARN_ON_ONCE(cpuctx->task_ctx != ctx);
- if (!ctx->is_active)
- cpuctx->task_ctx = NULL;
- }
-
/*
* Always update time if it was set; not only when it changes.
* Otherwise we can 'forget' to update time for any but the last
@@ -3274,7 +3321,22 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
+ update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
+ ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!ctx->is_active)
+ cpuctx->task_ctx = NULL;
}
is_active ^= ctx->is_active; /* changed bits */
@@ -3711,13 +3773,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+/*
+ * Because the userpage is strictly per-event (there is no concept of context,
+ * so there cannot be a context indirection), every userpage must be updated
+ * when context time starts :-(
+ *
+ * IOW, we must not miss EVENT_TIME edges.
+ */
static inline bool event_update_userpage(struct perf_event *event)
{
if (likely(!atomic_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
- perf_set_shadow_time(event, event->ctx);
perf_event_update_userpage(event);
return true;
@@ -3801,13 +3869,23 @@ ctx_sched_in(struct perf_event_context *ctx,
struct task_struct *task)
{
int is_active = ctx->is_active;
- u64 now;
lockdep_assert_held(&ctx->lock);
if (likely(!ctx->nr_events))
return;
+ if (is_active ^ EVENT_TIME) {
+ /* start ctx time */
+ __update_context_time(ctx, false);
+ perf_cgroup_set_timestamp(task, ctx);
+ /*
+ * CPU-release for the below ->is_active store,
+ * see __load_acquire() in perf_event_time_now()
+ */
+ barrier();
+ }
+
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
@@ -3818,13 +3896,6 @@ ctx_sched_in(struct perf_event_context *ctx,
is_active ^= ctx->is_active; /* changed bits */
- if (is_active & EVENT_TIME) {
- /* start ctx time */
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
- }
-
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -4418,6 +4489,18 @@ static inline u64 perf_event_count(struct perf_event *event)
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
+static void calc_timer_values(struct perf_event *event,
+ u64 *now,
+ u64 *enabled,
+ u64 *running)
+{
+ u64 ctx_time;
+
+ *now = perf_clock();
+ ctx_time = perf_event_time_now(event, *now);
+ __perf_update_times(event, ctx_time, enabled, running);
+}
+
/*
* NMI-safe method to read a local event, that is an event that
* is:
@@ -4477,10 +4560,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
*value = local64_read(&event->count);
if (enabled || running) {
- u64 now = event->shadow_ctx_time + perf_clock();
- u64 __enabled, __running;
+ u64 __enabled, __running, __now;;
- __perf_update_times(event, now, &__enabled, &__running);
+ calc_timer_values(event, &__now, &__enabled, &__running);
if (enabled)
*enabled = __enabled;
if (running)
@@ -5802,18 +5884,6 @@ static int perf_event_index(struct perf_event *event)
return event->pmu->event_idx(event);
}
-static void calc_timer_values(struct perf_event *event,
- u64 *now,
- u64 *enabled,
- u64 *running)
-{
- u64 ctx_time;
-
- *now = perf_clock();
- ctx_time = event->shadow_ctx_time + *now;
- __perf_update_times(event, ctx_time, enabled, running);
-}
-
static void perf_event_init_userpage(struct perf_event *event)
{
struct perf_event_mmap_page *userpg;
@@ -5938,6 +6008,8 @@ static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *old_rb = NULL;
unsigned long flags;
+ WARN_ON_ONCE(event->parent);
+
if (event->rb) {
/*
* Should be impossible, we set this when removing
@@ -5995,6 +6067,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -6008,6 +6083,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
struct perf_buffer *rb;
+ if (event->parent)
+ event = event->parent;
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
if (rb) {
@@ -6353,7 +6431,6 @@ accounting:
ring_buffer_attach(event, rb);
perf_event_update_time(event);
- perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
@@ -6669,7 +6746,6 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
unsigned long sp;
unsigned int rem;
u64 dyn_size;
- mm_segment_t fs;
/*
* We dump:
@@ -6687,9 +6763,7 @@ perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
/* Data. */
sp = perf_user_stack_pointer(regs);
- fs = force_uaccess_begin();
rem = __output_copy_user(handle, (void *) sp, dump_size);
- force_uaccess_end(fs);
dyn_size = dump_size - rem;
perf_output_skip(handle, rem);
@@ -6717,7 +6791,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event,
if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
goto out;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
goto out;
@@ -6783,7 +6857,7 @@ static void perf_aux_sample_output(struct perf_event *event,
if (WARN_ON_ONCE(!sampler || !data->aux_size))
return;
- rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+ rb = ring_buffer_get(sampler);
if (!rb)
return;
@@ -10454,8 +10528,6 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
*/
if (state == IF_STATE_END) {
ret = -EINVAL;
- if (kernel && event->attr.exclude_kernel)
- goto fail;
/*
* ACTION "filter" must have a non-zero length region
@@ -10497,8 +10569,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
/* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
+ kernel = 0;
}
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6357c3580d07..6418083901d4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -155,11 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
struct page *old_page, struct page *new_page)
{
struct mm_struct *mm = vma->vm_mm;
- struct page_vma_mapped_walk pvmw = {
- .page = compound_head(old_page),
- .vma = vma,
- .address = addr,
- };
+ DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0);
int err;
struct mmu_notifier_range range;
@@ -173,7 +169,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
return err;
}
- /* For try_to_free_swap() and munlock_vma_page() below */
+ /* For try_to_free_swap() below */
lock_page(old_page);
mmu_notifier_invalidate_range_start(&range);
@@ -201,13 +197,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
set_pte_at_notify(mm, addr, pvmw.pte,
mk_pte(new_page, vma->vm_page_prot));
- page_remove_rmap(old_page, false);
+ page_remove_rmap(old_page, vma, false);
if (!page_mapped(old_page))
try_to_free_swap(old_page);
page_vma_mapped_walk_done(&pvmw);
-
- if ((vma->vm_flags & VM_LOCKED) && !PageCompound(old_page))
- munlock_vma_page(old_page);
put_page(old_page);
err = 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index b00a25bb4ab9..f072959fcab7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -49,7 +49,8 @@
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
-#include <linux/tracehook.h>
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
@@ -64,6 +65,7 @@
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
+#include <linux/rethook.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -169,6 +171,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
kprobe_flush_task(tsk);
+ rethook_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
@@ -735,21 +738,7 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
- WARN_ON(blk_needs_flush_plug(tsk));
-
- /*
- * If do_dead is called because this processes oopsed, it's possible
- * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
- * continuing. Amongst other possible reasons, this is to prevent
- * mm_release()->clear_child_tid() from writing to a user-controlled
- * kernel address.
- *
- * On uptodate architectures force_uaccess_begin is a noop. On
- * architectures that still have set_fs/get_fs in addition to handling
- * oopses handles kernel threads that run as set_fs(KERNEL_DS) by
- * default.
- */
- force_uaccess_begin();
+ WARN_ON(tsk->plug);
kcov_task_exit(tsk);
@@ -845,6 +834,7 @@ void __noreturn do_exit(long code)
put_page(tsk->task_frag.page);
validate_creds_for_do_exit(tsk);
+ exit_task_stack_account(tsk);
check_stack_usage();
preempt_disable();
@@ -906,7 +896,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-void
+void __noreturn
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
diff --git a/kernel/extable.c b/kernel/extable.c
index b6f330f0fe74..bda5e9761541 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -3,6 +3,7 @@
Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
*/
+#include <linux/elf.h>
#include <linux/ftrace.h>
#include <linux/memory.h>
#include <linux/extable.h>
@@ -132,12 +133,33 @@ out:
}
/*
- * On some architectures (PPC64, IA64) function pointers
+ * On some architectures (PPC64, IA64, PARISC) function pointers
* are actually only tokens to some data that then holds the
* real function address. As a result, to find if a function
* pointer is part of the kernel text, we need to do some
* special dereferencing first.
*/
+#ifdef CONFIG_HAVE_FUNCTION_DESCRIPTORS
+void *dereference_function_descriptor(void *ptr)
+{
+ func_desc_t *desc = ptr;
+ void *p;
+
+ if (!get_kernel_nofault(p, (void *)&desc->addr))
+ ptr = p;
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(dereference_function_descriptor);
+
+void *dereference_kernel_function_descriptor(void *ptr)
+{
+ if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd)
+ return ptr;
+
+ return dereference_function_descriptor(ptr);
+}
+#endif
+
int func_ptr_is_kernel_text(void *ptr)
{
unsigned long addr;
diff --git a/kernel/fork.c b/kernel/fork.c
index d75a528f7b21..9796897560ab 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,6 +97,7 @@
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
+#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -185,7 +186,7 @@ static inline void free_task_struct(struct task_struct *tsk)
*/
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
-#ifdef CONFIG_VMAP_STACK
+# ifdef CONFIG_VMAP_STACK
/*
* vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
* flush. Try to minimize the number of calls by caching stacks.
@@ -193,6 +194,41 @@ static inline void free_task_struct(struct task_struct *tsk)
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
+struct vm_stack {
+ struct rcu_head rcu;
+ struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+ unsigned int i;
+
+ for (i = 0; i < NR_CACHED_STACKS; i++) {
+ if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+ if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+ return;
+
+ vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct vm_stack *vm_stack = tsk->stack;
+
+ vm_stack->stack_vm_area = tsk->stack_vm_area;
+ call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
static int free_vm_stack_cache(unsigned int cpu)
{
struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -210,11 +246,35 @@ static int free_vm_stack_cache(unsigned int cpu)
return 0;
}
-#endif
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
-#ifdef CONFIG_VMAP_STACK
+ int i;
+ int ret;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+ if (ret)
+ goto err;
+ }
+ return 0;
+err:
+ /*
+ * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
+ * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
+ * ignore this page.
+ */
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
+ return ret;
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ struct vm_struct *vm;
void *stack;
int i;
@@ -226,15 +286,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
if (!s)
continue;
- /* Mark stack accessible for KASAN. */
+ /* Reset stack metadata. */
kasan_unpoison_range(s->addr, THREAD_SIZE);
+ stack = kasan_reset_tag(s->addr);
+
/* Clear stale pointers from reused stack. */
- memset(s->addr, 0, THREAD_SIZE);
+ memset(stack, 0, THREAD_SIZE);
+
+ if (memcg_charge_kernel_stack(s)) {
+ vfree(s->addr);
+ return -ENOMEM;
+ }
tsk->stack_vm_area = s;
- tsk->stack = s->addr;
- return s->addr;
+ tsk->stack = stack;
+ return 0;
}
/*
@@ -247,71 +314,96 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
THREADINFO_GFP & ~__GFP_ACCOUNT,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
+ if (!stack)
+ return -ENOMEM;
+ vm = find_vm_area(stack);
+ if (memcg_charge_kernel_stack(vm)) {
+ vfree(stack);
+ return -ENOMEM;
+ }
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
- if (stack) {
- tsk->stack_vm_area = find_vm_area(stack);
- tsk->stack = stack;
- }
- return stack;
-#else
+ tsk->stack_vm_area = vm;
+ stack = kasan_reset_tag(stack);
+ tsk->stack = stack;
+ return 0;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+ thread_stack_delayed_free(tsk);
+
+ tsk->stack = NULL;
+ tsk->stack_vm_area = NULL;
+}
+
+# else /* !CONFIG_VMAP_STACK */
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
+
+ call_rcu(rh, thread_stack_free_rcu);
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
tsk->stack = kasan_reset_tag(page_address(page));
- return tsk->stack;
+ return 0;
}
- return NULL;
-#endif
+ return -ENOMEM;
}
-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
- int i;
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
+}
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
- memcg_kmem_uncharge_page(vm->pages[i], 0);
+# endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
- for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_cmpxchg(cached_stacks[i],
- NULL, tsk->stack_vm_area) != NULL)
- continue;
+static struct kmem_cache *thread_stack_cache;
- return;
- }
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+ kmem_cache_free(thread_stack_cache, rh);
+}
- vfree_atomic(tsk->stack);
- return;
- }
-#endif
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+ struct rcu_head *rh = tsk->stack;
- __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+ call_rcu(rh, thread_stack_free_rcu);
}
-# else
-static struct kmem_cache *thread_stack_cache;
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
- int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
stack = kasan_reset_tag(stack);
tsk->stack = stack;
- return stack;
+ return stack ? 0 : -ENOMEM;
}
static void free_thread_stack(struct task_struct *tsk)
{
- kmem_cache_free(thread_stack_cache, tsk->stack);
+ thread_stack_delayed_free(tsk);
+ tsk->stack = NULL;
}
void thread_stack_cache_init(void)
@@ -321,8 +413,26 @@ void thread_stack_cache_init(void)
THREAD_SIZE, NULL);
BUG_ON(thread_stack_cache == NULL);
}
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+ unsigned long *stack;
+
+ stack = arch_alloc_thread_stack_node(tsk, node);
+ tsk->stack = stack;
+ return stack ? 0 : -ENOMEM;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+ arch_free_thread_stack(tsk);
+ tsk->stack = NULL;
+}
+
+#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;
@@ -366,63 +476,47 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
- dup_vma_anon_name(orig, new);
+ dup_anon_vma_name(orig, new);
}
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
- free_vma_anon_name(vma);
+ free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
static void account_kernel_stack(struct task_struct *tsk, int account)
{
- void *stack = task_stack_page(tsk);
- struct vm_struct *vm = task_stack_vm_area(tsk);
-
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm = task_stack_vm_area(tsk);
int i;
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
account * (PAGE_SIZE / 1024));
} else {
+ void *stack = task_stack_page(tsk);
+
/* All stack pages are in the same node. */
mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
account * (THREAD_SIZE / 1024));
}
}
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+void exit_task_stack_account(struct task_struct *tsk)
{
-#ifdef CONFIG_VMAP_STACK
- struct vm_struct *vm = task_stack_vm_area(tsk);
- int ret;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+ account_kernel_stack(tsk, -1);
- if (vm) {
+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+ struct vm_struct *vm;
int i;
- BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
- for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
- /*
- * If memcg_kmem_charge_page() fails, page's
- * memory cgroup pointer is NULL, and
- * memcg_kmem_uncharge_page() in free_thread_stack()
- * will ignore this page.
- */
- ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
- 0);
- if (ret)
- return ret;
- }
+ vm = task_stack_vm_area(tsk);
+ for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+ memcg_kmem_uncharge_page(vm->pages[i], 0);
}
-#endif
- return 0;
}
static void release_task_stack(struct task_struct *tsk)
@@ -430,12 +524,7 @@ static void release_task_stack(struct task_struct *tsk)
if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
return; /* Better to leak the stack than to free prematurely */
- account_kernel_stack(tsk, -1);
free_thread_stack(tsk);
- tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = NULL;
-#endif
}
#ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -874,8 +963,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- unsigned long *stack;
- struct vm_struct *stack_vm_area __maybe_unused;
int err;
if (node == NUMA_NO_NODE)
@@ -884,32 +971,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
if (!tsk)
return NULL;
- stack = alloc_thread_stack_node(tsk, node);
- if (!stack)
+ err = arch_dup_task_struct(tsk, orig);
+ if (err)
goto free_tsk;
- if (memcg_charge_kernel_stack(tsk))
- goto free_stack;
-
- stack_vm_area = task_stack_vm_area(tsk);
-
- err = arch_dup_task_struct(tsk, orig);
+ err = alloc_thread_stack_node(tsk, node);
+ if (err)
+ goto free_tsk;
- /*
- * arch_dup_task_struct() clobbers the stack-related fields. Make
- * sure they're properly initialized before using any stack-related
- * functions again.
- */
- tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
- tsk->stack_vm_area = stack_vm_area;
-#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
refcount_set(&tsk->stack_refcount, 1);
#endif
-
- if (err)
- goto free_stack;
+ account_kernel_stack(tsk, 1);
err = scs_prepare(tsk, node);
if (err)
@@ -953,8 +1026,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->wake_q.next = NULL;
tsk->worker_private = NULL;
- account_kernel_stack(tsk, 1);
-
kcov_task_init(tsk);
kmap_local_fork(tsk);
@@ -967,12 +1038,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->use_memdelay = 0;
#endif
+#ifdef CONFIG_IOMMU_SVA
+ tsk->pasid_activated = 0;
+#endif
+
#ifdef CONFIG_MEMCG
tsk->active_memcg = NULL;
#endif
return tsk;
free_stack:
+ exit_task_stack_account(tsk);
free_thread_stack(tsk);
free_tsk:
free_task_struct(tsk);
@@ -1019,13 +1095,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
-static void mm_init_pasid(struct mm_struct *mm)
-{
-#ifdef CONFIG_IOMMU_SUPPORT
- mm->pasid = INIT_PASID;
-#endif
-}
-
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@@ -1054,7 +1123,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
- mm_init_pasid(mm);
+ mm_pasid_init(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
@@ -1121,6 +1190,7 @@ static inline void __mmput(struct mm_struct *mm)
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ mm_pasid_drop(mm);
mmdrop(mm);
}
@@ -2021,18 +2091,18 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
+ retval = copy_creds(p, clone_flags);
+ if (retval < 0)
+ goto bad_fork_free;
+
retval = -EAGAIN;
if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
- goto bad_fork_free;
+ goto bad_fork_cleanup_count;
}
current->flags &= ~PF_NPROC_EXCEEDED;
- retval = copy_creds(p, clone_flags);
- if (retval < 0)
- goto bad_fork_free;
-
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
@@ -2255,6 +2325,9 @@ static __latent_entropy struct task_struct *copy_process(
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
#endif
+#ifdef CONFIG_RETHOOK
+ p->rethooks.first = NULL;
+#endif
/*
* Ensure that the cgroup subsystem policies allow the new process to be
@@ -2267,6 +2340,17 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_put_pidfd;
/*
+ * Now that the cgroups are pinned, re-clone the parent cgroup and put
+ * the new task on the correct runqueue. All this *before* the task
+ * becomes visible.
+ *
+ * This isn't part of ->can_fork() because while the re-cloning is
+ * cgroup specific, it unconditionally needs to place the task on a
+ * runqueue.
+ */
+ sched_cgroup_fork(p, args);
+
+ /*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
* not want user-space to be able to predict the process start-time by
@@ -2323,10 +2407,6 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
- /* past the last point of failure */
- if (pidfile)
- fd_install(pidfd, pidfile);
-
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2375,8 +2455,11 @@ static __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+ if (pidfile)
+ fd_install(pidfd, pidfile);
+
proc_fork_connector(p);
- sched_post_fork(p, args);
+ sched_post_fork(p);
cgroup_post_fork(p, args);
perf_event_fork(p);
@@ -2441,6 +2524,7 @@ bad_fork_cleanup_count:
exit_creds(p);
bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD);
+ exit_task_stack_account(p);
put_task_stack(p);
delayed_free_task(p);
fork_out:
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 51dd822a8060..b22ef1efe751 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -302,7 +302,7 @@ again:
* found it, but truncated or holepunched or subjected to
* invalidate_complete_page2 before we got the page lock (also
* cases which we are happy to fail). And we hold a reference,
- * so refcount care in invalidate_complete_page's remove_mapping
+ * so refcount care in invalidate_inode_page's remove_mapping
* prevents drop_caches from setting mapping to NULL beneath us.
*
* The case we do have to guard against is when memory pressure made
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c09324663088..54af0deb239b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -38,7 +38,7 @@ struct irqaction chained_action = {
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
-int irq_set_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, const struct irq_chip *chip)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
@@ -46,10 +46,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip)
if (!desc)
return -EINVAL;
- if (!chip)
- chip = &no_irq_chip;
-
- desc->irq_data.chip = chip;
+ desc->irq_data.chip = (struct irq_chip *)(chip ?: &no_irq_chip);
irq_put_desc_unlock(desc, flags);
/*
* For !CONFIG_SPARSE_IRQ make the irq show up in
@@ -1073,7 +1070,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
EXPORT_SYMBOL_GPL(irq_set_chained_handler_and_data);
void
-irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, const struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
irq_set_chip(irq, chip);
@@ -1558,6 +1555,14 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
return 0;
}
+static struct device *irq_get_parent_device(struct irq_data *data)
+{
+ if (data->domain)
+ return data->domain->dev;
+
+ return NULL;
+}
+
/**
* irq_chip_pm_get - Enable power for an IRQ chip
* @data: Pointer to interrupt specific data
@@ -1567,12 +1572,13 @@ int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
*/
int irq_chip_pm_get(struct irq_data *data)
{
+ struct device *dev = irq_get_parent_device(data);
int retval;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device) {
- retval = pm_runtime_get_sync(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev) {
+ retval = pm_runtime_get_sync(dev);
if (retval < 0) {
- pm_runtime_put_noidle(data->chip->parent_device);
+ pm_runtime_put_noidle(dev);
return retval;
}
}
@@ -1590,10 +1596,11 @@ int irq_chip_pm_get(struct irq_data *data)
*/
int irq_chip_pm_put(struct irq_data *data)
{
+ struct device *dev = irq_get_parent_device(data);
int retval = 0;
- if (IS_ENABLED(CONFIG_PM) && data->chip->parent_device)
- retval = pm_runtime_put(data->chip->parent_device);
+ if (IS_ENABLED(CONFIG_PM) && dev)
+ retval = pm_runtime_put(dev);
return (retval < 0) ? retval : 0;
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 39a41c56ad4f..1ed2b1739363 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -176,10 +176,10 @@ static bool hk_should_isolate(struct irq_data *data, unsigned int cpu)
{
const struct cpumask *hk_mask;
- if (!housekeeping_enabled(HK_FLAG_MANAGED_IRQ))
+ if (!housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
return false;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
if (cpumask_subset(irq_data_get_effective_affinity_mask(data), hk_mask))
return false;
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index e4cff358b437..2b43f5f5033d 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -69,8 +69,12 @@ irq_debug_show_chip(struct seq_file *m, struct irq_data *data, int ind)
seq_printf(m, "chip: None\n");
return;
}
- seq_printf(m, "%*schip: %s\n", ind, "", chip->name);
- seq_printf(m, "%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
+ seq_printf(m, "%*schip: ", ind, "");
+ if (chip->irq_print_chip)
+ chip->irq_print_chip(data, m);
+ else
+ seq_printf(m, "%s", chip->name);
+ seq_printf(m, "\n%*sflags: 0x%lx\n", ind + 1, "", chip->flags);
irq_debug_show_bits(m, ind, chip->flags, irqchip_flags,
ARRAY_SIZE(irqchip_flags));
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2267e6527db3..939d21cd55c3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -640,7 +640,7 @@ int handle_irq_desc(struct irq_desc *desc)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- if (WARN_ON_ONCE(!in_irq() && handle_enforce_irqctx(data)))
+ if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data)))
return -EPERM;
generic_handle_irq_desc(desc);
@@ -662,6 +662,29 @@ int generic_handle_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+/**
+ * generic_handle_irq_safe - Invoke the handler for a particular irq from any
+ * context.
+ * @irq: The irq number to handle
+ *
+ * Returns: 0 on success, a negative value on error.
+ *
+ * This function can be called from any context (IRQ or process context). It
+ * will report an error if not invoked from IRQ context and the irq has been
+ * marked to enforce IRQ-context only.
+ */
+int generic_handle_irq_safe(unsigned int irq)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = handle_irq_desc(irq_to_desc(irq));
+ local_irq_restore(flags);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq_safe);
+
#ifdef CONFIG_IRQ_DOMAIN
/**
* generic_handle_domain_irq - Invoke the handler for a HW irq belonging
@@ -676,7 +699,7 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
- WARN_ON_ONCE(!in_irq());
+ WARN_ON_ONCE(!in_hardirq());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index bf38c546aa25..d5ce96510549 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1319,7 +1319,8 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @chip_data: The associated chip data
*/
int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq,
+ const struct irq_chip *chip,
void *chip_data)
{
struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
@@ -1328,7 +1329,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return -ENOENT;
irq_data->hwirq = hwirq;
- irq_data->chip = chip ? chip : &no_irq_chip;
+ irq_data->chip = (struct irq_chip *)(chip ? chip : &no_irq_chip);
irq_data->chip_data = chip_data;
return 0;
@@ -1347,7 +1348,7 @@ EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
@@ -1853,7 +1854,7 @@ EXPORT_SYMBOL_GPL(irq_domain_get_irq_data);
* @handler_name: The interrupt handler name
*/
void irq_domain_set_info(struct irq_domain *domain, unsigned int virq,
- irq_hw_number_t hwirq, struct irq_chip *chip,
+ irq_hw_number_t hwirq, const struct irq_chip *chip,
void *chip_data, irq_flow_handler_t handler,
void *handler_data, const char *handler_name)
{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f23ffd30385b..c03f71d5ec10 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -247,13 +247,13 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
* online.
*/
if (irqd_affinity_is_managed(data) &&
- housekeeping_enabled(HK_FLAG_MANAGED_IRQ)) {
+ housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
const struct cpumask *hk_mask, *prog_mask;
static DEFINE_RAW_SPINLOCK(tmp_mask_lock);
static struct cpumask tmp_mask;
- hk_mask = housekeeping_cpumask(HK_FLAG_MANAGED_IRQ);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);
raw_spin_lock(&tmp_mask_lock);
cpumask_and(&tmp_mask, mask, hk_mask);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 951c93216fc4..79f2eb617a62 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -212,6 +212,10 @@ unsigned long kallsyms_lookup_name(const char *name)
unsigned long i;
unsigned int off;
+ /* Skip the search for empty string. */
+ if (!*name)
+ return 0;
+
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 36ca640c4f8e..475524bd900a 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -459,37 +459,28 @@ void kcov_task_exit(struct task_struct *t)
static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
{
int res = 0;
- void *area;
struct kcov *kcov = vma->vm_file->private_data;
unsigned long size, off;
struct page *page;
unsigned long flags;
- area = vmalloc_user(vma->vm_end - vma->vm_start);
- if (!area)
- return -ENOMEM;
-
spin_lock_irqsave(&kcov->lock, flags);
size = kcov->size * sizeof(unsigned long);
- if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ if (kcov->area == NULL || vma->vm_pgoff != 0 ||
vma->vm_end - vma->vm_start != size) {
res = -EINVAL;
goto exit;
}
- if (!kcov->area) {
- kcov->area = area;
- vma->vm_flags |= VM_DONTEXPAND;
- spin_unlock_irqrestore(&kcov->lock, flags);
- for (off = 0; off < size; off += PAGE_SIZE) {
- page = vmalloc_to_page(kcov->area + off);
- if (vm_insert_page(vma, vma->vm_start + off, page))
- WARN_ONCE(1, "vm_insert_page() failed");
- }
- return 0;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vma->vm_flags |= VM_DONTEXPAND;
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ if (vm_insert_page(vma, vma->vm_start + off, page))
+ WARN_ONCE(1, "vm_insert_page() failed");
}
+ return 0;
exit:
spin_unlock_irqrestore(&kcov->lock, flags);
- vfree(area);
return res;
}
@@ -564,31 +555,12 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
unsigned long arg)
{
struct task_struct *t;
- unsigned long size, unused;
+ unsigned long flags, unused;
int mode, i;
struct kcov_remote_arg *remote_arg;
struct kcov_remote *remote;
- unsigned long flags;
switch (cmd) {
- case KCOV_INIT_TRACE:
- /*
- * Enable kcov in trace mode and setup buffer size.
- * Must happen before anything else.
- */
- if (kcov->mode != KCOV_MODE_DISABLED)
- return -EBUSY;
- /*
- * Size must be at least 2 to hold current position and one PC.
- * Later we allocate size * sizeof(unsigned long) memory,
- * that must not overflow.
- */
- size = arg;
- if (size < 2 || size > INT_MAX / sizeof(unsigned long))
- return -EINVAL;
- kcov->size = size;
- kcov->mode = KCOV_MODE_INIT;
- return 0;
case KCOV_ENABLE:
/*
* Enable coverage for the current task.
@@ -692,9 +664,37 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
struct kcov_remote_arg *remote_arg = NULL;
unsigned int remote_num_handles;
unsigned long remote_arg_size;
- unsigned long flags;
+ unsigned long size, flags;
+ void *area;
- if (cmd == KCOV_REMOTE_ENABLE) {
+ kcov = filep->private_data;
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ *
+ * First check the size argument - it must be at least 2
+ * to hold the current position and one PC.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ area = vmalloc_user(size * sizeof(unsigned long));
+ if (area == NULL)
+ return -ENOMEM;
+ spin_lock_irqsave(&kcov->lock, flags);
+ if (kcov->mode != KCOV_MODE_DISABLED) {
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ vfree(area);
+ return -EBUSY;
+ }
+ kcov->area = area;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ return 0;
+ case KCOV_REMOTE_ENABLE:
if (get_user(remote_num_handles, (unsigned __user *)(arg +
offsetof(struct kcov_remote_arg, num_handles))))
return -EFAULT;
@@ -710,16 +710,18 @@ static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
return -EINVAL;
}
arg = (unsigned long)remote_arg;
+ fallthrough;
+ default:
+ /*
+ * All other commands can be normally executed under a spin lock, so we
+ * obtain and release it here in order to simplify kcov_ioctl_locked().
+ */
+ spin_lock_irqsave(&kcov->lock, flags);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock_irqrestore(&kcov->lock, flags);
+ kfree(remote_arg);
+ return res;
}
-
- kcov = filep->private_data;
- spin_lock_irqsave(&kcov->lock, flags);
- res = kcov_ioctl_locked(kcov, cmd, arg);
- spin_unlock_irqrestore(&kcov->lock, flags);
-
- kfree(remote_arg);
-
- return res;
}
static const struct file_operations kcov_fops = {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 94cab8c9ce56..dbe57df2e199 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1237,6 +1237,27 @@ void kprobes_inc_nmissed_count(struct kprobe *p)
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
+static struct kprobe kprobe_busy = {
+ .addr = (void *) get_kprobe,
+};
+
+void kprobe_busy_begin(void)
+{
+ struct kprobe_ctlblk *kcb;
+
+ preempt_disable();
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+}
+
+void kprobe_busy_end(void)
+{
+ __this_cpu_write(current_kprobe, NULL);
+ preempt_enable();
+}
+
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
static void free_rp_inst_rcu(struct rcu_head *head)
{
struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu);
@@ -1258,26 +1279,6 @@ static void recycle_rp_inst(struct kretprobe_instance *ri)
}
NOKPROBE_SYMBOL(recycle_rp_inst);
-static struct kprobe kprobe_busy = {
- .addr = (void *) get_kprobe,
-};
-
-void kprobe_busy_begin(void)
-{
- struct kprobe_ctlblk *kcb;
-
- preempt_disable();
- __this_cpu_write(current_kprobe, &kprobe_busy);
- kcb = get_kprobe_ctlblk();
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
-}
-
-void kprobe_busy_end(void)
-{
- __this_cpu_write(current_kprobe, NULL);
- preempt_enable();
-}
-
/*
* This function is called from delayed_put_task_struct() when a task is
* dead and cleaned up to recycle any kretprobe instances associated with
@@ -1327,6 +1328,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
rp->rph = NULL;
}
}
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
/* Add the new probe to 'ap->list'. */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
@@ -1489,24 +1491,68 @@ bool within_kprobe_blacklist(unsigned long addr)
}
/*
+ * arch_adjust_kprobe_addr - adjust the address
+ * @addr: symbol base address
+ * @offset: offset within the symbol
+ * @on_func_entry: was this @addr+@offset on the function entry
+ *
+ * Typically returns @addr + @offset, except for special cases where the
+ * function might be prefixed by a CFI landing pad, in that case any offset
+ * inside the landing pad is mapped to the first 'real' instruction of the
+ * symbol.
+ *
+ * Specifically, for things like IBT/BTI, skip the resp. ENDBR/BTI.C
+ * instruction at +0.
+ */
+kprobe_opcode_t *__weak arch_adjust_kprobe_addr(unsigned long addr,
+ unsigned long offset,
+ bool *on_func_entry)
+{
+ *on_func_entry = !offset;
+ return (kprobe_opcode_t *)(addr + offset);
+}
+
+/*
* If 'symbol_name' is specified, look it up and add the 'offset'
* to it. This way, we can specify a relative address to a symbol.
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
- const char *symbol_name, unsigned int offset)
+static kprobe_opcode_t *
+_kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
+ unsigned long offset, bool *on_func_entry)
{
if ((symbol_name && addr) || (!symbol_name && !addr))
goto invalid;
if (symbol_name) {
+ /*
+ * Input: @sym + @offset
+ * Output: @addr + @offset
+ *
+ * NOTE: kprobe_lookup_name() does *NOT* fold the offset
+ * argument into it's output!
+ */
addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
- addr = (kprobe_opcode_t *)(((char *)addr) + offset);
+ /*
+ * So here we have @addr + @offset, displace it into a new
+ * @addr' + @offset' where @addr' is the symbol start address.
+ */
+ addr = (void *)addr + offset;
+ if (!kallsyms_lookup_size_offset((unsigned long)addr, NULL, &offset))
+ return ERR_PTR(-ENOENT);
+ addr = (void *)addr - offset;
+
+ /*
+ * Then ask the architecture to re-combine them, taking care of
+ * magical function entry details while telling us if this was indeed
+ * at the start of the function.
+ */
+ addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
if (addr)
return addr;
@@ -1516,7 +1562,8 @@ invalid:
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
- return _kprobe_addr(p->addr, p->symbol_name, p->offset);
+ bool on_func_entry;
+ return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
}
/*
@@ -1562,14 +1609,10 @@ static inline int warn_kprobe_rereg(struct kprobe *p)
static int check_ftrace_location(struct kprobe *p)
{
- unsigned long ftrace_addr;
+ unsigned long addr = (unsigned long)p->addr;
- ftrace_addr = ftrace_location((unsigned long)p->addr);
- if (ftrace_addr) {
+ if (ftrace_location(addr) == addr) {
#ifdef CONFIG_KPROBES_ON_FTRACE
- /* Given address is not on the instruction boundary */
- if ((unsigned long)p->addr != ftrace_addr)
- return -EILSEQ;
p->flags |= KPROBE_FLAG_FTRACE;
#else /* !CONFIG_KPROBES_ON_FTRACE */
return -EINVAL;
@@ -1884,6 +1927,7 @@ static struct notifier_block kprobe_exceptions_nb = {
#ifdef CONFIG_KRETPROBES
+#if !defined(CONFIG_KRETPROBE_ON_RETHOOK)
/* This assumes the 'tsk' is the current task or the is not running. */
static kprobe_opcode_t *__kretprobe_find_ret_addr(struct task_struct *tsk,
struct llist_node **cur)
@@ -2046,11 +2090,57 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
return 0;
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
+#else /* CONFIG_KRETPROBE_ON_RETHOOK */
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+ struct kretprobe_instance *ri;
+ struct rethook_node *rhn;
+
+ rhn = rethook_try_get(rp->rh);
+ if (!rhn) {
+ rp->nmissed++;
+ return 0;
+ }
+
+ ri = container_of(rhn, struct kretprobe_instance, node);
+
+ if (rp->entry_handler && rp->entry_handler(ri, regs))
+ rethook_recycle(rhn);
+ else
+ rethook_hook(rhn, regs, kprobe_ftrace(p));
+
+ return 0;
+}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
-bool __weak arch_kprobe_on_func_entry(unsigned long offset)
+static void kretprobe_rethook_handler(struct rethook_node *rh, void *data,
+ struct pt_regs *regs)
{
- return !offset;
+ struct kretprobe *rp = (struct kretprobe *)data;
+ struct kretprobe_instance *ri;
+ struct kprobe_ctlblk *kcb;
+
+ /* The data must NOT be null. This means rethook data structure is broken. */
+ if (WARN_ON_ONCE(!data))
+ return;
+
+ __this_cpu_write(current_kprobe, &rp->kp);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+
+ ri = container_of(rh, struct kretprobe_instance, node);
+ rp->handler(ri, regs);
+
+ __this_cpu_write(current_kprobe, NULL);
}
+NOKPROBE_SYMBOL(kretprobe_rethook_handler);
+
+#endif /* !CONFIG_KRETPROBE_ON_RETHOOK */
/**
* kprobe_on_func_entry() -- check whether given address is function entry
@@ -2067,15 +2157,13 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset)
*/
int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
- kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
+ bool on_func_entry;
+ kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset, &on_func_entry);
if (IS_ERR(kp_addr))
return PTR_ERR(kp_addr);
- if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset))
- return -ENOENT;
-
- if (!arch_kprobe_on_func_entry(offset))
+ if (!on_func_entry)
return -EINVAL;
return 0;
@@ -2121,6 +2209,29 @@ int register_kretprobe(struct kretprobe *rp)
rp->maxactive = num_possible_cpus();
#endif
}
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rp->rh = rethook_alloc((void *)rp, kretprobe_rethook_handler);
+ if (!rp->rh)
+ return -ENOMEM;
+
+ for (i = 0; i < rp->maxactive; i++) {
+ inst = kzalloc(sizeof(struct kretprobe_instance) +
+ rp->data_size, GFP_KERNEL);
+ if (inst == NULL) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
+ return -ENOMEM;
+ }
+ rethook_add_node(rp->rh, &inst->node);
+ }
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ ret = register_kprobe(&rp->kp);
+ if (ret != 0) {
+ rethook_free(rp->rh);
+ rp->rh = NULL;
+ }
+#else /* !CONFIG_KRETPROBE_ON_RETHOOK */
rp->freelist.head = NULL;
rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL);
if (!rp->rph)
@@ -2145,6 +2256,7 @@ int register_kretprobe(struct kretprobe *rp)
ret = register_kprobe(&rp->kp);
if (ret != 0)
free_rp_inst(rp);
+#endif
return ret;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
@@ -2183,7 +2295,11 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
for (i = 0; i < num; i++) {
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
+#ifdef CONFIG_KRETPROBE_ON_RETHOOK
+ rethook_free(rps[i]->rh);
+#else
rps[i]->rph->rp = NULL;
+#endif
}
mutex_unlock(&kprobe_mutex);
@@ -2191,7 +2307,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
for (i = 0; i < num; i++) {
if (rps[i]->kp.addr) {
__unregister_kprobe_bottom(&rps[i]->kp);
+#ifndef CONFIG_KRETPROBE_ON_RETHOOK
free_rp_inst(rps[i]);
+#endif
}
}
}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 35859da8bd4f..b1292a57c2a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,8 +24,7 @@
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KERNEL_ATTR_RW(_name) \
-static struct kobj_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
/* current uevent sequence number */
static ssize_t uevent_seqnum_show(struct kobject *kobj,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 38c6dd822da8..50265f69a135 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -55,7 +55,6 @@ struct kthread {
int result;
int (*threadfn)(void *);
void *data;
- mm_segment_t oldfs;
struct completion parked;
struct completion exited;
#ifdef CONFIG_BLK_CGROUP
@@ -356,7 +355,7 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
+ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -722,7 +721,7 @@ int kthreadd(void *unused)
/* Setup a clean context for our children to inherit. */
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
- set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_FLAG_KTHREAD));
+ set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
@@ -1441,8 +1440,6 @@ void kthread_use_mm(struct mm_struct *mm)
mmdrop(active_mm);
else
smp_mb();
-
- to_kthread(tsk)->oldfs = force_uaccess_begin();
}
EXPORT_SYMBOL_GPL(kthread_use_mm);
@@ -1457,8 +1454,6 @@ void kthread_unuse_mm(struct mm_struct *mm)
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(!tsk->mm);
- force_uaccess_end(to_kthread(tsk)->oldfs);
-
task_lock(tsk);
/*
* When a kthread stops operating on an address space, the loop
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 585494ec464f..bc475e62279d 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -190,7 +190,7 @@ static int klp_find_object_symbol(const char *objname, const char *name,
return -EINVAL;
}
-static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
+static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab,
unsigned int symndx, Elf_Shdr *relasec,
const char *sec_objname)
{
@@ -218,7 +218,7 @@ static int klp_resolve_symbols(Elf64_Shdr *sechdrs, const char *strtab,
relas = (Elf_Rela *) relasec->sh_addr;
/* For each rela in this klp relocation section */
for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) {
- sym = (Elf64_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
+ sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info);
if (sym->st_shndx != SHN_LIVEPATCH) {
pr_err("symbol %s is not marked as a livepatch symbol\n",
strtab + sym->st_name);
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index fe316c021d73..c172bf92b576 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -124,19 +124,6 @@ unlock:
ftrace_test_recursion_unlock(bit);
}
-/*
- * Convert a function address into the appropriate ftrace location.
- *
- * Usually this is just the address of the function, but on some architectures
- * it's more complicated so allow them to provide a custom behaviour.
- */
-#ifndef klp_get_ftrace_location
-static unsigned long klp_get_ftrace_location(unsigned long faddr)
-{
- return faddr;
-}
-#endif
-
static void klp_unpatch_func(struct klp_func *func)
{
struct klp_ops *ops;
@@ -153,8 +140,7 @@ static void klp_unpatch_func(struct klp_func *func)
if (list_is_singular(&ops->func_stack)) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (WARN_ON(!ftrace_loc))
return;
@@ -186,8 +172,7 @@ static int klp_patch_func(struct klp_func *func)
if (!ops) {
unsigned long ftrace_loc;
- ftrace_loc =
- klp_get_ftrace_location((unsigned long)func->old_func);
+ ftrace_loc = ftrace_location((unsigned long)func->old_func);
if (!ftrace_loc) {
pr_err("failed to find location for function '%s'\n",
func->old_name);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index 5683ac0d2566..5d03a2ad1066 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,7 +9,6 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
-#include <linux/tracehook.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
@@ -641,6 +640,13 @@ void klp_force_transition(void)
for_each_possible_cpu(cpu)
klp_update_patch_state(idle_task(cpu));
- klp_for_each_patch(patch)
- patch->forced = true;
+ /* Set forced flag for patches being removed. */
+ if (klp_target_state == KLP_UNPATCHED)
+ klp_transition_patch->forced = true;
+ else if (klp_transition_patch->replace) {
+ klp_for_each_patch(patch) {
+ if (patch != klp_transition_patch)
+ patch->forced = true;
+ }
+ }
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4a882f83aeb9..c06cab6546ed 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -183,11 +183,9 @@ static DECLARE_BITMAP(list_entries_in_use, MAX_LOCKDEP_ENTRIES);
static struct hlist_head lock_keys_hash[KEYHASH_SIZE];
unsigned long nr_lock_classes;
unsigned long nr_zapped_classes;
-#ifndef CONFIG_DEBUG_LOCKDEP
-static
-#endif
+unsigned long max_lock_class_idx;
struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
-static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
+DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS);
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
@@ -338,7 +336,7 @@ static inline void lock_release_holdtime(struct held_lock *hlock)
* elements. These elements are linked together by the lock_entry member in
* struct lock_class.
*/
-LIST_HEAD(all_lock_classes);
+static LIST_HEAD(all_lock_classes);
static LIST_HEAD(free_lock_classes);
/**
@@ -1252,6 +1250,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
struct lockdep_subclass_key *key;
struct hlist_head *hash_head;
struct lock_class *class;
+ int idx;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -1317,6 +1316,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* of classes.
*/
list_move_tail(&class->lock_entry, &all_lock_classes);
+ idx = class - lock_classes;
+ if (idx > max_lock_class_idx)
+ max_lock_class_idx = idx;
if (verbose(class)) {
graph_unlock();
@@ -3462,7 +3464,7 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
u16 chain_hlock = chain_hlocks[chain->base + i];
unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
- return lock_classes + class_idx - 1;
+ return lock_classes + class_idx;
}
/*
@@ -3530,7 +3532,7 @@ static void print_chain_keys_chain(struct lock_chain *chain)
hlock_id = chain_hlocks[chain->base + i];
chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1);
+ print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id));
printk("\n");
}
}
@@ -6000,6 +6002,8 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
WRITE_ONCE(class->name, NULL);
nr_lock_classes--;
__clear_bit(class - lock_classes, lock_classes_in_use);
+ if (class - lock_classes == max_lock_class_idx)
+ max_lock_class_idx--;
} else {
WARN_ONCE(true, "%s() failed for class %s\n", __func__,
class->name);
@@ -6011,13 +6015,10 @@ static void zap_class(struct pending_free *pf, struct lock_class *class)
static void reinit_class(struct lock_class *class)
{
- void *const p = class;
- const unsigned int offset = offsetof(struct lock_class, key);
-
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
- memset(p + offset, 0, sizeof(*class) - offset);
+ memset_startat(class, 0, key);
WARN_ON_ONCE(!class->lock_entry.next);
WARN_ON_ONCE(!list_empty(&class->locks_after));
WARN_ON_ONCE(!list_empty(&class->locks_before));
@@ -6290,7 +6291,13 @@ void lockdep_reset_lock(struct lockdep_map *lock)
lockdep_reset_lock_reg(lock);
}
-/* Unregister a dynamically allocated key. */
+/*
+ * Unregister a dynamically allocated key.
+ *
+ * Unlike lockdep_register_key(), a search is always done to find a matching
+ * key irrespective of debug_locks to avoid potential invalid access to freed
+ * memory in lock_class entry.
+ */
void lockdep_unregister_key(struct lock_class_key *key)
{
struct hlist_head *hash_head = keyhashentry(key);
@@ -6305,10 +6312,8 @@ void lockdep_unregister_key(struct lock_class_key *key)
return;
raw_local_irq_save(flags);
- if (!graph_lock())
- goto out_irq;
+ lockdep_lock();
- pf = get_pending_free();
hlist_for_each_entry_rcu(k, hash_head, hash_entry) {
if (k == key) {
hlist_del_rcu(&k->hash_entry);
@@ -6316,11 +6321,13 @@ void lockdep_unregister_key(struct lock_class_key *key)
break;
}
}
- WARN_ON_ONCE(!found);
- __lockdep_free_key_range(pf, key, 1);
- call_rcu_zapped(pf);
- graph_unlock();
-out_irq:
+ WARN_ON_ONCE(!found && debug_locks);
+ if (found) {
+ pf = get_pending_free();
+ __lockdep_free_key_range(pf, key, 1);
+ call_rcu_zapped(pf);
+ }
+ lockdep_unlock();
raw_local_irq_restore(flags);
/* Wait until is_dynamic_key() has finished accessing k->hash_entry. */
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index ecb8662e7a4e..bbe9000260d0 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -121,7 +121,6 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
-extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
@@ -151,6 +150,10 @@ extern unsigned int nr_large_chain_blocks;
extern unsigned int max_lockdep_depth;
extern unsigned int max_bfs_queue_depth;
+extern unsigned long max_lock_class_idx;
+
+extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
+extern unsigned long lock_classes_in_use[];
#ifdef CONFIG_PROVE_LOCKING
extern unsigned long lockdep_count_forward_deps(struct lock_class *);
@@ -205,7 +208,6 @@ struct lockdep_stats {
};
DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
-extern struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
#define __debug_atomic_inc(ptr) \
this_cpu_inc(lockdep_stats.ptr);
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index b8d9a050c337..15fdc7fa5c68 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -24,14 +24,33 @@
#include "lockdep_internals.h"
+/*
+ * Since iteration of lock_classes is done without holding the lockdep lock,
+ * it is not safe to iterate all_lock_classes list directly as the iteration
+ * may branch off to free_lock_classes or the zapped list. Iteration is done
+ * directly on the lock_classes array by checking the lock_classes_in_use
+ * bitmap and max_lock_class_idx.
+ */
+#define iterate_lock_classes(idx, class) \
+ for (idx = 0, class = lock_classes; idx <= max_lock_class_idx; \
+ idx++, class++)
+
static void *l_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &all_lock_classes, pos);
+ struct lock_class *class = v;
+
+ ++class;
+ *pos = class - lock_classes;
+ return (*pos > max_lock_class_idx) ? NULL : class;
}
static void *l_start(struct seq_file *m, loff_t *pos)
{
- return seq_list_start_head(&all_lock_classes, *pos);
+ unsigned long idx = *pos;
+
+ if (idx > max_lock_class_idx)
+ return NULL;
+ return lock_classes + idx;
}
static void l_stop(struct seq_file *m, void *v)
@@ -57,14 +76,16 @@ static void print_name(struct seq_file *m, struct lock_class *class)
static int l_show(struct seq_file *m, void *v)
{
- struct lock_class *class = list_entry(v, struct lock_class, lock_entry);
+ struct lock_class *class = v;
struct lock_list *entry;
char usage[LOCK_USAGE_CHARS];
+ int idx = class - lock_classes;
- if (v == &all_lock_classes) {
+ if (v == lock_classes)
seq_printf(m, "all lock classes:\n");
+
+ if (!test_bit(idx, lock_classes_in_use))
return 0;
- }
seq_printf(m, "%p", class->key);
#ifdef CONFIG_DEBUG_LOCKDEP
@@ -220,8 +241,11 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
if (class->usage_mask == 0)
nr_unused++;
@@ -254,6 +278,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
sum_forward_deps += lockdep_count_forward_deps(class);
}
+
#ifdef CONFIG_DEBUG_LOCKDEP
DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
#endif
@@ -345,6 +370,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
seq_printf(m, " max bfs queue depth: %11u\n",
max_bfs_queue_depth);
#endif
+ seq_printf(m, " max lock class index: %11lu\n",
+ max_lock_class_idx);
lockdep_stats_debug_show(m);
seq_printf(m, " debug_locks: %11u\n",
debug_locks);
@@ -622,12 +649,16 @@ static int lock_stat_open(struct inode *inode, struct file *file)
if (!res) {
struct lock_stat_data *iter = data->stats;
struct seq_file *m = file->private_data;
+ unsigned long idx;
- list_for_each_entry(class, &all_lock_classes, lock_entry) {
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
iter->class = class;
iter->stats = lock_stats(class);
iter++;
}
+
data->iter_end = iter;
sort(data->stats, data->iter_end - data->stats,
@@ -645,6 +676,7 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct lock_class *class;
+ unsigned long idx;
char c;
if (count) {
@@ -654,8 +686,11 @@ static ssize_t lock_stat_write(struct file *file, const char __user *buf,
if (c != '0')
return count;
- list_for_each_entry(class, &all_lock_classes, lock_entry)
+ iterate_lock_classes(idx, class) {
+ if (!test_bit(idx, lock_classes_in_use))
+ continue;
clear_lock_stats(class);
+ }
}
return count;
}
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 70a32a576f3f..c9fdae94e098 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -7,6 +7,7 @@
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
+#include <linux/sched/debug.h>
#include <linux/errno.h>
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
@@ -162,7 +163,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
__set_current_state(TASK_RUNNING);
}
-bool __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
+bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
{
if (__percpu_down_read_trylock(sem))
return true;
@@ -211,7 +212,7 @@ static bool readers_active_check(struct percpu_rw_semaphore *sem)
return true;
}
-void percpu_down_write(struct percpu_rw_semaphore *sem)
+void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 69aba4abe104..acde5d6f1254 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1048,7 +1048,7 @@ out_nolock:
/*
* Wait until we successfully acquire the write lock
*/
-static struct rw_semaphore *
+static struct rw_semaphore __sched *
rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
{
long count;
diff --git a/kernel/module.c b/kernel/module.c
index 24dab046e16c..6cea788fd965 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -335,7 +335,7 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
/*
* A thread that wants to hold a reference to a module only while it
- * is running can call this to safely exit. nfsd and lockd use this.
+ * is running can call this to safely exit.
*/
void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
{
@@ -3725,12 +3725,6 @@ static noinline int do_init_module(struct module *mod)
}
freeinit->module_init = mod->init_layout.base;
- /*
- * We want to find out whether @mod uses async during init. Clear
- * PF_USED_ASYNC. async_schedule*() will set it.
- */
- current->flags &= ~PF_USED_ASYNC;
-
do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
@@ -3756,22 +3750,13 @@ static noinline int do_init_module(struct module *mod)
/*
* We need to finish all async code before the module init sequence
- * is done. This has potential to deadlock. For example, a newly
- * detected block device can trigger request_module() of the
- * default iosched from async probing task. Once userland helper
- * reaches here, async_synchronize_full() will wait on the async
- * task waiting on request_module() and deadlock.
- *
- * This deadlock is avoided by perfomring async_synchronize_full()
- * iff module init queued any async jobs. This isn't a full
- * solution as it will deadlock the same if module loading from
- * async jobs nests more than once; however, due to the various
- * constraints, this hack seems to be the best option for now.
- * Please refer to the following thread for details.
+ * is done. This has potential to deadlock if synchronous module
+ * loading is requested from async (which is not allowed!).
*
- * http://thread.gmane.org/gmane.linux.kernel/1420814
+ * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
+ * request_module() from async workers") for more details.
*/
- if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
+ if (!mod->async_probe_requested)
async_synchronize_full();
ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
diff --git a/kernel/module_decompress.c b/kernel/module_decompress.c
index b01c69c2ff99..ffef98a20320 100644
--- a/kernel/module_decompress.c
+++ b/kernel/module_decompress.c
@@ -250,6 +250,7 @@ void module_decompress_cleanup(struct load_info *info)
info->max_pages = info->used_pages = 0;
}
+#ifdef CONFIG_SYSFS
static ssize_t compression_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
@@ -269,3 +270,4 @@ static int __init module_decompress_sysfs_init(void)
return 0;
}
late_initcall(module_decompress_sysfs_init);
+#endif
diff --git a/kernel/padata.c b/kernel/padata.c
index 18d3a5c699d8..e5819bb8bd1d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -181,7 +181,7 @@ int padata_do_parallel(struct padata_shell *ps,
goto out;
if (!cpumask_test_cpu(*cb_cpu, pd->cpumask.cbcpu)) {
- if (!cpumask_weight(pd->cpumask.cbcpu))
+ if (cpumask_empty(pd->cpumask.cbcpu))
goto out;
/* Select an alternate fallback CPU and notify the caller. */
diff --git a/kernel/panic.c b/kernel/panic.c
index 55b50e052ec3..eb4dfb932c85 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -66,6 +66,7 @@ EXPORT_SYMBOL_GPL(panic_timeout);
#define PANIC_PRINT_LOCK_INFO 0x00000008
#define PANIC_PRINT_FTRACE_INFO 0x00000010
#define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020
+#define PANIC_PRINT_ALL_CPU_BT 0x00000040
unsigned long panic_print;
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -147,10 +148,16 @@ void nmi_panic(struct pt_regs *regs, const char *msg)
}
EXPORT_SYMBOL(nmi_panic);
-static void panic_print_sys_info(void)
+static void panic_print_sys_info(bool console_flush)
{
- if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
- console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ if (console_flush) {
+ if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG)
+ console_flush_on_panic(CONSOLE_REPLAY_ALL);
+ return;
+ }
+
+ if (panic_print & PANIC_PRINT_ALL_CPU_BT)
+ trigger_all_cpu_backtrace();
if (panic_print & PANIC_PRINT_TASK_INFO)
show_state();
@@ -185,6 +192,16 @@ void panic(const char *fmt, ...)
int old_cpu, this_cpu;
bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers;
+ if (panic_on_warn) {
+ /*
+ * This thread may hit another WARN() in the panic path.
+ * Resetting this prevents additional WARN() from panicking the
+ * system on this thread. Other threads are blocked by the
+ * panic_mutex in panic().
+ */
+ panic_on_warn = 0;
+ }
+
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -272,6 +289,8 @@ void panic(const char *fmt, ...)
*/
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
+ panic_print_sys_info(false);
+
kmsg_dump(KMSG_DUMP_PANIC);
/*
@@ -302,7 +321,7 @@ void panic(const char *fmt, ...)
debug_locks_off();
console_flush_on_panic(CONSOLE_FLUSH_PENDING);
- panic_print_sys_info();
+ panic_print_sys_info(true);
if (!panic_blink)
panic_blink = no_blink;
@@ -576,16 +595,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
if (regs)
show_regs(regs);
- if (panic_on_warn) {
- /*
- * This thread may hit another WARN() in the panic path.
- * Resetting this prevents additional WARN() from panicking the
- * system on this thread. Other threads are blocked by the
- * panic_mutex in panic().
- */
- panic_on_warn = 0;
+ if (panic_on_warn)
panic("panic_on_warn set ...\n");
- }
if (!regs)
dump_stack();
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index e6af502c2fd7..938d5c78b421 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,7 +28,6 @@
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/ktime.h>
#include <linux/security.h>
#include <linux/secretmem.h>
@@ -689,8 +688,10 @@ static int load_image_and_restore(void)
lock_device_hotplug();
error = create_basic_memory_bitmaps();
- if (error)
+ if (error) {
+ swsusp_close(FMODE_READ | FMODE_EXCL);
goto Unlock;
+ }
error = swsusp_read(&flags);
swsusp_close(FMODE_READ | FMODE_EXCL);
@@ -1328,7 +1329,7 @@ static int __init resumedelay_setup(char *str)
int rc = kstrtouint(str, 0, &resume_delay);
if (rc)
- return rc;
+ pr_warn("resumedelay: bad option string '%s'\n", str);
return 1;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 44169f3081fd..7e646079fbeb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -504,7 +504,10 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
+ if (!pm_wakeup_irq())
+ return -ENODATA;
+
+ return sprintf(buf, "%u\n", pm_wakeup_irq());
}
power_attr_ro(pm_wakeup_irq);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index b7e7798637b8..11b570fcf049 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -134,7 +134,7 @@ int freeze_processes(void)
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
- pm_wakeup_clear(true);
+ pm_wakeup_clear(0);
pr_info("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f7a986078213..330d49937692 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -978,8 +978,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm)
* Register a range of page frames the contents of which should not be saved
* during hibernation (to be used in the early initialization code).
*/
-void __init __register_nosave_region(unsigned long start_pfn,
- unsigned long end_pfn, int use_kmalloc)
+void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
{
struct nosave_region *region;
@@ -995,18 +994,12 @@ void __init __register_nosave_region(unsigned long start_pfn,
goto Report;
}
}
- if (use_kmalloc) {
- /* During init, this shouldn't fail */
- region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
- BUG_ON(!region);
- } else {
- /* This allocation cannot fail */
- region = memblock_alloc(sizeof(struct nosave_region),
- SMP_CACHE_BYTES);
- if (!region)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(struct nosave_region));
- }
+ /* This allocation cannot fail */
+ region = memblock_alloc(sizeof(struct nosave_region),
+ SMP_CACHE_BYTES);
+ if (!region)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 80cc1f0f502b..6fcdee7e87a5 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -136,8 +136,6 @@ static void s2idle_loop(void)
break;
}
- pm_wakeup_clear(false);
-
s2idle_enter();
}
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index d20526c5be15..b663a97f5867 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value)
value++;
suspend_type = strsep(&value, ",");
if (!suspend_type)
- return 0;
+ return 1;
repeat = strsep(&value, ",");
if (repeat) {
if (kstrtou32(repeat, 0, &test_repeat_count_max))
- return 0;
+ return 1;
}
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
- return 0;
+ return 1;
}
printk(warn_bad_state, suspend_type);
- return 0;
+ return 1;
}
__setup("test_suspend", setup_test_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index ad10359030a4..91fffdd2c7fb 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -16,7 +16,6 @@
#include <linux/file.h>
#include <linux/delay.h>
#include <linux/bitops.h>
-#include <linux/genhd.h>
#include <linux/device.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
@@ -89,7 +88,7 @@ struct swap_map_page_list {
struct swap_map_page_list *next;
};
-/**
+/*
* The swap_map_handle structure is used for handling swap in
* a file-alike way
*/
@@ -117,7 +116,7 @@ struct swsusp_header {
static struct swsusp_header *swsusp_header;
-/**
+/*
* The following functions are used for tracing the allocated
* swap pages, so that they can be freed in case of an error.
*/
@@ -171,7 +170,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
return 0;
}
-/**
+/*
* alloc_swapdev_block - allocate a swap page and register that it has
* been allocated, so that it can be freed in case of an error.
*/
@@ -190,7 +189,7 @@ sector_t alloc_swapdev_block(int swap)
return 0;
}
-/**
+/*
* free_all_swap_pages - free swap pages allocated for saving image data.
* It also frees the extents used to register which swap entries had been
* allocated.
@@ -277,10 +276,9 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(GFP_NOIO | __GFP_HIGH, 1);
+ bio = bio_alloc(hib_resume_bdev, 1, op | op_flags,
+ GFP_NOIO | __GFP_HIGH);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
- bio_set_dev(bio, hib_resume_bdev);
- bio_set_op_attrs(bio, op, op_flags);
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
pr_err("Adding page to bio failed at %llu\n",
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 105df4dfc783..52571dcad768 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
{
struct rb_node *node;
struct wakelock *wl;
- char *str = buf;
- char *end = buf + PAGE_SIZE;
+ int len = 0;
mutex_lock(&wakelocks_lock);
for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
wl = rb_entry(node, struct wakelock, node);
if (wl->ws->active == show_active)
- str += scnprintf(str, end - str, "%s ", wl->name);
+ len += sysfs_emit_at(buf, len, "%s ", wl->name);
}
- if (str > buf)
- str--;
- str += scnprintf(str, end - str, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
mutex_unlock(&wakelocks_lock);
- return (str - buf);
+ return len;
}
#if CONFIG_PM_WAKELOCKS_LIMIT > 0
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 82abfaf3c2aa..da03c15ecc89 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -93,6 +93,12 @@ EXPORT_SYMBOL_GPL(console_drivers);
*/
int __read_mostly suppress_printk;
+/*
+ * During panic, heavy printk by other CPUs can delay the
+ * panic and risk deadlock on console resources.
+ */
+static int __read_mostly suppress_panic_printk;
+
#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
.name = "console_lock"
@@ -146,8 +152,10 @@ static int __control_devkmsg(char *str)
static int __init control_devkmsg(char *str)
{
- if (__control_devkmsg(str) < 0)
+ if (__control_devkmsg(str) < 0) {
+ pr_warn("printk.devkmsg: bad option string '%s'\n", str);
return 1;
+ }
/*
* Set sysctl string accordingly:
@@ -166,7 +174,7 @@ static int __init control_devkmsg(char *str)
*/
devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
- return 0;
+ return 1;
}
__setup("printk.devkmsg=", control_devkmsg);
@@ -257,6 +265,11 @@ static void __up_console_sem(unsigned long ip)
}
#define up_console_sem() __up_console_sem(_RET_IP_)
+static bool panic_in_progress(void)
+{
+ return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
+}
+
/*
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
@@ -1843,6 +1856,16 @@ static int console_trylock_spinning(void)
if (console_trylock())
return 1;
+ /*
+ * It's unsafe to spin once a panic has begun. If we are the
+ * panic CPU, we may have already halted the owner of the
+ * console_sem. If we are not the panic CPU, then we should
+ * avoid taking console_sem, so the panic CPU has a better
+ * chance of cleanly acquiring it later.
+ */
+ if (panic_in_progress())
+ return 0;
+
printk_safe_enter_irqsave(flags);
raw_spin_lock(&console_owner_lock);
@@ -2218,6 +2241,10 @@ asmlinkage int vprintk_emit(int facility, int level,
if (unlikely(suppress_printk))
return 0;
+ if (unlikely(suppress_panic_printk) &&
+ atomic_read(&panic_cpu) != raw_smp_processor_id())
+ return 0;
+
if (level == LOGLEVEL_SCHED) {
level = LOGLEVEL_DEFAULT;
in_sched = true;
@@ -2324,6 +2351,20 @@ asmlinkage __visible void early_printk(const char *fmt, ...)
}
#endif
+static void set_user_specified(struct console_cmdline *c, bool user_specified)
+{
+ if (!user_specified)
+ return;
+
+ /*
+ * @c console was defined by the user on the command line.
+ * Do not clear when added twice also by SPCR or the device tree.
+ */
+ c->user_specified = true;
+ /* At least one console defined by the user on the command line. */
+ console_set_on_cmdline = 1;
+}
+
static int __add_preferred_console(char *name, int idx, char *options,
char *brl_options, bool user_specified)
{
@@ -2340,8 +2381,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
if (strcmp(c->name, name) == 0 && c->index == idx) {
if (!brl_options)
preferred_console = i;
- if (user_specified)
- c->user_specified = true;
+ set_user_specified(c, user_specified);
return 0;
}
}
@@ -2351,7 +2391,7 @@ static int __add_preferred_console(char *name, int idx, char *options,
preferred_console = i;
strlcpy(c->name, name, sizeof(c->name));
c->options = options;
- c->user_specified = user_specified;
+ set_user_specified(c, user_specified);
braille_set_options(c, brl_options);
c->index = idx;
@@ -2417,7 +2457,6 @@ static int __init console_setup(char *str)
*s = 0;
__add_preferred_console(buf, idx, options, brl_options, true);
- console_set_on_cmdline = 1;
return 1;
}
__setup("console=", console_setup);
@@ -2574,6 +2613,25 @@ static int have_callable_console(void)
}
/*
+ * Return true when this CPU should unlock console_sem without pushing all
+ * messages to the console. This reduces the chance that the console is
+ * locked when the panic CPU tries to use it.
+ */
+static bool abandon_console_lock_in_panic(void)
+{
+ if (!panic_in_progress())
+ return false;
+
+ /*
+ * We can use raw_smp_processor_id() here because it is impossible for
+ * the task to be migrated to the panic_cpu, or away from it. If
+ * panic_cpu has already been set, and we're not currently executing on
+ * that CPU, then we never will be.
+ */
+ return atomic_read(&panic_cpu) != raw_smp_processor_id();
+}
+
+/*
* Can we actually use the console at this time on this cpu?
*
* Console drivers may assume that per-cpu resources have been allocated. So
@@ -2603,6 +2661,7 @@ void console_unlock(void)
{
static char ext_text[CONSOLE_EXT_LOG_MAX];
static char text[CONSOLE_LOG_MAX];
+ static int panic_console_dropped;
unsigned long flags;
bool do_cond_resched, retry;
struct printk_info info;
@@ -2657,6 +2716,10 @@ skip:
if (console_seq != r.info->seq) {
console_dropped += r.info->seq - console_seq;
console_seq = r.info->seq;
+ if (panic_in_progress() && panic_console_dropped++ > 10) {
+ suppress_panic_printk = 1;
+ pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
+ }
}
if (suppress_message_printing(r.info->level)) {
@@ -2716,6 +2779,10 @@ skip:
if (handover)
return;
+ /* Allow panic_cpu to take over the consoles safely */
+ if (abandon_console_lock_in_panic())
+ break;
+
if (do_cond_resched)
cond_resched();
}
@@ -2733,7 +2800,7 @@ skip:
* flush, no worries.
*/
retry = prb_read_valid(prb, next_seq, NULL);
- if (retry && console_trylock())
+ if (retry && !abandon_console_lock_in_panic() && console_trylock())
goto again;
}
EXPORT_SYMBOL(console_unlock);
@@ -3228,7 +3295,7 @@ static DEFINE_PER_CPU(int, printk_pending);
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
- int pending = __this_cpu_xchg(printk_pending, 0);
+ int pending = this_cpu_xchg(printk_pending, 0);
if (pending & PRINTK_PENDING_OUTPUT) {
/* If trylock fails, someone else is doing the printing */
@@ -3262,7 +3329,7 @@ void defer_console_output(void)
return;
preempt_disable();
- __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
+ this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
preempt_enable();
}
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
index 8a7b7362c0dd..2b7b6ddab4f7 100644
--- a/kernel/printk/printk_ringbuffer.c
+++ b/kernel/printk/printk_ringbuffer.c
@@ -474,8 +474,10 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
* state has been re-checked. A memcpy() for all of @desc
* cannot be used because of the atomic_t @state_var field.
*/
- memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
- sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ if (desc_out) {
+ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+ sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ }
if (seq_out)
*seq_out = info->seq; /* also part of desc_read:C */
if (caller_id_out)
@@ -528,7 +530,8 @@ static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
d_state = get_desc_state(id, state_val);
out:
- atomic_long_set(&desc_out->state_var, state_val);
+ if (desc_out)
+ atomic_long_set(&desc_out->state_var, state_val);
return d_state;
}
@@ -1449,6 +1452,9 @@ static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
+
+ /* Best effort to remember the last finalized @id. */
+ atomic_long_set(&desc_ring->last_finalized_id, id);
}
/**
@@ -1657,7 +1663,12 @@ void prb_commit(struct prb_reserved_entry *e)
*/
void prb_final_commit(struct prb_reserved_entry *e)
{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+
_prb_commit(e, desc_finalized);
+
+ /* Best effort to remember the last finalized @id. */
+ atomic_long_set(&desc_ring->last_finalized_id, e->id);
}
/*
@@ -2005,9 +2016,39 @@ u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
*/
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
- u64 seq = 0;
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ unsigned long id;
+ u64 seq;
+
+ /* Check if the cached @id still points to a valid @seq. */
+ id = atomic_long_read(&desc_ring->last_finalized_id);
+ d_state = desc_read(desc_ring, id, NULL, &seq, NULL);
- /* Search forward from the oldest descriptor. */
+ if (d_state == desc_finalized || d_state == desc_reusable) {
+ /*
+ * Begin searching after the last finalized record.
+ *
+ * On 0, the search must begin at 0 because of hack#2
+ * of the bootstrapping phase it is not known if a
+ * record at index 0 exists.
+ */
+ if (seq != 0)
+ seq++;
+ } else {
+ /*
+ * The information about the last finalized sequence number
+ * has gone. It should happen only when there is a flood of
+ * new messages and the ringbuffer is rapidly recycled.
+ * Give up and start from the beginning.
+ */
+ seq = 0;
+ }
+
+ /*
+ * The information about the last finalized @seq might be inaccurate.
+ * Search forward to find the current one.
+ */
while (_prb_read_valid(rb, &seq, NULL, NULL))
seq++;
@@ -2044,6 +2085,7 @@ void prb_init(struct printk_ringbuffer *rb,
rb->desc_ring.infos = infos;
atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.last_finalized_id, DESC0_ID(descbits));
rb->text_data_ring.size_bits = textbits;
rb->text_data_ring.data = text_buf;
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
index 73cc80e01cef..18cd25e489b8 100644
--- a/kernel/printk/printk_ringbuffer.h
+++ b/kernel/printk/printk_ringbuffer.h
@@ -75,6 +75,7 @@ struct prb_desc_ring {
struct printk_info *infos;
atomic_long_t head_id;
atomic_long_t tail_id;
+ atomic_long_t last_finalized_id;
};
/*
@@ -258,6 +259,7 @@ static struct printk_ringbuffer name = { \
.infos = &_##name##_infos[0], \
.head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
.tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .last_finalized_id = ATOMIC_INIT(DESC0_ID(descbits)), \
}, \
.text_data_ring = { \
.size_bits = (avgtextbits) + (descbits), \
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
index 653ae04aab7f..c228343eeb97 100644
--- a/kernel/printk/sysctl.c
+++ b/kernel/printk/sysctl.c
@@ -12,7 +12,7 @@
static const int ten_thousand = 10000;
static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eea265082e97..ccc4b465775b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -371,6 +371,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
+static int check_ptrace_options(unsigned long data)
+{
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
+
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+ return 0;
+}
+
static int ptrace_attach(struct task_struct *task, long request,
unsigned long addr,
unsigned long flags)
@@ -382,8 +402,16 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize) {
if (addr != 0)
goto out;
+ /*
+ * This duplicates the check in check_ptrace_options() because
+ * ptrace_attach() and ptrace_setoptions() have historically
+ * used different error codes for unknown ptrace options.
+ */
if (flags & ~(unsigned long)PTRACE_O_MASK)
goto out;
+ retval = check_ptrace_options(flags);
+ if (retval)
+ return retval;
flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
} else {
flags = PT_PTRACED;
@@ -654,22 +682,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
unsigned flags;
+ int ret;
- if (data & ~(unsigned long)PTRACE_O_MASK)
- return -EINVAL;
-
- if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
- if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
- !IS_ENABLED(CONFIG_SECCOMP))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
- current->ptrace & PT_SUSPEND_SECCOMP)
- return -EPERM;
- }
+ ret = check_ptrace_options(data);
+ if (ret)
+ return ret;
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index e373fbe44da5..431cee212467 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -56,13 +56,13 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
int flags)
{
- rsclp->flags |= flags;
+ WRITE_ONCE(rsclp->flags, rsclp->flags | flags);
}
static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp,
int flags)
{
- rsclp->flags &= ~flags;
+ WRITE_ONCE(rsclp->flags, rsclp->flags & ~flags);
}
static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 422f7e4cc08d..55d049c39608 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -284,7 +284,7 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
-static bool rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
+static atomic_t rcu_fwd_cb_nodelay; /* Short rcu_torture_delay() delays. */
/*
* Allocate an element from the rcu_tortures pool.
@@ -387,7 +387,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!READ_ONCE(rcu_fwd_cb_nodelay) &&
+ if (!atomic_read(&rcu_fwd_cb_nodelay) &&
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
@@ -674,6 +674,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcu"
@@ -708,6 +709,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
.name = "srcud"
@@ -997,7 +999,7 @@ static int rcu_torture_boost(void *arg)
goto checkwait;
/* Wait for the next test interval. */
- oldstarttime = boost_starttime;
+ oldstarttime = READ_ONCE(boost_starttime);
while (time_before(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
if (stutter_wait("rcu_torture_boost"))
@@ -1041,10 +1043,11 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime && !kthread_should_stop()) {
+ while (oldstarttime == READ_ONCE(boost_starttime) && !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
if (oldstarttime == boost_starttime) {
- boost_starttime = jiffies + test_boost_interval * HZ;
+ WRITE_ONCE(boost_starttime,
+ jiffies + test_boost_interval * HZ);
n_rcu_torture_boosts++;
}
mutex_unlock(&boost_mutex);
@@ -1276,7 +1279,7 @@ rcu_torture_writer(void *arg)
boot_ended = rcu_inkernel_boot_has_ended();
stutter_waited = stutter_wait("rcu_torture_writer");
if (stutter_waited &&
- !READ_ONCE(rcu_fwd_cb_nodelay) &&
+ !atomic_read(&rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
!torture_must_stop() &&
boot_ended)
@@ -2180,7 +2183,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
for (i = ARRAY_SIZE(rfp->n_launders_hist) - 1; i > 0; i--)
if (rfp->n_launders_hist[i].n_launders > 0)
break;
- mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
pr_alert("%s: Callback-invocation histogram %d (duration %lu jiffies):",
__func__, rfp->rcu_fwd_id, jiffies - rfp->rcu_fwd_startat);
gps_old = rfp->rcu_launder_gp_seq_start;
@@ -2193,7 +2195,6 @@ static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp)
gps_old = gps;
}
pr_cont("\n");
- mutex_unlock(&rcu_fwd_mutex);
}
/* Callback function for continuous-flood RCU callbacks. */
@@ -2281,6 +2282,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
unsigned long stopat;
static DEFINE_TORTURE_RANDOM(trs);
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (!cur_ops->sync)
return; // Cannot do need_resched() forward progress testing without ->sync.
if (cur_ops->call && cur_ops->cb_barrier) {
@@ -2289,7 +2291,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
}
/* Tight loop containing cond_resched(). */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 0);
@@ -2325,6 +2327,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
if (selfpropcb) {
WRITE_ONCE(fcs.stop, 1);
cur_ops->sync(); /* Wait for running CB to complete. */
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for queued callbacks. */
}
@@ -2333,7 +2336,7 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp,
destroy_rcu_head_on_stack(&fcs.rh);
}
schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
/* Carry out call_rcu() forward-progress testing. */
@@ -2353,13 +2356,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
unsigned long stopat;
unsigned long stoppedat;
+ pr_alert("%s: Starting forward-progress test %d\n", __func__, rfp->rcu_fwd_id);
if (READ_ONCE(rcu_fwd_emergency_stop))
return; /* Get out of the way quickly, no GP wait! */
if (!cur_ops->call)
return; /* Can't do call_rcu() fwd prog without ->call. */
/* Loop continuously posting RCU callbacks. */
- WRITE_ONCE(rcu_fwd_cb_nodelay, true);
+ atomic_inc(&rcu_fwd_cb_nodelay);
cur_ops->sync(); /* Later readers see above write. */
WRITE_ONCE(rfp->rcu_fwd_startat, jiffies);
stopat = rfp->rcu_fwd_startat + MAX_FWD_CB_JIFFIES;
@@ -2414,6 +2418,7 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders_cb_snap = READ_ONCE(rfp->n_launders_cb);
cver = READ_ONCE(rcu_torture_current_version) - cver;
gps = rcutorture_seq_diff(cur_ops->get_gp_seq(), gps);
+ pr_alert("%s: Waiting for CBs: %pS() %d\n", __func__, cur_ops->cb_barrier, rfp->rcu_fwd_id);
cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */
(void)rcu_torture_fwd_prog_cbfree(rfp);
@@ -2427,11 +2432,13 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp)
n_launders, n_launders_sa,
n_max_gps, n_max_cbs, cver, gps);
atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs);
+ mutex_lock(&rcu_fwd_mutex); // Serialize histograms.
rcu_torture_fwd_cb_hist(rfp);
+ mutex_unlock(&rcu_fwd_mutex);
}
schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */
tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
- WRITE_ONCE(rcu_fwd_cb_nodelay, false);
+ atomic_dec(&rcu_fwd_cb_nodelay);
}
@@ -2511,7 +2518,7 @@ static int rcu_torture_fwd_prog(void *args)
firsttime = false;
WRITE_ONCE(rcu_fwd_seq, rcu_fwd_seq + 1);
} else {
- while (READ_ONCE(rcu_fwd_seq) == oldseq)
+ while (READ_ONCE(rcu_fwd_seq) == oldseq && !torture_must_stop())
schedule_timeout_interruptible(1);
oldseq = READ_ONCE(rcu_fwd_seq);
}
@@ -2905,8 +2912,10 @@ rcu_torture_cleanup(void)
int i;
if (torture_cleanup_begin()) {
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
return;
}
if (!cur_ops) {
@@ -2961,8 +2970,10 @@ rcu_torture_cleanup(void)
* Wait for all RCU callbacks to fire, then do torture-type-specific
* cleanup operations.
*/
- if (cur_ops->cb_barrier != NULL)
+ if (cur_ops->cb_barrier != NULL) {
+ pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier);
cur_ops->cb_barrier();
+ }
if (cur_ops->cleanup != NULL)
cur_ops->cleanup();
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 84f1d91604cc..99cf3a13954c 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -123,7 +123,7 @@ static struct rcu_tasks rt_name = \
.call_func = call, \
.rtpcpu = &rt_name ## __percpu, \
.name = n, \
- .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \
+ .percpu_enqueue_shift = order_base_2(CONFIG_NR_CPUS), \
.percpu_enqueue_lim = 1, \
.percpu_dequeue_lim = 1, \
.barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \
@@ -216,6 +216,7 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
int cpu;
unsigned long flags;
int lim;
+ int shift;
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rcu_task_enqueue_lim < 0) {
@@ -229,7 +230,10 @@ static void cblist_init_generic(struct rcu_tasks *rtp)
if (lim > nr_cpu_ids)
lim = nr_cpu_ids;
- WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim));
+ shift = ilog2(nr_cpu_ids / lim);
+ if (((nr_cpu_ids - 1) >> shift) >= lim)
+ shift++;
+ WRITE_ONCE(rtp->percpu_enqueue_shift, shift);
WRITE_ONCE(rtp->percpu_dequeue_lim, lim);
smp_store_release(&rtp->percpu_enqueue_lim, lim);
for_each_possible_cpu(cpu) {
@@ -298,7 +302,7 @@ static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
if (unlikely(needadjust)) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rtp->percpu_enqueue_lim != nr_cpu_ids) {
- WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids));
+ WRITE_ONCE(rtp->percpu_enqueue_shift, 0);
WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids);
smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids);
pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name);
@@ -413,7 +417,7 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) {
raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags);
if (rtp->percpu_enqueue_lim > 1) {
- WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids));
+ WRITE_ONCE(rtp->percpu_enqueue_shift, order_base_2(nr_cpu_ids));
smp_store_release(&rtp->percpu_enqueue_lim, 1);
rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu();
pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name);
@@ -492,7 +496,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct rcu_tasks *rtp = arg;
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
- housekeeping_affine(current, HK_FLAG_RCU);
+ housekeeping_affine(current, HK_TYPE_RCU);
WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start!
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4c25a6283b0..a4b8189455d5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -87,11 +87,12 @@ static struct rcu_state rcu_state = {
.gp_state = RCU_GP_IDLE,
.gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
.barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
+ .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
.name = RCU_NAME,
.abbr = RCU_ABBR,
.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
- .ofl_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.ofl_lock),
+ .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -153,7 +154,7 @@ static void sync_sched_exp_online_cleanup(int cpu);
static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp);
-/* rcuc/rcub kthread realtime priority */
+/* rcuc/rcub/rcuop kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
module_param(kthread_prio, int, 0444);
@@ -222,6 +223,16 @@ static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
}
/*
+ * Is the CPU corresponding to the specified rcu_data structure online
+ * from RCU's perspective? This perspective is given by that structure's
+ * ->qsmaskinitnext field rather than by the global cpu_online_mask.
+ */
+static bool rcu_rdp_cpu_online(struct rcu_data *rdp)
+{
+ return !!(rdp->grpmask & rcu_rnp_online_cpus(rdp->mynode));
+}
+
+/*
* Return true if an RCU grace period is in progress. The READ_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -1086,9 +1097,8 @@ void rcu_irq_enter_irqson(void)
* Just check whether or not this CPU has non-offloaded RCU callbacks
* queued.
*/
-int rcu_needs_cpu(u64 basemono, u64 *nextevt)
+int rcu_needs_cpu(void)
{
- *nextevt = KTIME_MAX;
return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) &&
!rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data));
}
@@ -1167,15 +1177,20 @@ void rcu_request_urgent_qs_task(struct task_struct *t)
bool rcu_lockdep_current_cpu_online(void)
{
struct rcu_data *rdp;
- struct rcu_node *rnp;
bool ret = false;
if (in_nmi() || !rcu_scheduler_fully_active)
return true;
preempt_disable_notrace();
rdp = this_cpu_ptr(&rcu_data);
- rnp = rdp->mynode;
- if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1)
+ /*
+ * Strictly, we care here about the case where the current CPU is
+ * in rcu_cpu_starting() and thus has an excuse for rdp->grpmask
+ * not being up to date. So arch_spin_is_locked() might have a
+ * false positive if it's held by some *other* CPU, but that's
+ * OK because that just means a false *negative* on the warning.
+ */
+ if (rcu_rdp_cpu_online(rdp) || arch_spin_is_locked(&rcu_state.ofl_lock))
ret = true;
preempt_enable_notrace();
return ret;
@@ -1260,8 +1275,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* For more detail, please refer to the "Hotplug CPU" section
* of RCU's Requirements documentation.
*/
- if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) {
- bool onl;
+ if (WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp))) {
struct rcu_node *rnp1;
pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
@@ -1270,9 +1284,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx ->rcu_gp_init_mask %#lx\n",
__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
- __func__, rdp->cpu, ".o"[onl],
+ __func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
return 1; /* Break things loose after complaining. */
@@ -1739,7 +1752,6 @@ static void rcu_strict_gp_boundary(void *unused)
*/
static noinline_for_stack bool rcu_gp_init(void)
{
- unsigned long firstseq;
unsigned long flags;
unsigned long oldmask;
unsigned long mask;
@@ -1782,22 +1794,17 @@ static noinline_for_stack bool rcu_gp_init(void)
* of RCU's Requirements documentation.
*/
WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
+ /* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- // Wait for CPU-hotplug operations that might have
- // started before this grace period did.
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values.
- firstseq = READ_ONCE(rnp->ofl_seq);
- if (firstseq & 0x1)
- while (firstseq == READ_ONCE(rnp->ofl_seq))
- schedule_timeout_idle(1); // Can't wake unless RCU is watching.
- smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values.
- raw_spin_lock(&rcu_state.ofl_lock);
- raw_spin_lock_irq_rcu_node(rnp);
+ local_irq_save(flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
+ raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
continue;
}
@@ -1832,8 +1839,9 @@ static noinline_for_stack bool rcu_gp_init(void)
rcu_cleanup_dead_rnp(rnp);
}
- raw_spin_unlock_irq_rcu_node(rnp);
- raw_spin_unlock(&rcu_state.ofl_lock);
+ raw_spin_unlock_rcu_node(rnp);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -2850,10 +2858,12 @@ static void rcu_cpu_kthread(unsigned int cpu)
{
unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status);
char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work);
+ unsigned long *j = this_cpu_ptr(&rcu_data.rcuc_activity);
int spincnt;
trace_rcu_utilization(TPS("Start CPU kthread@rcu_run"));
for (spincnt = 0; spincnt < 10; spincnt++) {
+ WRITE_ONCE(*j, jiffies);
local_bh_disable();
*statusp = RCU_KTHREAD_RUNNING;
local_irq_disable();
@@ -2874,6 +2884,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
schedule_timeout_idle(2);
trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
*statusp = RCU_KTHREAD_WAITING;
+ WRITE_ONCE(*j, jiffies);
}
static struct smp_hotplug_thread rcu_cpu_thread_spec = {
@@ -2894,7 +2905,7 @@ static int __init rcu_spawn_core_kthreads(void)
for_each_possible_cpu(cpu)
per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0;
- if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq)
+ if (use_softirq)
return 0;
WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec),
"%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__);
@@ -2995,9 +3006,47 @@ static void check_cb_ovld(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp);
}
-/* Helper function for call_rcu() and friends. */
-static void
-__call_rcu(struct rcu_head *head, rcu_callback_t func)
+/**
+ * call_rcu() - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all pre-existing RCU read-side
+ * critical sections have completed. However, the callback function
+ * might well execute concurrently with RCU read-side critical sections
+ * that started after call_rcu() was invoked.
+ *
+ * RCU read-side critical sections are delimited by rcu_read_lock()
+ * and rcu_read_unlock(), and may be nested. In addition, but only in
+ * v5.0 and later, regions of code across which interrupts, preemption,
+ * or softirqs have been disabled also serve as RCU read-side critical
+ * sections. This includes hardware interrupt handlers, softirq handlers,
+ * and NMI handlers.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing RCU read-side critical section. On systems with more
+ * than one CPU, this means that when "func()" is invoked, each CPU is
+ * guaranteed to have executed a full memory barrier since the end of its
+ * last RCU read-side critical section whose beginning preceded the call
+ * to call_rcu(). It also means that each CPU executing an RCU read-side
+ * critical section that continues beyond the start of "func()" must have
+ * executed a memory barrier after the call_rcu() but before the beginning
+ * of that RCU read-side critical section. Note that these guarantees
+ * include CPUs that are offline, idle, or executing in user mode, as
+ * well as CPUs that are executing in the kernel.
+ *
+ * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
+ * resulting RCU callback function "func()", then both CPU A and CPU B are
+ * guaranteed to execute a full memory barrier during the time interval
+ * between the call to call_rcu() and the invocation of "func()" -- even
+ * if CPU A and CPU B are the same CPU (but again only if the system has
+ * more than one CPU).
+ *
+ * Implementation of these memory-ordering guarantees is described here:
+ * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ */
+void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
static atomic_t doublefrees;
unsigned long flags;
@@ -3011,7 +3060,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
/*
* Probable double call_rcu(), so leak the callback.
* Use rcu:rcu_callback trace event to find the previous
- * time callback was passed to __call_rcu().
+ * time callback was passed to call_rcu().
*/
if (atomic_inc_return(&doublefrees) < 4) {
pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
@@ -3022,8 +3071,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
}
head->func = func;
head->next = NULL;
- local_irq_save(flags);
kasan_record_aux_stack_noalloc(head);
+ local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
/* Add the callback to our list. */
@@ -3060,51 +3109,6 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
local_irq_restore(flags);
}
}
-
-/**
- * call_rcu() - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual callback function to be invoked after the grace period
- *
- * The callback function will be invoked some time after a full grace
- * period elapses, in other words after all pre-existing RCU read-side
- * critical sections have completed. However, the callback function
- * might well execute concurrently with RCU read-side critical sections
- * that started after call_rcu() was invoked.
- *
- * RCU read-side critical sections are delimited by rcu_read_lock()
- * and rcu_read_unlock(), and may be nested. In addition, but only in
- * v5.0 and later, regions of code across which interrupts, preemption,
- * or softirqs have been disabled also serve as RCU read-side critical
- * sections. This includes hardware interrupt handlers, softirq handlers,
- * and NMI handlers.
- *
- * Note that all CPUs must agree that the grace period extended beyond
- * all pre-existing RCU read-side critical section. On systems with more
- * than one CPU, this means that when "func()" is invoked, each CPU is
- * guaranteed to have executed a full memory barrier since the end of its
- * last RCU read-side critical section whose beginning preceded the call
- * to call_rcu(). It also means that each CPU executing an RCU read-side
- * critical section that continues beyond the start of "func()" must have
- * executed a memory barrier after the call_rcu() but before the beginning
- * of that RCU read-side critical section. Note that these guarantees
- * include CPUs that are offline, idle, or executing in user mode, as
- * well as CPUs that are executing in the kernel.
- *
- * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
- * resulting RCU callback function "func()", then both CPU A and CPU B are
- * guaranteed to execute a full memory barrier during the time interval
- * between the call to call_rcu() and the invocation of "func()" -- even
- * if CPU A and CPU B are the same CPU (but again only if the system has
- * more than one CPU).
- *
- * Implementation of these memory-ordering guarantees is described here:
- * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
- */
-void call_rcu(struct rcu_head *head, rcu_callback_t func)
-{
- __call_rcu(head, func);
-}
EXPORT_SYMBOL_GPL(call_rcu);
@@ -3984,13 +3988,16 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
}
/*
- * Called with preemption disabled, and from cross-cpu IRQ context.
+ * If needed, entrain an rcu_barrier() callback on rdp->cblist.
*/
-static void rcu_barrier_func(void *cpu_in)
+static void rcu_barrier_entrain(struct rcu_data *rdp)
{
- uintptr_t cpu = (uintptr_t)cpu_in;
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
+ unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+ lockdep_assert_held(&rcu_state.barrier_lock);
+ if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
+ return;
rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
@@ -4000,10 +4007,26 @@ static void rcu_barrier_func(void *cpu_in)
atomic_inc(&rcu_state.barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- rcu_barrier_trace(TPS("IRQNQ"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
}
rcu_nocb_unlock(rdp);
+ smp_store_release(&rdp->barrier_seq_snap, gseq);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_handler(void *cpu_in)
+{
+ uintptr_t cpu = (uintptr_t)cpu_in;
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ lockdep_assert_irqs_disabled();
+ WARN_ON_ONCE(cpu != rdp->cpu);
+ WARN_ON_ONCE(cpu != smp_processor_id());
+ raw_spin_lock(&rcu_state.barrier_lock);
+ rcu_barrier_entrain(rdp);
+ raw_spin_unlock(&rcu_state.barrier_lock);
}
/**
@@ -4017,6 +4040,8 @@ static void rcu_barrier_func(void *cpu_in)
void rcu_barrier(void)
{
uintptr_t cpu;
+ unsigned long flags;
+ unsigned long gseq;
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rcu_state.barrier_sequence);
@@ -4027,15 +4052,16 @@ void rcu_barrier(void)
/* Did someone else do our work for us? */
if (rcu_seq_done(&rcu_state.barrier_sequence, s)) {
- rcu_barrier_trace(TPS("EarlyExit"), -1,
- rcu_state.barrier_sequence);
+ rcu_barrier_trace(TPS("EarlyExit"), -1, rcu_state.barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rcu_state.barrier_mutex);
return;
}
/* Mark the start of the barrier operation. */
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
rcu_seq_start(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
/*
@@ -4047,7 +4073,7 @@ void rcu_barrier(void)
*/
init_completion(&rcu_state.barrier_completion);
atomic_set(&rcu_state.barrier_cpu_count, 2);
- cpus_read_lock();
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
/*
* Force each CPU with callbacks to register a new callback.
@@ -4056,29 +4082,31 @@ void rcu_barrier(void)
*/
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- if (cpu_is_offline(cpu) &&
- !rcu_rdp_is_offloaded(rdp))
+retry:
+ if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
continue;
- if (rcu_segcblist_n_cbs(&rdp->cblist) && cpu_online(cpu)) {
- rcu_barrier_trace(TPS("OnlineQ"), cpu,
- rcu_state.barrier_sequence);
- smp_call_function_single(cpu, rcu_barrier_func, (void *)cpu, 1);
- } else if (rcu_segcblist_n_cbs(&rdp->cblist) &&
- cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu,
- rcu_state.barrier_sequence);
- local_irq_disable();
- rcu_barrier_func((void *)cpu);
- local_irq_enable();
- } else if (cpu_is_offline(cpu)) {
- rcu_barrier_trace(TPS("OfflineNoCBNoQ"), cpu,
- rcu_state.barrier_sequence);
- } else {
- rcu_barrier_trace(TPS("OnlineNQ"), cpu,
- rcu_state.barrier_sequence);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ if (!rcu_rdp_cpu_online(rdp)) {
+ rcu_barrier_entrain(rdp);
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
+ continue;
+ }
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
+ schedule_timeout_uninterruptible(1);
+ goto retry;
}
+ WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
+ rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
}
- cpus_read_unlock();
/*
* Now that we have an rcu_barrier_callback() callback on each
@@ -4093,6 +4121,12 @@ void rcu_barrier(void)
/* Mark the end of the barrier operation. */
rcu_barrier_trace(TPS("Inc2"), -1, rcu_state.barrier_sequence);
rcu_seq_end(&rcu_state.barrier_sequence);
+ gseq = rcu_state.barrier_sequence;
+ for_each_possible_cpu(cpu) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+
+ WRITE_ONCE(rdp->barrier_seq_snap, gseq);
+ }
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rcu_state.barrier_mutex);
@@ -4140,6 +4174,7 @@ rcu_boot_init_percpu_data(int cpu)
INIT_WORK(&rdp->strict_work, strict_work_handler);
WARN_ON_ONCE(rdp->dynticks_nesting != 1);
WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
+ rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
@@ -4287,12 +4322,13 @@ void rcu_cpu_starting(unsigned int cpu)
rnp = rdp->mynode;
mask = rdp->grpmask;
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
+ local_irq_save(flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
rcu_dynticks_eqs_online();
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ raw_spin_lock(&rcu_state.barrier_lock);
+ raw_spin_lock_rcu_node(rnp);
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
+ raw_spin_unlock(&rcu_state.barrier_lock);
newcpu = !(rnp->expmaskinitnext & mask);
rnp->expmaskinitnext |= mask;
/* Allow lockless access for expedited grace periods. */
@@ -4304,15 +4340,18 @@ void rcu_cpu_starting(unsigned int cpu)
/* An incoming CPU should never be blocking a grace period. */
if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
+ /* rcu_report_qs_rnp() *really* wants some flags to restore */
+ unsigned long flags2;
+
+ local_irq_save(flags2);
rcu_disable_urgency_upon_qs(rdp);
/* Report QS -after- changing ->qsmaskinitnext! */
- rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
+ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags2);
} else {
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ raw_spin_unlock_rcu_node(rnp);
}
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(flags);
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
@@ -4326,7 +4365,7 @@ void rcu_cpu_starting(unsigned int cpu)
*/
void rcu_report_dead(unsigned int cpu)
{
- unsigned long flags;
+ unsigned long flags, seq_flags;
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -4340,10 +4379,8 @@ void rcu_report_dead(unsigned int cpu)
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(!(rnp->ofl_seq & 0x1));
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- raw_spin_lock(&rcu_state.ofl_lock);
+ local_irq_save(seq_flags);
+ arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
@@ -4354,10 +4391,8 @@ void rcu_report_dead(unsigned int cpu)
}
WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- raw_spin_unlock(&rcu_state.ofl_lock);
- smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier().
- WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1);
- WARN_ON_ONCE(rnp->ofl_seq & 0x1);
+ arch_spin_unlock(&rcu_state.ofl_lock);
+ local_irq_restore(seq_flags);
rdp->cpu_started = false;
}
@@ -4380,7 +4415,9 @@ void rcutree_migrate_callbacks(int cpu)
rcu_segcblist_empty(&rdp->cblist))
return; /* No callbacks to migrate. */
- local_irq_save(flags);
+ raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
+ rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
my_rnp = my_rdp->mynode;
rcu_nocb_lock(my_rdp); /* irqs already disabled. */
@@ -4390,10 +4427,10 @@ void rcutree_migrate_callbacks(int cpu)
needwake = rcu_advance_cbs(my_rnp, rdp) ||
rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
rcu_segcblist_disable(&rdp->cblist);
- WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
- !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
if (rcu_rdp_is_offloaded(my_rdp)) {
raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
__call_rcu_nocb_wake(my_rdp, true, flags);
@@ -4440,26 +4477,10 @@ static int rcu_pm_notify(struct notifier_block *self,
static int __init rcu_spawn_gp_kthread(void)
{
unsigned long flags;
- int kthread_prio_in = kthread_prio;
struct rcu_node *rnp;
struct sched_param sp;
struct task_struct *t;
- /* Force priority into range. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
- && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
- kthread_prio = 2;
- else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
- kthread_prio = 1;
- else if (kthread_prio < 0)
- kthread_prio = 0;
- else if (kthread_prio > 99)
- kthread_prio = 99;
-
- if (kthread_prio != kthread_prio_in)
- pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n",
- kthread_prio, kthread_prio_in);
-
rcu_scheduler_fully_active = 1;
t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
@@ -4570,6 +4591,7 @@ static void __init rcu_init_one(void)
init_waitqueue_head(&rnp->exp_wq[2]);
init_waitqueue_head(&rnp->exp_wq[3]);
spin_lock_init(&rnp->exp_lock);
+ mutex_init(&rnp->boost_kthread_mutex);
}
}
@@ -4585,6 +4607,28 @@ static void __init rcu_init_one(void)
}
/*
+ * Force priority from the kernel command-line into range.
+ */
+static void __init sanitize_kthread_prio(void)
+{
+ int kthread_prio_in = kthread_prio;
+
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 2
+ && IS_BUILTIN(CONFIG_RCU_TORTURE_TEST))
+ kthread_prio = 2;
+ else if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1)
+ kthread_prio = 1;
+ else if (kthread_prio < 0)
+ kthread_prio = 0;
+ else if (kthread_prio > 99)
+ kthread_prio = 99;
+
+ if (kthread_prio != kthread_prio_in)
+ pr_alert("%s: Limited prio to %d from %d\n",
+ __func__, kthread_prio, kthread_prio_in);
+}
+
+/*
* Compute the rcu_node tree geometry from kernel parameters. This cannot
* replace the definitions in tree.h because those are needed to size
* the ->node array in the rcu_state structure.
@@ -4744,6 +4788,7 @@ void __init rcu_init(void)
kfree_rcu_batch_init();
rcu_bootup_announce();
+ sanitize_kthread_prio();
rcu_init_geometry();
rcu_init_one();
if (dump_tree)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 486fc901bd08..926673ebe355 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -56,8 +56,6 @@ struct rcu_node {
/* Initialized from ->qsmaskinitnext at the */
/* beginning of each grace period. */
unsigned long qsmaskinitnext;
- unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */
- /* Online CPUs for next grace period. */
unsigned long expmask; /* CPUs or groups that need to check in */
/* to allow the current expedited GP */
/* to complete. */
@@ -110,6 +108,9 @@ struct rcu_node {
/* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
+ struct mutex boost_kthread_mutex;
+ /* Exclusion for thread spawning and affinity */
+ /* manipulation. */
struct task_struct *boost_kthread_task;
/* kthread that takes care of priority */
/* boosting for this rcu_node structure. */
@@ -190,6 +191,7 @@ struct rcu_data {
bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */
/* 4) rcu_barrier(), OOM callbacks, and expediting. */
+ unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
struct rcu_head barrier_head;
int exp_dynticks_snap; /* Double-check need for IPI. */
@@ -203,6 +205,8 @@ struct rcu_data {
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
+ struct mutex nocb_gp_kthread_mutex; /* Exclusion for nocb gp kthread */
+ /* spawning */
/* The following fields are used by call_rcu, hence own cacheline. */
raw_spinlock_t nocb_bypass_lock ____cacheline_internodealigned_in_smp;
@@ -237,6 +241,7 @@ struct rcu_data {
/* rcuc per-CPU kthread or NULL. */
unsigned int rcu_cpu_kthread_status;
char rcu_cpu_has_work;
+ unsigned long rcuc_activity;
/* 7) Diagnostic data, including RCU CPU stall warnings. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
@@ -302,9 +307,8 @@ struct rcu_state {
/* The following fields are guarded by the root rcu_node's lock. */
- u8 boost ____cacheline_internodealigned_in_smp;
- /* Subject to priority boost. */
- unsigned long gp_seq; /* Grace-period sequence #. */
+ unsigned long gp_seq ____cacheline_internodealigned_in_smp;
+ /* Grace-period sequence #. */
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
struct task_struct *gp_kthread; /* Task for grace periods. */
@@ -323,6 +327,8 @@ struct rcu_state {
/* rcu_barrier(). */
/* End of fields guarded by barrier_mutex. */
+ raw_spinlock_t barrier_lock; /* Protects ->barrier_seq_snap. */
+
struct mutex exp_mutex; /* Serialize expedited GP. */
struct mutex exp_wake_mutex; /* Serialize wakeup. */
unsigned long expedited_sequence; /* Take a ticket. */
@@ -355,7 +361,7 @@ struct rcu_state {
const char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
- raw_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
+ arch_spinlock_t ofl_lock ____cacheline_internodealigned_in_smp;
/* Synchronize offline with */
/* GP pre-initialization. */
};
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 237a79989aba..60197ea24ceb 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -502,7 +502,8 @@ static void synchronize_rcu_expedited_wait(void)
if (synchronize_rcu_expedited_wait_once(1))
return;
rcu_for_each_leaf_node(rnp) {
- for_each_leaf_node_cpu_mask(rnp, cpu, rnp->expmask) {
+ mask = READ_ONCE(rnp->expmask);
+ for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
if (rdp->rcu_forced_tick_exp)
continue;
@@ -656,7 +657,7 @@ static void rcu_exp_handler(void *unused)
*/
if (!depth) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
- rcu_dynticks_curr_cpu_in_eqs()) {
+ rcu_is_cpu_rrupt_from_idle()) {
rcu_report_exp_rdp(rdp);
} else {
WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index eeafb546a7a0..636d0546a4e9 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1169,7 +1169,7 @@ void __init rcu_init_nohz(void)
struct rcu_data *rdp;
#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+ if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask))
need_rcu_nocb_mask = true;
#endif /* #if defined(CONFIG_NO_HZ_FULL) */
@@ -1226,6 +1226,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
raw_spin_lock_init(&rdp->nocb_gp_lock);
timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
rcu_cblist_init(&rdp->nocb_bypass);
+ mutex_init(&rdp->nocb_gp_kthread_mutex);
}
/*
@@ -1238,6 +1239,7 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
struct rcu_data *rdp_gp;
struct task_struct *t;
+ struct sched_param sp;
if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup)
return;
@@ -1247,20 +1249,30 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
return;
/* If we didn't spawn the GP kthread first, reorganize! */
+ sp.sched_priority = kthread_prio;
rdp_gp = rdp->nocb_gp_rdp;
+ mutex_lock(&rdp_gp->nocb_gp_kthread_mutex);
if (!rdp_gp->nocb_gp_kthread) {
t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
"rcuog/%d", rdp_gp->cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__)) {
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
return;
+ }
WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
+ mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
/* Spawn the kthread for this CPU. */
t = kthread_run(rcu_nocb_cb_kthread, rdp,
"rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
return;
+
+ if (kthread_prio)
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
WRITE_ONCE(rdp->nocb_cb_kthread, t);
WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
}
@@ -1348,7 +1360,7 @@ static void __init rcu_organize_nocb_kthreads(void)
*/
void rcu_bind_current_to_nocb(void)
{
- if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+ if (cpumask_available(rcu_nocb_mask) && !cpumask_empty(rcu_nocb_mask))
WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
}
EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c5b45c2f68a1..8360d86db1c0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -330,7 +330,7 @@ void rcu_note_context_switch(bool preempt)
* then queue the task as required based on the states
* of any ongoing and expedited grace periods.
*/
- WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
+ WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
trace_rcu_preempt_task(rcu_state.name,
t->pid,
@@ -556,16 +556,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
- /* Unboost if we were boosted. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
-
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
rcu_report_exp_rnp(rnp, true);
+
+ /* Unboost if we were boosted. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
+ rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
} else {
local_irq_restore(flags);
}
@@ -773,7 +773,6 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
int cpu;
int i;
struct list_head *lhp;
- bool onl;
struct rcu_data *rdp;
struct rcu_node *rnp1;
@@ -797,9 +796,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
pr_cont("\n");
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- onl = !!(rdp->grpmask & rcu_rnp_online_cpus(rnp));
pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
- cpu, ".o"[onl],
+ cpu, ".o"[rcu_rdp_cpu_online(rdp)],
(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
}
@@ -996,12 +994,15 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
*/
static void rcu_cpu_kthread_setup(unsigned int cpu)
{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
#ifdef CONFIG_RCU_BOOST
struct sched_param sp;
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ WRITE_ONCE(rdp->rcuc_activity, jiffies);
}
#ifdef CONFIG_RCU_BOOST
@@ -1172,15 +1173,14 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
struct sched_param sp;
struct task_struct *t;
+ mutex_lock(&rnp->boost_kthread_mutex);
if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
- return;
-
- rcu_state.boost = 1;
+ goto out;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
"rcub/%d", rnp_index);
if (WARN_ON_ONCE(IS_ERR(t)))
- return;
+ goto out;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
@@ -1188,6 +1188,9 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
+
+ out:
+ mutex_unlock(&rnp->boost_kthread_mutex);
}
/*
@@ -1210,14 +1213,16 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
return;
if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
+ mutex_lock(&rnp->boost_kthread_mutex);
for_each_leaf_node_possible_cpu(rnp, cpu)
if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU));
- if (cpumask_weight(cm) == 0)
- cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU));
+ cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
+ if (cpumask_empty(cm))
+ cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
set_cpus_allowed_ptr(t, cm);
+ mutex_unlock(&rnp->boost_kthread_mutex);
free_cpumask_var(cm);
}
@@ -1291,7 +1296,7 @@ static void rcu_bind_gp_kthread(void)
{
if (!tick_nohz_full_enabled())
return;
- housekeeping_affine(current, HK_FLAG_RCU);
+ housekeeping_affine(current, HK_TYPE_RCU);
}
/* Record the current task on dyntick-idle entry. */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 21bebf7c9030..0c5d8516516a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -379,6 +379,15 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp)
return j > 2 * HZ;
}
+static bool rcu_is_rcuc_kthread_starving(struct rcu_data *rdp, unsigned long *jp)
+{
+ unsigned long j = jiffies - READ_ONCE(rdp->rcuc_activity);
+
+ if (jp)
+ *jp = j;
+ return j > 2 * HZ;
+}
+
/*
* Print out diagnostic information for the specified stalled CPU.
*
@@ -430,6 +439,29 @@ static void print_cpu_stall_info(int cpu)
falsepositive ? " (false positive?)" : "");
}
+static void rcuc_kthread_dump(struct rcu_data *rdp)
+{
+ int cpu;
+ unsigned long j;
+ struct task_struct *rcuc;
+
+ rcuc = rdp->rcu_cpu_kthread_task;
+ if (!rcuc)
+ return;
+
+ cpu = task_cpu(rcuc);
+ if (cpu_is_offline(cpu) || idle_cpu(cpu))
+ return;
+
+ if (!rcu_is_rcuc_kthread_starving(rdp, &j))
+ return;
+
+ pr_err("%s kthread starved for %ld jiffies\n", rcuc->comm, j);
+ sched_show_task(rcuc);
+ if (!trigger_single_cpu_backtrace(cpu))
+ dump_cpu_task(cpu);
+}
+
/* Complain about starvation of grace-period kthread. */
static void rcu_check_gp_kthread_starvation(void)
{
@@ -601,6 +633,9 @@ static void print_cpu_stall(unsigned long gps)
rcu_check_gp_kthread_expired_fqs_timer();
rcu_check_gp_kthread_starvation();
+ if (!use_softirq)
+ rcuc_kthread_dump(rdp);
+
rcu_dump_cpu_stacks();
raw_spin_lock_irqsave_rcu_node(rnp, flags);
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 156892c22bb5..180ff9c41fa8 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -407,6 +407,13 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
}
EXPORT_SYMBOL_GPL(__wait_rcu_gp);
+void finish_rcuwait(struct rcuwait *w)
+{
+ rcu_assign_pointer(w->task, NULL);
+ __set_current_state(TASK_RUNNING);
+}
+EXPORT_SYMBOL_GPL(finish_rcuwait);
+
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head)
{
diff --git a/kernel/resource.c b/kernel/resource.c
index 9c08d6e9eef2..34eaee179689 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -56,14 +56,6 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
-/*
- * For memory hotplug, there is no way to free resource entries allocated
- * by boot mem after the system is up. So for reusing the resource entry
- * we need to remember the resource.
- */
-static struct resource *bootmem_resource_free;
-static DEFINE_SPINLOCK(bootmem_resource_lock);
-
static struct resource *next_resource(struct resource *p)
{
if (p->child)
@@ -160,36 +152,19 @@ __initcall(ioresources_init);
static void free_resource(struct resource *res)
{
- if (!res)
- return;
-
- if (!PageSlab(virt_to_head_page(res))) {
- spin_lock(&bootmem_resource_lock);
- res->sibling = bootmem_resource_free;
- bootmem_resource_free = res;
- spin_unlock(&bootmem_resource_lock);
- } else {
+ /**
+ * If the resource was allocated using memblock early during boot
+ * we'll leak it here: we can only return full pages back to the
+ * buddy and trying to be smart and reusing them eventually in
+ * alloc_resource() overcomplicates resource handling.
+ */
+ if (res && PageSlab(virt_to_head_page(res)))
kfree(res);
- }
}
static struct resource *alloc_resource(gfp_t flags)
{
- struct resource *res = NULL;
-
- spin_lock(&bootmem_resource_lock);
- if (bootmem_resource_free) {
- res = bootmem_resource_free;
- bootmem_resource_free = res->sibling;
- }
- spin_unlock(&bootmem_resource_lock);
-
- if (res)
- memset(res, 0, sizeof(struct resource));
- else
- res = kzalloc(sizeof(struct resource), flags);
-
- return res;
+ return kzalloc(sizeof(struct resource), flags);
}
/* Return the conflict entry if you can't request it */
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 6d45ac3dae7f..97ac20b4f738 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -128,10 +128,10 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
int ret;
#ifdef CONFIG_64BIT
- if (get_user(ptr, &t->rseq->rseq_cs.ptr64))
+ if (get_user(ptr, &t->rseq->rseq_cs))
return -EFAULT;
#else
- if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
+ if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
return -EFAULT;
#endif
if (!ptr) {
@@ -217,9 +217,9 @@ static int clear_rseq_cs(struct task_struct *t)
* Set rseq_cs to NULL.
*/
#ifdef CONFIG_64BIT
- return put_user(0UL, &t->rseq->rseq_cs.ptr64);
+ return put_user(0UL, &t->rseq->rseq_cs);
#else
- if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
+ if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
return -EFAULT;
return 0;
#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index c83b37af155b..976092b7bd45 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -1,7 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-ifdef CONFIG_FUNCTION_TRACER
-CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
-endif
# The compilers are complaining about unused variables inside an if(0) scope
# block. This is daft, shut them up.
@@ -25,18 +22,13 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o
-
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
-obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
-obj-$(CONFIG_SCHED_DEBUG) += debug.o
-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
-obj-$(CONFIG_CPU_FREQ) += cpufreq.o
-obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
-obj-$(CONFIG_CPU_ISOLATION) += isolation.o
-obj-$(CONFIG_PSI) += psi.o
-obj-$(CONFIG_SCHED_CORE) += core_sched.o
+#
+# Build efficiency:
+#
+# These compilation units have roughly the same size and complexity - so their
+# build parallelizes well and finishes roughly at once:
+#
+obj-y += core.o
+obj-y += fair.o
+obj-y += build_policy.o
+obj-y += build_utility.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 8629b37d118e..16092b49ff6a 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,14 +1,35 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Auto-group scheduling implementation:
*/
-#include <linux/nospec.h>
-#include "sched.h"
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
+#ifdef CONFIG_SYSCTL
+static struct ctl_table sched_autogroup_sysctls[] = {
+ {
+ .procname = "sched_autogroup_enabled",
+ .data = &sysctl_sched_autogroup_enabled,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {}
+};
+
+static void __init sched_autogroup_sysctl_init(void)
+{
+ register_sysctl_init("kernel", sched_autogroup_sysctls);
+}
+#else
+#define sched_autogroup_sysctl_init() do { } while (0)
+#endif
+
void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
@@ -198,6 +219,7 @@ void sched_autogroup_exit(struct signal_struct *sig)
static int __init setup_autogroup(char *str)
{
sysctl_sched_autogroup_enabled = 0;
+ sched_autogroup_sysctl_init();
return 1;
}
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index b96419974a1f..90d69f2c5eaf 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,4 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_SCHED_AUTOGROUP_H
+#define _KERNEL_SCHED_AUTOGROUP_H
+
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
@@ -27,6 +30,7 @@ extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
+ extern unsigned int sysctl_sched_autogroup_enabled;
int enabled = READ_ONCE(sysctl_sched_autogroup_enabled);
if (enabled && task_wants_autogroup(p, tg))
@@ -58,3 +62,5 @@ static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
}
#endif /* CONFIG_SCHED_AUTOGROUP */
+
+#endif /* _KERNEL_SCHED_AUTOGROUP_H */
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
new file mode 100644
index 000000000000..e0104b45029a
--- /dev/null
+++ b/kernel/sched/build_policy.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are the scheduling policy related scheduler files, built
+ * in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c and fair.c, the other two big
+ * compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ *
+ * core.c and fair.c are built separately.
+ */
+
+/* Headers: */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/posix-timers.h>
+#include <linux/sched/rt.h>
+
+#include <linux/cpuidle.h>
+#include <linux/jiffies.h>
+#include <linux/livepatch.h>
+#include <linux/psi.h>
+#include <linux/seqlock_api.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+#include <linux/tsacct_kern.h>
+#include <linux/vtime.h>
+
+#include <uapi/linux/sched/types.h>
+
+#include "sched.h"
+
+#include "autogroup.h"
+#include "stats.h"
+#include "pelt.h"
+
+/* Source code modules: */
+
+#include "idle.c"
+
+#include "rt.c"
+
+#ifdef CONFIG_SMP
+# include "cpudeadline.c"
+# include "pelt.c"
+#endif
+
+#include "cputime.c"
+#include "deadline.c"
+
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
new file mode 100644
index 000000000000..eec0849b2aae
--- /dev/null
+++ b/kernel/sched/build_utility.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * These are various utility functions of the scheduler,
+ * built in a single compilation unit for build efficiency reasons.
+ *
+ * ( Incidentally, the size of the compilation unit is roughly
+ * comparable to core.c, fair.c, smp.c and policy.c, the other
+ * big compilation units. This helps balance build time, while
+ * coalescing source files to amortize header inclusion
+ * cost. )
+ */
+#include <linux/sched/clock.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/task_stack.h>
+
+#include <linux/cpufreq.h>
+#include <linux/cpumask_api.h>
+#include <linux/cpuset.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/energy_model.h>
+#include <linux/hashtable_api.h>
+#include <linux/irq.h>
+#include <linux/kobject_api.h>
+#include <linux/membarrier.h>
+#include <linux/mempolicy.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/proc_fs.h>
+#include <linux/psi.h>
+#include <linux/psi.h>
+#include <linux/ptrace_api.h>
+#include <linux/sched_clock.h>
+#include <linux/security.h>
+#include <linux/spinlock_api.h>
+#include <linux/swait_api.h>
+#include <linux/timex.h>
+#include <linux/utsname.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#include <uapi/linux/prctl.h>
+#include <uapi/linux/sched/types.h>
+
+#include <asm/switch_to.h>
+
+#include "sched.h"
+#include "sched-pelt.h"
+#include "stats.h"
+#include "autogroup.h"
+
+#include "clock.c"
+
+#ifdef CONFIG_CGROUP_CPUACCT
+# include "cpuacct.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ
+# include "cpufreq.c"
+#endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+# include "cpufreq_schedutil.c"
+#endif
+
+#ifdef CONFIG_SCHED_DEBUG
+# include "debug.c"
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+# include "stats.c"
+#endif
+
+#include "loadavg.c"
+#include "completion.c"
+#include "swait.c"
+#include "wait_bit.c"
+#include "wait.c"
+
+#ifdef CONFIG_SMP
+# include "cpupri.c"
+# include "stop_task.c"
+# include "topology.c"
+#endif
+
+#ifdef CONFIG_SCHED_CORE
+# include "core_sched.c"
+#endif
+
+#ifdef CONFIG_PSI
+# include "psi.c"
+#endif
+
+#ifdef CONFIG_MEMBARRIER
+# include "membarrier.c"
+#endif
+
+#ifdef CONFIG_CPU_ISOLATION
+# include "isolation.c"
+#endif
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+# include "autogroup.c"
+#endif
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c2b2859ddd82..d9272d9061a3 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -53,15 +53,13 @@
* that is otherwise invisible (TSC gets stopped).
*
*/
-#include "sched.h"
-#include <linux/sched_clock.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __weak sched_clock(void)
+notrace unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
@@ -95,28 +93,28 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-static inline struct sched_clock_data *this_scd(void)
+notrace static inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
-static inline struct sched_clock_data *cpu_sdc(int cpu)
+notrace static inline struct sched_clock_data *cpu_sdc(int cpu)
{
return &per_cpu(sched_clock_data, cpu);
}
-int sched_clock_stable(void)
+notrace int sched_clock_stable(void)
{
return static_branch_likely(&__sched_clock_stable);
}
-static void __scd_stamp(struct sched_clock_data *scd)
+notrace static void __scd_stamp(struct sched_clock_data *scd)
{
scd->tick_gtod = ktime_get_ns();
scd->tick_raw = sched_clock();
}
-static void __set_sched_clock_stable(void)
+notrace static void __set_sched_clock_stable(void)
{
struct sched_clock_data *scd;
@@ -151,7 +149,7 @@ static void __set_sched_clock_stable(void)
* The only way to fully avoid random clock jumps is to boot with:
* "tsc=unstable".
*/
-static void __sched_clock_work(struct work_struct *work)
+notrace static void __sched_clock_work(struct work_struct *work)
{
struct sched_clock_data *scd;
int cpu;
@@ -177,7 +175,7 @@ static void __sched_clock_work(struct work_struct *work)
static DECLARE_WORK(sched_clock_work, __sched_clock_work);
-static void __clear_sched_clock_stable(void)
+notrace static void __clear_sched_clock_stable(void)
{
if (!sched_clock_stable())
return;
@@ -186,7 +184,7 @@ static void __clear_sched_clock_stable(void)
schedule_work(&sched_clock_work);
}
-void clear_sched_clock_stable(void)
+notrace void clear_sched_clock_stable(void)
{
__sched_clock_stable_early = 0;
@@ -196,7 +194,7 @@ void clear_sched_clock_stable(void)
__clear_sched_clock_stable();
}
-static void __sched_clock_gtod_offset(void)
+notrace static void __sched_clock_gtod_offset(void)
{
struct sched_clock_data *scd = this_scd();
@@ -246,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-static inline u64 wrap_min(u64 x, u64 y)
+notrace static inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-static inline u64 wrap_max(u64 x, u64 y)
+notrace static inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -262,7 +260,7 @@ static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-static u64 sched_clock_local(struct sched_clock_data *scd)
+notrace static u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
@@ -295,7 +293,7 @@ again:
return clock;
}
-static u64 sched_clock_remote(struct sched_clock_data *scd)
+notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
@@ -362,7 +360,7 @@ again:
*
* See cpu_clock().
*/
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd;
u64 clock;
@@ -386,7 +384,7 @@ u64 sched_clock_cpu(int cpu)
}
EXPORT_SYMBOL_GPL(sched_clock_cpu);
-void sched_clock_tick(void)
+notrace void sched_clock_tick(void)
{
struct sched_clock_data *scd;
@@ -403,7 +401,7 @@ void sched_clock_tick(void)
sched_clock_local(scd);
}
-void sched_clock_tick_stable(void)
+notrace void sched_clock_tick_stable(void)
{
if (!sched_clock_stable())
return;
@@ -423,7 +421,7 @@ void sched_clock_tick_stable(void)
/*
* We are going deep-idle (irqs are disabled):
*/
-void sched_clock_idle_sleep_event(void)
+notrace void sched_clock_idle_sleep_event(void)
{
sched_clock_cpu(smp_processor_id());
}
@@ -432,7 +430,7 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
/*
* We just idled; resync with ktime.
*/
-void sched_clock_idle_wakeup_event(void)
+notrace void sched_clock_idle_wakeup_event(void)
{
unsigned long flags;
@@ -458,7 +456,7 @@ void __init sched_clock_init(void)
local_irq_enable();
}
-u64 sched_clock_cpu(int cpu)
+notrace u64 sched_clock_cpu(int cpu)
{
if (!static_branch_likely(&sched_clock_running))
return 0;
@@ -476,7 +474,7 @@ u64 sched_clock_cpu(int cpu)
* On bare metal this function should return the same as local_clock.
* Architectures and sub-architectures can override this.
*/
-u64 __weak running_clock(void)
+notrace u64 __weak running_clock(void)
{
return local_clock();
}
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a778554f9dad..35f15c26ed54 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* Generic wait-for-completion handler;
*
@@ -11,7 +12,6 @@
* typically be used for exclusion which gives rise to priority inversion.
* Waiting for completion is a typically sync point, but not an exclusion point.
*/
-#include "sched.h"
/**
* complete: - signals a single thread waiting on this completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e4ae00e52d1..d575b4914925 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6,26 +6,90 @@
*
* Copyright (C) 1991-2002 Linus Torvalds
*/
-#define CREATE_TRACE_POINTS
-#include <trace/events/sched.h>
-#undef CREATE_TRACE_POINTS
-
-#include "sched.h"
+#include <linux/highmem.h>
+#include <linux/hrtimer_api.h>
+#include <linux/ktime_api.h>
+#include <linux/sched/signal.h>
+#include <linux/syscalls_api.h>
+#include <linux/debug_locks.h>
+#include <linux/prefetch.h>
+#include <linux/capability.h>
+#include <linux/pgtable_api.h>
+#include <linux/wait_bit.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/hardirq.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/debug.h>
+#include <linux/sched/isolation.h>
+#include <linux/sched/loadavg.h>
+#include <linux/sched/mm.h>
+#include <linux/sched/nohz.h>
+#include <linux/sched/rseq_api.h>
+#include <linux/sched/rt.h>
-#include <linux/nospec.h>
#include <linux/blkdev.h>
+#include <linux/context_tracking.h>
+#include <linux/cpuset.h>
+#include <linux/delayacct.h>
+#include <linux/init_task.h>
+#include <linux/interrupt.h>
+#include <linux/ioprio.h>
+#include <linux/kallsyms.h>
#include <linux/kcov.h>
+#include <linux/kprobes.h>
+#include <linux/llist_api.h>
+#include <linux/mmu_context.h>
+#include <linux/mmzone.h>
+#include <linux/mutex_api.h>
+#include <linux/nmi.h>
+#include <linux/nospec.h>
+#include <linux/perf_event_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/rcuwait_api.h>
+#include <linux/sched/wake_q.h>
#include <linux/scs.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/vtime.h>
+#include <linux/wait_api.h>
+#include <linux/workqueue_api.h>
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+# ifdef CONFIG_GENERIC_ENTRY
+# include <linux/entry-common.h>
+# endif
+#endif
+
+#include <uapi/linux/sched/types.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
-#include "../workqueue_internal.h"
-#include "../../fs/io-wq.h"
-#include "../smpboot.h"
+#define CREATE_TRACE_POINTS
+#include <linux/sched/rseq_api.h>
+#include <trace/events/sched.h>
+#undef CREATE_TRACE_POINTS
+#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
+
+#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
+#include "stats.h"
+
+#include "../workqueue_internal.h"
+#include "../../fs/io-wq.h"
+#include "../smpboot.h"
/*
* Export tracepoints that act as a bare tracehook (ie: have no trace event
@@ -36,6 +100,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -1024,13 +1089,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+ hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -1046,7 +1111,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
cpu = default_cpu;
unlock:
rcu_read_unlock();
@@ -4279,7 +4344,9 @@ DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
#ifdef CONFIG_NUMA_BALANCING
-void set_numabalancing_state(bool enabled)
+int sysctl_numa_balancing_mode;
+
+static void __set_numabalancing_state(bool enabled)
{
if (enabled)
static_branch_enable(&sched_numa_balancing);
@@ -4287,13 +4354,22 @@ void set_numabalancing_state(bool enabled)
static_branch_disable(&sched_numa_balancing);
}
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sysctl_numa_balancing_mode = NUMA_BALANCING_NORMAL;
+ else
+ sysctl_numa_balancing_mode = NUMA_BALANCING_DISABLED;
+ __set_numabalancing_state(enabled);
+}
+
#ifdef CONFIG_PROC_SYSCTL
int sysctl_numa_balancing(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
- int state = static_branch_likely(&sched_numa_balancing);
+ int state = sysctl_numa_balancing_mode;
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4303,8 +4379,10 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
- if (write)
- set_numabalancing_state(state);
+ if (write) {
+ sysctl_numa_balancing_mode = state;
+ __set_numabalancing_state(state);
+ }
return err;
}
#endif
@@ -4424,6 +4502,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
init_entity_runnable_average(&p->se);
+
#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -4439,18 +4518,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
-void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
-#ifdef CONFIG_CGROUP_SCHED
- struct task_group *tg;
-#endif
+ /*
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_CGROUP_SCHED
- tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
- struct task_group, css);
- p->sched_task_group = autogroup_task_group(p, tg);
+ if (1) {
+ struct task_group *tg;
+ tg = container_of(kargs->cset->subsys[cpu_cgrp_id],
+ struct task_group, css);
+ tg = autogroup_task_group(p, tg);
+ p->sched_task_group = tg;
+ }
#endif
rseq_migrate(p);
/*
@@ -4461,7 +4545,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+void sched_post_fork(struct task_struct *p)
+{
uclamp_post_fork(p);
}
@@ -4825,7 +4912,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
- long prev_state;
+ unsigned int prev_state;
/*
* The previous task will have left us with a preempt_count of 2
@@ -5370,7 +5457,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5391,7 +5478,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_TICK))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5822,8 +5909,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
if (schedstat_enabled() && rq->core->core_forceidle_count) {
- if (cookie)
- rq->core->core_forceidle_start = rq_clock(rq->core);
+ rq->core->core_forceidle_start = rq_clock(rq->core);
rq->core->core_forceidle_occupation = occ;
}
@@ -6290,7 +6376,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -6345,8 +6431,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* If we are going to sleep and we have plugged IO queued,
* make sure to submit it to avoid deadlocks.
*/
- if (blk_needs_flush_plug(tsk))
- blk_flush_plug(tsk->plug, true);
+ blk_flush_plug(tsk->plug, true);
}
static void sched_update_worker(struct task_struct *tsk)
@@ -6483,17 +6568,31 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
*/
if (likely(!preemptible()))
return;
-
preempt_schedule_common();
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_dynamic_enabled
+#define preempt_schedule_dynamic_enabled preempt_schedule
+#define preempt_schedule_dynamic_disabled NULL
+#endif
+DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
+void __sched notrace dynamic_preempt_schedule(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule))
+ return;
+ preempt_schedule();
+}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule);
+EXPORT_SYMBOL(dynamic_preempt_schedule);
+#endif
#endif
-
/**
* preempt_schedule_notrace - preempt_schedule called by tracing
@@ -6548,147 +6647,27 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#ifdef CONFIG_PREEMPT_DYNAMIC
-DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
-EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#ifndef preempt_schedule_notrace_dynamic_enabled
+#define preempt_schedule_notrace_dynamic_enabled preempt_schedule_notrace
+#define preempt_schedule_notrace_dynamic_disabled NULL
#endif
-
-#endif /* CONFIG_PREEMPTION */
-
-#ifdef CONFIG_PREEMPT_DYNAMIC
-
-#include <linux/entry-common.h>
-
-/*
- * SC:cond_resched
- * SC:might_resched
- * SC:preempt_schedule
- * SC:preempt_schedule_notrace
- * SC:irqentry_exit_cond_resched
- *
- *
- * NONE:
- * cond_resched <- __cond_resched
- * might_resched <- RET0
- * preempt_schedule <- NOP
- * preempt_schedule_notrace <- NOP
- * irqentry_exit_cond_resched <- NOP
- *
- * VOLUNTARY:
- * cond_resched <- __cond_resched
- * might_resched <- __cond_resched
- * preempt_schedule <- NOP
- * preempt_schedule_notrace <- NOP
- * irqentry_exit_cond_resched <- NOP
- *
- * FULL:
- * cond_resched <- RET0
- * might_resched <- RET0
- * preempt_schedule <- preempt_schedule
- * preempt_schedule_notrace <- preempt_schedule_notrace
- * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
- */
-
-enum {
- preempt_dynamic_undefined = -1,
- preempt_dynamic_none,
- preempt_dynamic_voluntary,
- preempt_dynamic_full,
-};
-
-int preempt_dynamic_mode = preempt_dynamic_undefined;
-
-int sched_dynamic_mode(const char *str)
-{
- if (!strcmp(str, "none"))
- return preempt_dynamic_none;
-
- if (!strcmp(str, "voluntary"))
- return preempt_dynamic_voluntary;
-
- if (!strcmp(str, "full"))
- return preempt_dynamic_full;
-
- return -EINVAL;
-}
-
-void sched_dynamic_update(int mode)
-{
- /*
- * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
- * the ZERO state, which is invalid.
- */
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, __preempt_schedule_func);
- static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
- static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
-
- switch (mode) {
- case preempt_dynamic_none:
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, (void *)&__static_call_return0);
- static_call_update(preempt_schedule, NULL);
- static_call_update(preempt_schedule_notrace, NULL);
- static_call_update(irqentry_exit_cond_resched, NULL);
- pr_info("Dynamic Preempt: none\n");
- break;
-
- case preempt_dynamic_voluntary:
- static_call_update(cond_resched, __cond_resched);
- static_call_update(might_resched, __cond_resched);
- static_call_update(preempt_schedule, NULL);
- static_call_update(preempt_schedule_notrace, NULL);
- static_call_update(irqentry_exit_cond_resched, NULL);
- pr_info("Dynamic Preempt: voluntary\n");
- break;
-
- case preempt_dynamic_full:
- static_call_update(cond_resched, (void *)&__static_call_return0);
- static_call_update(might_resched, (void *)&__static_call_return0);
- static_call_update(preempt_schedule, __preempt_schedule_func);
- static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func);
- static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: full\n");
- break;
- }
-
- preempt_dynamic_mode = mode;
-}
-
-static int __init setup_preempt_mode(char *str)
-{
- int mode = sched_dynamic_mode(str);
- if (mode < 0) {
- pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
- return 0;
- }
-
- sched_dynamic_update(mode);
- return 1;
-}
-__setup("preempt=", setup_preempt_mode);
-
-static void __init preempt_dynamic_init(void)
+DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
+EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
+void __sched notrace dynamic_preempt_schedule_notrace(void)
{
- if (preempt_dynamic_mode == preempt_dynamic_undefined) {
- if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
- sched_dynamic_update(preempt_dynamic_none);
- } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
- sched_dynamic_update(preempt_dynamic_voluntary);
- } else {
- /* Default static call setting, nothing to do */
- WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
- preempt_dynamic_mode = preempt_dynamic_full;
- pr_info("Dynamic Preempt: full\n");
- }
- }
+ if (!static_branch_unlikely(&sk_dynamic_preempt_schedule_notrace))
+ return;
+ preempt_schedule_notrace();
}
+NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace);
+EXPORT_SYMBOL(dynamic_preempt_schedule_notrace);
+#endif
+#endif
-#else /* !CONFIG_PREEMPT_DYNAMIC */
-
-static inline void preempt_dynamic_init(void) { }
-
-#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+#endif /* CONFIG_PREEMPTION */
/*
* This is the entry point to schedule() from kernel preemption
@@ -8195,11 +8174,35 @@ EXPORT_SYMBOL(__cond_resched);
#endif
#ifdef CONFIG_PREEMPT_DYNAMIC
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define cond_resched_dynamic_enabled __cond_resched
+#define cond_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);
+#define might_resched_dynamic_enabled __cond_resched
+#define might_resched_dynamic_disabled ((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
+int __sched dynamic_cond_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_cond_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_cond_resched);
+
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
+int __sched dynamic_might_resched(void)
+{
+ if (!static_branch_unlikely(&sk_dynamic_might_resched))
+ return 0;
+ return __cond_resched();
+}
+EXPORT_SYMBOL(dynamic_might_resched);
+#endif
#endif
/*
@@ -8219,9 +8222,7 @@ int __cond_resched_lock(spinlock_t *lock)
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
spin_lock(lock);
@@ -8239,9 +8240,7 @@ int __cond_resched_rwlock_read(rwlock_t *lock)
if (rwlock_needbreak(lock) || resched) {
read_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
read_lock(lock);
@@ -8259,9 +8258,7 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
if (rwlock_needbreak(lock) || resched) {
write_unlock(lock);
- if (resched)
- preempt_schedule_common();
- else
+ if (!_cond_resched())
cpu_relax();
ret = 1;
write_lock(lock);
@@ -8270,6 +8267,154 @@ int __cond_resched_rwlock_write(rwlock_t *lock)
}
EXPORT_SYMBOL(__cond_resched_rwlock_write);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+#ifdef CONFIG_GENERIC_ENTRY
+#include <linux/entry-common.h>
+#endif
+
+/*
+ * SC:cond_resched
+ * SC:might_resched
+ * SC:preempt_schedule
+ * SC:preempt_schedule_notrace
+ * SC:irqentry_exit_cond_resched
+ *
+ *
+ * NONE:
+ * cond_resched <- __cond_resched
+ * might_resched <- RET0
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * VOLUNTARY:
+ * cond_resched <- __cond_resched
+ * might_resched <- __cond_resched
+ * preempt_schedule <- NOP
+ * preempt_schedule_notrace <- NOP
+ * irqentry_exit_cond_resched <- NOP
+ *
+ * FULL:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ */
+
+enum {
+ preempt_dynamic_undefined = -1,
+ preempt_dynamic_none,
+ preempt_dynamic_voluntary,
+ preempt_dynamic_full,
+};
+
+int preempt_dynamic_mode = preempt_dynamic_undefined;
+
+int sched_dynamic_mode(const char *str)
+{
+ if (!strcmp(str, "none"))
+ return preempt_dynamic_none;
+
+ if (!strcmp(str, "voluntary"))
+ return preempt_dynamic_voluntary;
+
+ if (!strcmp(str, "full"))
+ return preempt_dynamic_full;
+
+ return -EINVAL;
+}
+
+#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
+#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
+#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
+#else
+#error "Unsupported PREEMPT_DYNAMIC mechanism"
+#endif
+
+void sched_dynamic_update(int mode)
+{
+ /*
+ * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
+ * the ZERO state, which is invalid.
+ */
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+
+ switch (mode) {
+ case preempt_dynamic_none:
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: none\n");
+ break;
+
+ case preempt_dynamic_voluntary:
+ preempt_dynamic_enable(cond_resched);
+ preempt_dynamic_enable(might_resched);
+ preempt_dynamic_disable(preempt_schedule);
+ preempt_dynamic_disable(preempt_schedule_notrace);
+ preempt_dynamic_disable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: voluntary\n");
+ break;
+
+ case preempt_dynamic_full:
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ pr_info("Dynamic Preempt: full\n");
+ break;
+ }
+
+ preempt_dynamic_mode = mode;
+}
+
+static int __init setup_preempt_mode(char *str)
+{
+ int mode = sched_dynamic_mode(str);
+ if (mode < 0) {
+ pr_warn("Dynamic Preempt: unsupported mode: %s\n", str);
+ return 0;
+ }
+
+ sched_dynamic_update(mode);
+ return 1;
+}
+__setup("preempt=", setup_preempt_mode);
+
+static void __init preempt_dynamic_init(void)
+{
+ if (preempt_dynamic_mode == preempt_dynamic_undefined) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
+ sched_dynamic_update(preempt_dynamic_none);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
+ sched_dynamic_update(preempt_dynamic_voluntary);
+ } else {
+ /* Default static call setting, nothing to do */
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
+ preempt_dynamic_mode = preempt_dynamic_full;
+ pr_info("Dynamic Preempt: full\n");
+ }
+ }
+}
+
+#else /* !CONFIG_PREEMPT_DYNAMIC */
+
+static inline void preempt_dynamic_init(void) { }
+
+#endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */
+
/**
* yield - yield the current processor to other threads.
*
@@ -8378,9 +8523,7 @@ int io_schedule_prepare(void)
int old_iowait = current->in_iowait;
current->in_iowait = 1;
- if (current->plug)
- blk_flush_plug(current->plug, true);
-
+ blk_flush_plug(current->plug, true);
return old_iowait;
}
@@ -8707,7 +8850,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
{
int ret = 1;
- if (!cpumask_weight(cur))
+ if (cpumask_empty(cur))
return ret;
ret = dl_cpuset_cpumask_can_shrink(cur, trial);
@@ -8735,8 +8878,11 @@ int task_can_attach(struct task_struct *p,
}
if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_cpus_allowed))
- ret = dl_task_can_attach(p, cs_cpus_allowed);
+ cs_cpus_allowed)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+
+ ret = dl_cpu_busy(cpu, p);
+ }
out:
return ret;
@@ -9020,8 +9166,10 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- if (dl_cpu_busy(cpu))
- return -EBUSY;
+ int ret = dl_cpu_busy(cpu, NULL);
+
+ if (ret)
+ return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
@@ -9051,6 +9199,7 @@ int sched_cpu_activate(unsigned int cpu)
set_cpu_active(cpu, true);
if (sched_smp_initialized) {
+ sched_update_numa(cpu, true);
sched_domains_numa_masks_set(cpu);
cpuset_cpu_active();
}
@@ -9129,10 +9278,12 @@ int sched_cpu_deactivate(unsigned int cpu)
if (!sched_smp_initialized)
return 0;
+ sched_update_numa(cpu, false);
ret = cpuset_cpu_inactive(cpu);
if (ret) {
balance_push_set(cpu, false);
set_cpu_active(cpu, true);
+ sched_update_numa(cpu, true);
return ret;
}
sched_domains_numa_masks_clear(cpu);
@@ -9235,7 +9386,7 @@ int sched_cpu_dying(unsigned int cpu)
void __init sched_init_smp(void)
{
- sched_init_numa();
+ sched_init_numa(NUMA_NO_NODE);
/*
* There's no userspace yet to cause hotplug operations; hence all the
@@ -9247,7 +9398,7 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
BUG();
current->flags &= ~PF_NO_SETAFFINITY;
sched_init_granularity();
@@ -9347,7 +9498,6 @@ void __init sched_init(void)
#endif /* CONFIG_CPUMASK_OFFSTACK */
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#ifdef CONFIG_SMP
init_defrootdomain();
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 1fb45672ec85..38a2cec21014 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -1,8 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/prctl.h>
-#include "sched.h"
-
/*
* A simple wrapper around refcount. An allocated sched_core_cookie's
* address is used to compute the cookie of the task.
@@ -277,7 +274,7 @@ void __sched_core_account_forceidle(struct rq *rq)
rq_i = cpu_rq(i);
p = rq_i->core_pick ?: rq_i->curr;
- if (!p->core_cookie)
+ if (p == rq_i->idle)
continue;
__schedstat_add(p->stats.core_forceidle_sum, delta);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 3d06c5e4220d..0de9dda09949 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
+
/*
* CPU accounting code for task groups.
*
* Based on the work by Paul Menage (menage@google.com) and Balbir Singh
* (balbir@in.ibm.com).
*/
-#include <asm/irq_regs.h>
-#include "sched.h"
/* Time spent by the tasks of the CPU accounting group executing in ... */
enum cpuacct_stat_index {
@@ -334,14 +333,13 @@ static struct cftype files[] = {
*/
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
+ unsigned int cpu = task_cpu(tsk);
struct cpuacct *ca;
- rcu_read_lock();
+ lockdep_assert_rq_held(cpu_rq(cpu));
for (ca = task_ca(tsk); ca; ca = parent_ca(ca))
- __this_cpu_add(*ca->cpuusage, cputime);
-
- rcu_read_unlock();
+ *per_cpu_ptr(ca->cpuusage, cpu) += cputime;
}
/*
@@ -353,10 +351,8 @@ void cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
{
struct cpuacct *ca;
- rcu_read_lock();
for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca))
__this_cpu_add(ca->cpustat->cpustat[index], val);
- rcu_read_unlock();
}
struct cgroup_subsys cpuacct_cgrp_subsys = {
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ceb03d76c0cc..02d970a879ed 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -1,12 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * kernel/sched/cpudl.c
+ * kernel/sched/cpudeadline.c
*
* Global CPU deadline management
*
* Author: Juri Lelli <j.lelli@sssup.it>
*/
-#include "sched.h"
static inline int parent(int i)
{
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 7c2fe50fd76d..5252fb191fae 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -5,9 +5,6 @@
* Copyright (C) 2016, Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#include <linux/cpufreq.h>
-
-#include "sched.h"
DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 26778884d9ab..3dbf351d12d5 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -6,13 +6,6 @@
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include "sched.h"
-
-#include <linux/sched/cpufreq.h>
-#include <trace/events/power.h>
-
#define IOWAIT_BOOST_MIN (SCHED_CAPACITY_SCALE / 8)
struct sugov_tunables {
@@ -289,6 +282,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* into the same scale so we can compare.
*/
boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
}
@@ -348,8 +342,11 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
+ *
+ * Except when the rq is capped by uclamp_max.
*/
- if (sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
+ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+ sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
/* Restore cached freq as next_freq has changed */
@@ -395,8 +392,11 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
/*
* Do not reduce the target performance level if the CPU has not been
* idle recently, as the reduction is likely to be premature then.
+ *
+ * Except when the rq is capped by uclamp_max.
*/
- if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+ if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
+ sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
@@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov);
static void sugov_tunables_free(struct kobject *kobj)
{
- struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
+ struct gov_attr_set *attr_set = to_gov_attr_set(kobj);
kfree(to_sugov_tunables(attr_set));
}
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d583f2aa744e..fa9ce9d83683 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -22,7 +22,6 @@
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
-#include "sched.h"
/*
* p->rt_priority p->prio newpri cpupri
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b7ec42732b28..78a233d43757 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -2,7 +2,6 @@
/*
* Simple CPU accounting cgroup controller
*/
-#include "sched.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d2c072b0ef01..fb4255ae0b2c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -15,10 +15,6 @@
* Michael Trimarchi <michael@amarulasolutions.com>,
* Fabio Checconi <fchecconi@gmail.com>
*/
-#include "sched.h"
-#include "pelt.h"
-
-struct dl_bandwidth def_dl_bandwidth;
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
@@ -130,6 +126,21 @@ static inline bool dl_bw_visited(int cpu, u64 gen)
rd->visit_gen = gen;
return false;
}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+ int i;
+
+ RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+ "sched RCU must be held");
+ for_each_cpu_and(i, rd->span, cpu_active_mask) {
+ struct rq *rq = cpu_rq(i);
+
+ rq->dl.extra_bw += bw;
+ }
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -150,9 +161,38 @@ static inline bool dl_bw_visited(int cpu, u64 gen)
{
return false;
}
+
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+ struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+ dl->extra_bw += bw;
+}
#endif
static inline
+void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw -= tsk_bw;
+ __dl_update(dl_b, (s32)tsk_bw / cpus);
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
+{
+ dl_b->total_bw += tsk_bw;
+ __dl_update(dl_b, -((s32)tsk_bw / cpus));
+}
+
+static inline bool
+__dl_overflow(struct dl_bw *dl_b, unsigned long cap, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
+}
+
+static inline
void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
{
u64 old = dl_rq->running_bw;
@@ -408,7 +448,7 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
- return dl_rq->root.rb_leftmost == &dl_se->rb_node;
+ return rb_first_cached(&dl_rq->root) == &dl_se->rb_node;
}
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
@@ -423,12 +463,10 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
- raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
if (global_rt_runtime() == RUNTIME_INF)
dl_b->bw = -1;
else
dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
- raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
dl_b->total_bw = 0;
}
@@ -683,15 +721,6 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
}
-static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_dl_task(struct rq *rq)
-{
-}
-
static inline void deadline_queue_push_tasks(struct rq *rq)
{
}
@@ -1393,6 +1422,9 @@ void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
timer->function = inactive_task_timer;
}
+#define __node_2_dle(node) \
+ rb_entry((node), struct sched_dl_entity, rb_node)
+
#ifdef CONFIG_SMP
static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
@@ -1422,10 +1454,9 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
cpudl_clear(&rq->rd->cpudl, rq->cpu);
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
- struct rb_node *leftmost = dl_rq->root.rb_leftmost;
- struct sched_dl_entity *entry;
+ struct rb_node *leftmost = rb_first_cached(&dl_rq->root);
+ struct sched_dl_entity *entry = __node_2_dle(leftmost);
- entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
dl_rq->earliest_dl.curr = entry->deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
}
@@ -1466,9 +1497,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
dec_dl_migration(dl_se, dl_rq);
}
-#define __node_2_dle(node) \
- rb_entry((node), struct sched_dl_entity, rb_node)
-
static inline bool __dl_less(struct rb_node *a, const struct rb_node *b)
{
return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline);
@@ -1931,15 +1959,14 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
deadline_queue_push_tasks(rq);
}
-static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
- struct dl_rq *dl_rq)
+static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq)
{
struct rb_node *left = rb_first_cached(&dl_rq->root);
if (!left)
return NULL;
- return rb_entry(left, struct sched_dl_entity, rb_node);
+ return __node_2_dle(left);
}
static struct task_struct *pick_task_dl(struct rq *rq)
@@ -1951,7 +1978,7 @@ static struct task_struct *pick_task_dl(struct rq *rq)
if (!sched_dl_runnable(rq))
return NULL;
- dl_se = pick_next_dl_entity(rq, dl_rq);
+ dl_se = pick_next_dl_entity(dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
@@ -2034,15 +2061,17 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
*/
static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu)
{
- struct rb_node *next_node = rq->dl.pushable_dl_tasks_root.rb_leftmost;
struct task_struct *p = NULL;
+ struct rb_node *next_node;
if (!has_pushable_dl_tasks(rq))
return NULL;
+ next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
+
next_node:
if (next_node) {
- p = rb_entry(next_node, struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(next_node);
if (pick_dl_task(rq, p, cpu))
return p;
@@ -2208,8 +2237,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
if (!has_pushable_dl_tasks(rq))
return NULL;
- p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost,
- struct task_struct, pushable_dl_tasks);
+ p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root));
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
@@ -2240,12 +2268,6 @@ static int push_dl_task(struct rq *rq)
return 0;
retry:
- if (is_migration_disabled(next_task))
- return 0;
-
- if (WARN_ON(next_task == rq->curr))
- return 0;
-
/*
* If next_task preempts rq->curr, and rq->curr
* can move away, it makes sense to just reschedule
@@ -2258,6 +2280,12 @@ retry:
return 0;
}
+ if (is_migration_disabled(next_task))
+ return 0;
+
+ if (WARN_ON(next_task == rq->curr))
+ return 0;
+
/* We might release rq lock */
get_task_struct(next_task);
@@ -2731,9 +2759,6 @@ void sched_dl_do_global(void)
int cpu;
unsigned long flags;
- def_dl_bandwidth.dl_period = global_rt_period();
- def_dl_bandwidth.dl_runtime = global_rt_runtime();
-
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
@@ -2955,41 +2980,6 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
}
#ifdef CONFIG_SMP
-int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
-{
- unsigned long flags, cap;
- unsigned int dest_cpu;
- struct dl_bw *dl_b;
- bool overflow;
- int ret;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
-
- rcu_read_lock_sched();
- dl_b = dl_bw_of(dest_cpu);
- raw_spin_lock_irqsave(&dl_b->lock, flags);
- cap = dl_bw_capacity(dest_cpu);
- overflow = __dl_overflow(dl_b, cap, 0, p->dl.dl_bw);
- if (overflow) {
- ret = -EBUSY;
- } else {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- int cpus = dl_bw_cpus(dest_cpu);
-
- __dl_add(dl_b, p->dl.dl_bw, cpus);
- ret = 0;
- }
- raw_spin_unlock_irqrestore(&dl_b->lock, flags);
- rcu_read_unlock_sched();
-
- return ret;
-}
-
int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
const struct cpumask *trial)
{
@@ -3011,7 +3001,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-bool dl_cpu_busy(unsigned int cpu)
+int dl_cpu_busy(int cpu, struct task_struct *p)
{
unsigned long flags, cap;
struct dl_bw *dl_b;
@@ -3021,11 +3011,22 @@ bool dl_cpu_busy(unsigned int cpu)
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
cap = dl_bw_capacity(cpu);
- overflow = __dl_overflow(dl_b, cap, 0, 0);
+ overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
+
+ if (!overflow && p) {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+ }
+
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
- return overflow;
+ return overflow ? -EBUSY : 0;
}
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aa29211de1bf..bb3d63bdf4ae 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -6,7 +6,6 @@
*
* Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
*/
-#include "sched.h"
/*
* This allows printing both to /proc/sched_debug and
@@ -931,25 +930,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
- struct mempolicy *pol;
-
if (p->mm)
P(mm->numa_scan_seq);
- task_lock(p);
- pol = p->mempolicy;
- if (pol && !(pol->flags & MPOL_F_MORON))
- pol = NULL;
- mpol_get(pol);
- task_unlock(p);
-
P(numa_pages_migrated);
P(numa_preferred_nid);
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
- mpol_put(pol);
#endif
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 095b0aa378df..d4bd299d67ab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,7 +20,39 @@
* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
+#include <linux/energy_model.h>
+#include <linux/mmap_lock.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/jiffies.h>
+#include <linux/mm_api.h>
+#include <linux/highmem.h>
+#include <linux/spinlock_api.h>
+#include <linux/cpumask_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/softirq.h>
+#include <linux/refcount_api.h>
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/isolation.h>
+
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/mutex_api.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/ratelimit.h>
+#include <linux/task_work.h>
+
+#include <asm/switch_to.h>
+
+#include <linux/sched/cond_resched.h>
+
#include "sched.h"
+#include "stats.h"
+#include "autogroup.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -1259,10 +1291,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng)
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
- int maxdist, bool task)
+ int lim_dist, bool task)
{
unsigned long score = 0;
- int node;
+ int node, max_dist;
/*
* All nodes are directly connected, and the same distance
@@ -1271,6 +1303,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
if (sched_numa_topology_type == NUMA_DIRECT)
return 0;
+ /* sched_max_numa_distance may be changed in parallel. */
+ max_dist = READ_ONCE(sched_max_numa_distance);
/*
* This code is called for each node, introducing N^2 complexity,
* which should be ok given the number of nodes rarely exceeds 8.
@@ -1283,7 +1317,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* The furthest away nodes in the system are not interesting
* for placement; nid was already counted.
*/
- if (dist == sched_max_numa_distance || node == nid)
+ if (dist >= max_dist || node == nid)
continue;
/*
@@ -1293,8 +1327,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* "hoplimit", only nodes closer by than "hoplimit" are part
* of each group. Skip other nodes.
*/
- if (sched_numa_topology_type == NUMA_BACKPLANE &&
- dist >= maxdist)
+ if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist)
continue;
/* Add up the faults from nearby nodes. */
@@ -1312,8 +1345,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
* This seems to result in good task placement.
*/
if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
- faults *= (sched_max_numa_distance - dist);
- faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ faults *= (max_dist - dist);
+ faults /= (max_dist - LOCAL_DISTANCE);
}
score += faults;
@@ -1489,6 +1522,7 @@ struct task_numa_env {
int src_cpu, src_nid;
int dst_cpu, dst_nid;
+ int imb_numa_nr;
struct numa_stats src_stats, dst_stats;
@@ -1503,7 +1537,7 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight);
+ int dst_running, int imb_numa_nr);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1884,7 +1918,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
imbalance = adjust_numa_imbalance(imbalance, dst_running,
- env->dst_stats.weight);
+ env->imb_numa_nr);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -1949,8 +1983,10 @@ static int task_numa_migrate(struct task_struct *p)
*/
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
- if (sd)
+ if (sd) {
env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+ env.imb_numa_nr = sd->imb_numa_nr;
+ }
rcu_read_unlock();
/*
@@ -1985,7 +2021,7 @@ static int task_numa_migrate(struct task_struct *p)
*/
ng = deref_curr_numa_group(p);
if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -2083,13 +2119,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group)
unsigned long faults, max_faults = 0;
int nid, active_nodes = 0;
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults > max_faults)
max_faults = faults;
}
- for_each_online_node(nid) {
+ for_each_node_state(nid, N_CPU) {
faults = group_faults_cpu(numa_group, nid);
if (faults * ACTIVE_NODE_FRACTION > max_faults)
active_nodes++;
@@ -2243,7 +2279,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
dist = sched_max_numa_distance;
- for_each_online_node(node) {
+ for_each_node_state(node, N_CPU) {
score = group_weight(p, node, dist);
if (score > max_score) {
max_score = score;
@@ -2262,7 +2298,7 @@ static int preferred_group_nid(struct task_struct *p, int nid)
* inside the highest scoring group of nodes. The nodemask tricks
* keep the complexity of the search down.
*/
- nodes = node_online_map;
+ nodes = node_states[N_CPU];
for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
unsigned long max_faults = 0;
nodemask_t max_group = NODE_MASK_NONE;
@@ -2401,6 +2437,21 @@ static void task_numa_placement(struct task_struct *p)
}
}
+ /* Cannot migrate task to CPU-less node */
+ if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) {
+ int near_nid = max_nid;
+ int distance, near_distance = INT_MAX;
+
+ for_each_node_state(nid, N_CPU) {
+ distance = node_distance(max_nid, nid);
+ if (distance < near_distance) {
+ near_nid = nid;
+ near_distance = distance;
+ }
+ }
+ max_nid = near_nid;
+ }
+
if (ng) {
numa_group_count_active_nodes(ng);
spin_unlock_irq(group_lock);
@@ -2825,6 +2876,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
+ p->numa_pages_migrated = 0;
+ p->total_numa_faults = 0;
RCU_INIT_POINTER(p->numa_group, NULL);
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
@@ -3028,9 +3081,11 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- u32 divider = get_pelt_divider(&se->avg);
sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
#else
static inline void
@@ -3381,7 +3436,6 @@ void set_task_rq_fair(struct sched_entity *se,
se->avg.last_update_time = n_last_update_time;
}
-
/*
* When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
* propagate its contribution. The key to this propagation is the invariant
@@ -3449,15 +3503,14 @@ void set_task_rq_fair(struct sched_entity *se,
* XXX: only do this for the part of runnable > running ?
*
*/
-
static inline void
update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3466,23 +3519,30 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
*/
divider = get_pelt_divider(&cfs_rq->avg);
+
/* Set new sched_entity's utilization */
se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * divider;
+ new_sum = se->avg.util_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.util_sum;
+ se->avg.util_sum = new_sum;
/* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ add_positive(&cfs_rq->avg.util_avg, delta_avg);
+ add_positive(&cfs_rq->avg.util_sum, delta_sum);
+
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
- u32 divider;
+ long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+ u32 new_sum, divider;
/* Nothing to update */
- if (!delta)
+ if (!delta_avg)
return;
/*
@@ -3493,19 +3553,25 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
/* Set new sched_entity's runnable */
se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * divider;
+ new_sum = se->avg.runnable_avg * divider;
+ delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
+ se->avg.runnable_sum = new_sum;
/* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
+ add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
}
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
- long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
+ long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
unsigned long load_avg;
u64 load_sum = 0;
+ s64 delta_sum;
u32 divider;
if (!runnable_sum)
@@ -3532,7 +3598,7 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
* assuming all tasks are equally runnable.
*/
if (scale_load_down(gcfs_rq->load.weight)) {
- load_sum = div_s64(gcfs_rq->avg.load_sum,
+ load_sum = div_u64(gcfs_rq->avg.load_sum,
scale_load_down(gcfs_rq->load.weight));
}
@@ -3549,19 +3615,22 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
runnable_sum = max(runnable_sum, running_sum);
- load_sum = (s64)se_weight(se) * runnable_sum;
- load_avg = div_s64(load_sum, divider);
+ load_sum = se_weight(se) * runnable_sum;
+ load_avg = div_u64(load_sum, divider);
- se->avg.load_sum = runnable_sum;
-
- delta = load_avg - se->avg.load_avg;
- if (!delta)
+ delta_avg = load_avg - se->avg.load_avg;
+ if (!delta_avg)
return;
- se->avg.load_avg = load_avg;
+ delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
- add_positive(&cfs_rq->avg.load_avg, delta);
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
+ se->avg.load_sum = runnable_sum;
+ se->avg.load_avg = load_avg;
+ add_positive(&cfs_rq->avg.load_avg, delta_avg);
+ add_positive(&cfs_rq->avg.load_sum, delta_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
+ cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
}
static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
@@ -3652,7 +3721,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
*
* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
*
- * Returns true if the load decayed or we removed load.
+ * Return: true if the load decayed or we removed load.
*
* Since both these conditions indicate a changed cfs_rq->avg.load we should
* call update_tg_load_avg() when this function returns true.
@@ -3677,15 +3746,32 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
r = removed_load;
sub_positive(&sa->load_avg, r);
- sa->load_sum = sa->load_avg * divider;
+ sub_positive(&sa->load_sum, r * divider);
+ /* See sa->util_sum below */
+ sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
r = removed_util;
sub_positive(&sa->util_avg, r);
- sa->util_sum = sa->util_avg * divider;
+ sub_positive(&sa->util_sum, r * divider);
+ /*
+ * Because of rounding, se->util_sum might ends up being +1 more than
+ * cfs->util_sum. Although this is not a problem by itself, detaching
+ * a lot of tasks with the rounding problem between 2 updates of
+ * util_avg (~1ms) can make cfs->util_sum becoming null whereas
+ * cfs_util_avg is not.
+ * Check that util_sum is still above its lower bound for the new
+ * util_avg. Given that period_contrib might have moved since the last
+ * sync, we are only sure that util_sum must be above or equal to
+ * util_avg * minimum possible divider
+ */
+ sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
r = removed_runnable;
sub_positive(&sa->runnable_avg, r);
- sa->runnable_sum = sa->runnable_avg * divider;
+ sub_positive(&sa->runnable_sum, r * divider);
+ /* See sa->util_sum above */
+ sa->runnable_sum = max_t(u32, sa->runnable_sum,
+ sa->runnable_avg * PELT_MIN_DIVIDER);
/*
* removed_runnable is the unweighted version of removed_load so we
@@ -3772,17 +3858,18 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- /*
- * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
- * See ___update_load_avg() for details.
- */
- u32 divider = get_pelt_divider(&cfs_rq->avg);
-
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider;
+ sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
+ cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
+
sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider;
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
+ /* See update_cfs_rq_load_avg() */
+ cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
+ cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
@@ -8539,6 +8626,8 @@ group_type group_classify(unsigned int imbalance_pct,
*
* If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
* of @dst_cpu are idle and @sg has lower priority.
+ *
+ * Return: true if @dst_cpu can pull tasks, false otherwise.
*/
static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
struct sg_lb_stats *sgs,
@@ -8614,6 +8703,7 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
+ * @sds: Load-balancing data with statistics of the local group.
* @group: sched_group whose statistics are to be updated.
* @sgs: variable to hold the statistics for this group.
* @sg_status: Holds flag indicating the status of the sched_group
@@ -9003,9 +9093,9 @@ static bool update_pick_idlest(struct sched_group *idlest,
* This is an approximation as the number of running tasks may not be
* related to the number of busy CPUs due to sched_setaffinity.
*/
-static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
{
- return (dst_running < (dst_weight >> 2));
+ return running <= imb_numa_nr;
}
/*
@@ -9139,12 +9229,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
#endif
/*
- * Otherwise, keep the task on this node to stay close
- * its wakeup source and improve locality. If there is
- * a real need of migration, periodic load balance will
- * take care of it.
+ * Otherwise, keep the task close to the wakeup source
+ * and improve locality if the number of running tasks
+ * would remain below threshold where an imbalance is
+ * allowed. If there is a real need of migration,
+ * periodic load balance will take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
+ if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
return NULL;
}
@@ -9236,9 +9327,9 @@ next_group:
#define NUMA_IMBALANCE_MIN 2
static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int dst_weight)
+ int dst_running, int imb_numa_nr)
{
- if (!allow_numa_imbalance(dst_running, dst_weight))
+ if (!allow_numa_imbalance(dst_running, imb_numa_nr))
return imbalance;
/*
@@ -9350,7 +9441,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- busiest->sum_nr_running, busiest->group_weight);
+ local->sum_nr_running + 1, env->sd->imb_numa_nr);
}
return;
@@ -9421,12 +9512,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/**
* find_busiest_group - Returns the busiest group within the sched_domain
* if there is an imbalance.
+ * @env: The load balancing environment.
*
* Also calculates the amount of runnable load which should be moved
* to restore balance.
*
- * @env: The load balancing environment.
- *
* Return: - The busiest group if imbalance exists.
*/
static struct sched_group *find_busiest_group(struct lb_env *env)
@@ -10315,7 +10405,7 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
+ * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set
* anywhere yet.
*/
@@ -10324,7 +10414,7 @@ static inline int find_new_ilb(void)
int ilb;
const struct cpumask *hk_mask;
- hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
@@ -10340,7 +10430,7 @@ static inline int find_new_ilb(void)
/*
* Kick a CPU to do the nohz balancing, if it is time for it. We pick any
- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
+ * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -10553,7 +10643,7 @@ void nohz_balance_enter_idle(int cpu)
return;
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
return;
/*
@@ -10769,7 +10859,7 @@ static void nohz_newidle_balance(struct rq *this_rq)
* This CPU doesn't want to be disturbed by scheduler
* housekeeping
*/
- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
+ if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
return;
/* Will wake up very soon. No time for doing anything else*/
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index d17b0a5ce6ac..8f8b5020e76a 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -6,9 +6,6 @@
* (NOTE: these are not related to SCHED_IDLE batch scheduled
* tasks which are handled in sched/fair.c )
*/
-#include "sched.h"
-
-#include <trace/events/power.h>
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 7f06eaf12818..373d42c707bc 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -7,136 +7,179 @@
* Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
*
*/
-#include "sched.h"
+
+enum hk_flags {
+ HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
+ HK_FLAG_RCU = BIT(HK_TYPE_RCU),
+ HK_FLAG_MISC = BIT(HK_TYPE_MISC),
+ HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
+ HK_FLAG_TICK = BIT(HK_TYPE_TICK),
+ HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
+ HK_FLAG_WQ = BIT(HK_TYPE_WQ),
+ HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
+ HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
+};
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
EXPORT_SYMBOL_GPL(housekeeping_overridden);
-static cpumask_var_t housekeeping_mask;
-static unsigned int housekeeping_flags;
-bool housekeeping_enabled(enum hk_flags flags)
+struct housekeeping {
+ cpumask_var_t cpumasks[HK_TYPE_MAX];
+ unsigned long flags;
+};
+
+static struct housekeeping housekeeping;
+
+bool housekeeping_enabled(enum hk_type type)
{
- return !!(housekeeping_flags & flags);
+ return !!(housekeeping.flags & BIT(type));
}
EXPORT_SYMBOL_GPL(housekeeping_enabled);
-int housekeeping_any_cpu(enum hk_flags flags)
+int housekeeping_any_cpu(enum hk_type type)
{
int cpu;
if (static_branch_unlikely(&housekeeping_overridden)) {
- if (housekeeping_flags & flags) {
- cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
+ if (housekeeping.flags & BIT(type)) {
+ cpu = sched_numa_find_closest(housekeeping.cpumasks[type], smp_processor_id());
if (cpu < nr_cpu_ids)
return cpu;
- return cpumask_any_and(housekeeping_mask, cpu_online_mask);
+ return cpumask_any_and(housekeeping.cpumasks[type], cpu_online_mask);
}
}
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
-const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
+const struct cpumask *housekeeping_cpumask(enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- return housekeeping_mask;
+ if (housekeeping.flags & BIT(type))
+ return housekeeping.cpumasks[type];
return cpu_possible_mask;
}
EXPORT_SYMBOL_GPL(housekeeping_cpumask);
-void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
+void housekeeping_affine(struct task_struct *t, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- set_cpus_allowed_ptr(t, housekeeping_mask);
+ if (housekeeping.flags & BIT(type))
+ set_cpus_allowed_ptr(t, housekeeping.cpumasks[type]);
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
-bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
+bool housekeeping_test_cpu(int cpu, enum hk_type type)
{
if (static_branch_unlikely(&housekeeping_overridden))
- if (housekeeping_flags & flags)
- return cpumask_test_cpu(cpu, housekeeping_mask);
+ if (housekeeping.flags & BIT(type))
+ return cpumask_test_cpu(cpu, housekeeping.cpumasks[type]);
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
void __init housekeeping_init(void)
{
- if (!housekeeping_flags)
+ enum hk_type type;
+
+ if (!housekeeping.flags)
return;
static_branch_enable(&housekeeping_overridden);
- if (housekeeping_flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_TICK)
sched_tick_offload_init();
- /* We need at least one CPU to handle housekeeping work */
- WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
+ for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
+ /* We need at least one CPU to handle housekeeping work */
+ WARN_ON_ONCE(cpumask_empty(housekeeping.cpumasks[type]));
+ }
}
-static int __init housekeeping_setup(char *str, enum hk_flags flags)
+static void __init housekeeping_setup_type(enum hk_type type,
+ cpumask_var_t housekeeping_staging)
{
- cpumask_var_t non_housekeeping_mask;
- cpumask_var_t tmp;
+
+ alloc_bootmem_cpumask_var(&housekeeping.cpumasks[type]);
+ cpumask_copy(housekeeping.cpumasks[type],
+ housekeeping_staging);
+}
+
+static int __init housekeeping_setup(char *str, unsigned long flags)
+{
+ cpumask_var_t non_housekeeping_mask, housekeeping_staging;
+ int err = 0;
+
+ if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+ if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+ pr_warn("Housekeeping: nohz unsupported."
+ " Build with CONFIG_NO_HZ_FULL\n");
+ return 0;
+ }
+ }
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
if (cpulist_parse(str, non_housekeeping_mask) < 0) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
+ goto free_non_housekeeping_mask;
}
- alloc_bootmem_cpumask_var(&tmp);
- if (!housekeeping_flags) {
- alloc_bootmem_cpumask_var(&housekeeping_mask);
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, non_housekeeping_mask);
+ alloc_bootmem_cpumask_var(&housekeeping_staging);
+ cpumask_andnot(housekeeping_staging,
+ cpu_possible_mask, non_housekeeping_mask);
- cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
- if (cpumask_empty(tmp)) {
+ if (!cpumask_intersects(cpu_present_mask, housekeeping_staging)) {
+ __cpumask_set_cpu(smp_processor_id(), housekeeping_staging);
+ __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+ if (!housekeeping.flags) {
pr_warn("Housekeeping: must include one present CPU, "
"using boot CPU:%d\n", smp_processor_id());
- __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
- __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
- }
- } else {
- cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
- if (cpumask_empty(tmp))
- __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
- cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
- if (!cpumask_equal(tmp, housekeeping_mask)) {
- pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
- free_bootmem_cpumask_var(tmp);
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
}
}
- free_bootmem_cpumask_var(tmp);
- if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
- if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
- tick_nohz_full_setup(non_housekeeping_mask);
- } else {
- pr_warn("Housekeeping: nohz unsupported."
- " Build with CONFIG_NO_HZ_FULL\n");
- free_bootmem_cpumask_var(non_housekeeping_mask);
- return 0;
+ if (!housekeeping.flags) {
+ /* First setup call ("nohz_full=" or "isolcpus=") */
+ enum hk_type type;
+
+ for_each_set_bit(type, &flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
+ } else {
+ /* Second setup call ("nohz_full=" after "isolcpus=" or the reverse) */
+ enum hk_type type;
+ unsigned long iter_flags = flags & housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX) {
+ if (!cpumask_equal(housekeeping_staging,
+ housekeeping.cpumasks[type])) {
+ pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
+ goto free_housekeeping_staging;
+ }
}
+
+ iter_flags = flags & ~housekeeping.flags;
+
+ for_each_set_bit(type, &iter_flags, HK_TYPE_MAX)
+ housekeeping_setup_type(type, housekeeping_staging);
}
- housekeeping_flags |= flags;
+ if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+ tick_nohz_full_setup(non_housekeeping_mask);
+
+ housekeeping.flags |= flags;
+ err = 1;
+free_housekeeping_staging:
+ free_bootmem_cpumask_var(housekeeping_staging);
+free_non_housekeeping_mask:
free_bootmem_cpumask_var(non_housekeeping_mask);
- return 1;
+ return err;
}
static int __init housekeeping_nohz_full_setup(char *str)
{
- unsigned int flags;
+ unsigned long flags;
flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
HK_FLAG_MISC | HK_FLAG_KTHREAD;
@@ -147,7 +190,7 @@ __setup("nohz_full=", housekeeping_nohz_full_setup);
static int __init housekeeping_isolcpus_setup(char *str)
{
- unsigned int flags = 0;
+ unsigned long flags = 0;
bool illegal = false;
char *par;
int len;
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 954b229868d9..52c8f8226b0d 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,7 +6,6 @@
* figure. Its a silly number but people think its important. We go through
* great pains to make it work on big machines and tickless kernels.
*/
-#include "sched.h"
/*
* Global load-average calculations
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index b5add64d9698..0c5be7ebb1dc 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -4,7 +4,6 @@
*
* membarrier system call
*/
-#include "sched.h"
/*
* For documentation purposes, here are some membarrier ordering
@@ -147,11 +146,11 @@
#endif
#ifdef CONFIG_RSEQ
-#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK \
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \
(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ \
- | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
+ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
#else
-#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
+#define MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK 0
#endif
#define MEMBARRIER_CMD_BITMASK \
@@ -159,7 +158,8 @@
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_PRIVATE_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
- | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
+ | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK)
static void ipi_mb(void *info)
{
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a554e3bbab2b..0f310768260c 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -24,10 +24,6 @@
* Author: Vincent Guittot <vincent.guittot@linaro.org>
*/
-#include <linux/sched.h>
-#include "sched.h"
-#include "pelt.h"
-
/*
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index e06071bf3472..c336f5f481bc 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
}
#endif
+#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
+
static inline u32 get_pelt_divider(struct sched_avg *avg)
{
- return LOAD_AVG_MAX - 1024 + avg->period_contrib;
+ return PELT_MIN_DIVIDER + avg->period_contrib;
}
static inline void cfs_se_util_change(struct sched_avg *avg)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a679613a7cb7..a4fa3aadfcba 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -137,21 +137,6 @@
* sampling of the aggregate task states would be.
*/
-#include "../workqueue_internal.h"
-#include <linux/sched/loadavg.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-#include <linux/seqlock.h>
-#include <linux/uaccess.h>
-#include <linux/cgroup.h>
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/ctype.h>
-#include <linux/file.h>
-#include <linux/poll.h>
-#include <linux/psi.h>
-#include "sched.h"
-
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
@@ -523,7 +508,7 @@ static void init_triggers(struct psi_group *group, u64 now)
static u64 update_triggers(struct psi_group *group, u64 now)
{
struct psi_trigger *t;
- bool new_stall = false;
+ bool update_total = false;
u64 *total = group->total[PSI_POLL];
/*
@@ -532,24 +517,35 @@ static u64 update_triggers(struct psi_group *group, u64 now)
*/
list_for_each_entry(t, &group->triggers, node) {
u64 growth;
+ bool new_stall;
- /* Check for stall activity */
- if (group->polling_total[t->state] == total[t->state])
- continue;
+ new_stall = group->polling_total[t->state] != total[t->state];
+ /* Check for stall activity or a previous threshold breach */
+ if (!new_stall && !t->pending_event)
+ continue;
/*
- * Multiple triggers might be looking at the same state,
- * remember to update group->polling_total[] once we've
- * been through all of them. Also remember to extend the
- * polling time if we see new stall activity.
+ * Check for new stall activity, as well as deferred
+ * events that occurred in the last window after the
+ * trigger had already fired (we want to ratelimit
+ * events without dropping any).
*/
- new_stall = true;
-
- /* Calculate growth since last update */
- growth = window_update(&t->win, now, total[t->state]);
- if (growth < t->threshold)
- continue;
-
+ if (new_stall) {
+ /*
+ * Multiple triggers might be looking at the same state,
+ * remember to update group->polling_total[] once we've
+ * been through all of them. Also remember to extend the
+ * polling time if we see new stall activity.
+ */
+ update_total = true;
+
+ /* Calculate growth since last update */
+ growth = window_update(&t->win, now, total[t->state]);
+ if (growth < t->threshold)
+ continue;
+
+ t->pending_event = true;
+ }
/* Limit event signaling to once per window */
if (now < t->last_event_time + t->win.size)
continue;
@@ -558,9 +554,11 @@ static u64 update_triggers(struct psi_group *group, u64 now)
if (cmpxchg(&t->event, 0, 1) == 0)
wake_up_interruptible(&t->event_wait);
t->last_event_time = now;
+ /* Reset threshold breach flag once event got generated */
+ t->pending_event = false;
}
- if (new_stall)
+ if (update_total)
memcpy(group->polling_total, total,
sizeof(group->polling_total));
@@ -1082,44 +1080,6 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
return 0;
}
-static int psi_io_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_IO);
-}
-
-static int psi_memory_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_MEM);
-}
-
-static int psi_cpu_show(struct seq_file *m, void *v)
-{
- return psi_show(m, &psi_system, PSI_CPU);
-}
-
-static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
-{
- if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- return single_open(file, psi_show, NULL);
-}
-
-static int psi_io_open(struct inode *inode, struct file *file)
-{
- return psi_open(file, psi_io_show);
-}
-
-static int psi_memory_open(struct inode *inode, struct file *file)
-{
- return psi_open(file, psi_memory_show);
-}
-
-static int psi_cpu_open(struct inode *inode, struct file *file)
-{
- return psi_open(file, psi_cpu_show);
-}
-
struct psi_trigger *psi_trigger_create(struct psi_group *group,
char *buf, size_t nbytes, enum psi_res res)
{
@@ -1162,7 +1122,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->event = 0;
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
- kref_init(&t->refcount);
+ t->pending_event = false;
mutex_lock(&group->trigger_lock);
@@ -1191,15 +1151,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
return t;
}
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
{
- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
- struct psi_group *group = t->group;
+ struct psi_group *group;
struct task_struct *task_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled))
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
return;
+ group = t->group;
/*
* Wakeup waiters to stop polling. Can happen if cgroup is deleted
* from under a polling process.
@@ -1235,9 +1199,9 @@ static void psi_trigger_destroy(struct kref *ref)
mutex_unlock(&group->trigger_lock);
/*
- * Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_task RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_task
+ * Wait for psi_schedule_poll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * poll_task.
*/
synchronize_rcu();
/*
@@ -1254,18 +1218,6 @@ static void psi_trigger_destroy(struct kref *ref)
kfree(t);
}
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
- struct psi_trigger *old = *trigger_ptr;
-
- if (static_branch_likely(&psi_disabled))
- return;
-
- rcu_assign_pointer(*trigger_ptr, new);
- if (old)
- kref_put(&old->refcount, psi_trigger_destroy);
-}
-
__poll_t psi_trigger_poll(void **trigger_ptr,
struct file *file, poll_table *wait)
{
@@ -1275,27 +1227,57 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock();
-
- t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
- if (!t) {
- rcu_read_unlock();
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- }
- kref_get(&t->refcount);
-
- rcu_read_unlock();
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy);
-
return ret;
}
+#ifdef CONFIG_PROC_FS
+static int psi_io_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_IO);
+}
+
+static int psi_memory_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_MEM);
+}
+
+static int psi_cpu_show(struct seq_file *m, void *v)
+{
+ return psi_show(m, &psi_system, PSI_CPU);
+}
+
+static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
+{
+ if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ return single_open(file, psi_show, NULL);
+}
+
+static int psi_io_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_io_show);
+}
+
+static int psi_memory_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_memory_show);
+}
+
+static int psi_cpu_open(struct inode *inode, struct file *file)
+{
+ return psi_open(file, psi_cpu_show);
+}
+
static ssize_t psi_write(struct file *file, const char __user *user_buf,
size_t nbytes, enum psi_res res)
{
@@ -1316,14 +1298,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
seq = file->private_data;
+
/* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock);
- psi_trigger_replace(&seq->private, new);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock);
return nbytes;
@@ -1358,7 +1350,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL);
+ psi_trigger_destroy(seq->private);
return single_release(inode, file);
}
@@ -1400,3 +1392,5 @@ static int __init psi_proc_init(void)
return 0;
}
module_init(psi_proc_init);
+
+#endif /* CONFIG_PROC_FS */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7b4f4fbbb404..a32c46889af8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,9 +3,6 @@
* Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
* policies)
*/
-#include "sched.h"
-
-#include "pelt.h"
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
@@ -271,8 +268,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
-static void pull_rt_task(struct rq *this_rq);
-
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
@@ -429,15 +424,6 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
-static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
-{
- return false;
-}
-
-static inline void pull_rt_task(struct rq *this_rq)
-{
-}
-
static inline void rt_queue_push_tasks(struct rq *rq)
{
}
@@ -1730,8 +1716,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f
rt_queue_push_tasks(rq);
}
-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
- struct rt_rq *rt_rq)
+static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
struct sched_rt_entity *next = NULL;
@@ -1753,7 +1738,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
struct rt_rq *rt_rq = &rq->rt;
do {
- rt_se = pick_next_rt_entity(rq, rt_rq);
+ rt_se = pick_next_rt_entity(rt_rq);
BUG_ON(!rt_se);
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
@@ -2026,6 +2011,16 @@ static int push_rt_task(struct rq *rq, bool pull)
return 0;
retry:
+ /*
+ * It's possible that the next_task slipped in of
+ * higher priority than current. If that's the case
+ * just reschedule current.
+ */
+ if (unlikely(next_task->prio < rq->curr->prio)) {
+ resched_curr(rq);
+ return 0;
+ }
+
if (is_migration_disabled(next_task)) {
struct task_struct *push_task = NULL;
int cpu;
@@ -2033,6 +2028,18 @@ retry:
if (!pull || rq->push_busy)
return 0;
+ /*
+ * Invoking find_lowest_rq() on anything but an RT task doesn't
+ * make sense. Per the above priority check, curr has to
+ * be of higher priority than next_task, so no need to
+ * reschedule when bailing out.
+ *
+ * Note that the stoppers are masqueraded as SCHED_FIFO
+ * (cf. sched_set_stop_task()), so we can't rely on rt_task().
+ */
+ if (rq->curr->sched_class != &rt_sched_class)
+ return 0;
+
cpu = find_lowest_rq(rq->curr);
if (cpu == -1 || cpu == rq->cpu)
return 0;
@@ -2057,16 +2064,6 @@ retry:
if (WARN_ON(next_task == rq->curr))
return 0;
- /*
- * It's possible that the next_task slipped in of
- * higher priority than current. If that's the case
- * just reschedule current.
- */
- if (unlikely(next_task->prio < rq->curr->prio)) {
- resched_curr(rq);
- return 0;
- }
-
/* We might release rq lock */
get_task_struct(next_task);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be905739..58263f90c559 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,86 +2,98 @@
/*
* Scheduler internal types and methods:
*/
-#include <linux/sched.h>
+#ifndef _KERNEL_SCHED_SCHED_H
+#define _KERNEL_SCHED_SCHED_H
+#include <linux/sched/affinity.h>
#include <linux/sched/autogroup.h>
-#include <linux/sched/clock.h>
-#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
-#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/hotplug.h>
-#include <linux/sched/idle.h>
-#include <linux/sched/init.h>
-#include <linux/sched/isolation.h>
-#include <linux/sched/jobctl.h>
+#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
-#include <linux/sched/nohz.h>
-#include <linux/sched/numa_balancing.h>
-#include <linux/sched/prio.h>
-#include <linux/sched/rt.h>
+#include <linux/sched/rseq_api.h>
#include <linux/sched/signal.h>
#include <linux/sched/smt.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
+#include <linux/sched/task_flags.h>
#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
#include <linux/sched/topology.h>
-#include <linux/sched/user.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/xacct.h>
-#include <uapi/linux/sched/types.h>
-
-#include <linux/binfmts.h>
-#include <linux/bitops.h>
-#include <linux/compat.h>
-#include <linux/context_tracking.h>
+#include <linux/atomic.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/capability.h>
+#include <linux/cgroup_api.h>
+#include <linux/cgroup.h>
#include <linux/cpufreq.h>
-#include <linux/cpuidle.h>
-#include <linux/cpuset.h>
+#include <linux/cpumask_api.h>
#include <linux/ctype.h>
-#include <linux/debugfs.h>
-#include <linux/delayacct.h>
-#include <linux/energy_model.h>
-#include <linux/init_task.h>
-#include <linux/kprobes.h>
+#include <linux/file.h>
+#include <linux/fs_api.h>
+#include <linux/hrtimer_api.h>
+#include <linux/interrupt.h>
+#include <linux/irq_work.h>
+#include <linux/jiffies.h>
+#include <linux/kref_api.h>
#include <linux/kthread.h>
-#include <linux/membarrier.h>
-#include <linux/migrate.h>
-#include <linux/mmu_context.h>
-#include <linux/nmi.h>
+#include <linux/ktime_api.h>
+#include <linux/lockdep_api.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex_api.h>
+#include <linux/plist.h>
+#include <linux/poll.h>
#include <linux/proc_fs.h>
-#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/psi.h>
-#include <linux/ratelimit.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/security.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/seqlock.h>
+#include <linux/softirq.h>
+#include <linux/spinlock_api.h>
+#include <linux/static_key.h>
#include <linux/stop_machine.h>
-#include <linux/suspend.h>
-#include <linux/swait.h>
+#include <linux/syscalls_api.h>
#include <linux/syscalls.h>
-#include <linux/task_work.h>
-#include <linux/tsacct_kern.h>
+#include <linux/tick.h>
+#include <linux/topology.h>
+#include <linux/types.h>
+#include <linux/u64_stats_sync_api.h>
+#include <linux/uaccess.h>
+#include <linux/wait_api.h>
+#include <linux/wait_bit.h>
+#include <linux/workqueue_api.h>
+
+#include <trace/events/power.h>
+#include <trace/events/sched.h>
+
+#include "../workqueue_internal.h"
+
+#ifdef CONFIG_CGROUP_SCHED
+#include <linux/cgroup.h>
+#include <linux/psi.h>
+#endif
-#include <asm/tlb.h>
+#ifdef CONFIG_SCHED_DEBUG
+# include <linux/static_key.h>
+#endif
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
+# include <asm/paravirt_api_clock.h>
#endif
#include "cpupri.h"
#include "cpudeadline.h"
-#include <trace/events/sched.h>
-
#ifdef CONFIG_SCHED_DEBUG
-# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
#else
-# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
+# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
#endif
struct rq;
@@ -301,29 +313,6 @@ struct dl_bw {
u64 total_bw;
};
-static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
-
-static inline
-void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
- dl_b->total_bw -= tsk_bw;
- __dl_update(dl_b, (s32)tsk_bw / cpus);
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
-{
- dl_b->total_bw += tsk_bw;
- __dl_update(dl_b, -((s32)tsk_bw / cpus));
-}
-
-static inline bool __dl_overflow(struct dl_bw *dl_b, unsigned long cap,
- u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- cap_scale(dl_b->bw, cap) < dl_b->total_bw - old_bw + new_bw;
-}
-
/*
* Verify the fitness of task @p to run on @cpu taking into account the
* CPU original capacity and the runtime/deadline ratio of the task.
@@ -347,15 +336,11 @@ extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
-extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern bool dl_cpu_busy(unsigned int cpu);
+extern int dl_cpu_busy(int cpu, struct task_struct *p);
#ifdef CONFIG_CGROUP_SCHED
-#include <linux/cgroup.h>
-#include <linux/psi.h>
-
struct cfs_rq;
struct rt_rq;
@@ -1662,12 +1647,14 @@ enum numa_topology_type {
extern enum numa_topology_type sched_numa_topology_type;
extern int sched_max_numa_distance;
extern bool find_numa_distance(int distance);
-extern void sched_init_numa(void);
+extern void sched_init_numa(int offline_node);
+extern void sched_update_numa(int cpu, bool online);
extern void sched_domains_numa_masks_set(unsigned int cpu);
extern void sched_domains_numa_masks_clear(unsigned int cpu);
extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
#else
-static inline void sched_init_numa(void) { }
+static inline void sched_init_numa(int offline_node) { }
+static inline void sched_update_numa(int cpu, bool online) { }
static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
@@ -1854,7 +1841,6 @@ static inline void flush_smp_call_function_from_idle(void) { }
#endif
#include "stats.h"
-#include "autogroup.h"
#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
@@ -1950,7 +1936,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
* Tunables that become constants when CONFIG_SCHED_DEBUG is off:
*/
#ifdef CONFIG_SCHED_DEBUG
-# include <linux/static_key.h>
# define const_debug __read_mostly
#else
# define const_debug const
@@ -2331,7 +2316,6 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
@@ -2747,32 +2731,6 @@ extern void nohz_run_idle_balance(int cpu);
static inline void nohz_run_idle_balance(int cpu) { }
#endif
-#ifdef CONFIG_SMP
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
- int i;
-
- RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
- "sched RCU must be held");
- for_each_cpu_and(i, rd->span, cpu_active_mask) {
- struct rq *rq = cpu_rq(i);
-
- rq->dl.extra_bw += bw;
- }
-}
-#else
-static inline
-void __dl_update(struct dl_bw *dl_b, s64 bw)
-{
- struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
-
- dl->extra_bw += bw;
-}
-#endif
-
-
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
u64 total;
@@ -2841,88 +2799,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
-#ifdef CONFIG_UCLAMP_TASK
-unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
-
-/**
- * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
- * @rq: The rq to clamp against. Must not be NULL.
- * @util: The util value to clamp.
- * @p: The task to clamp against. Can be NULL if you want to clamp
- * against @rq only.
- *
- * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
- *
- * If sched_uclamp_used static key is disabled, then just return the util
- * without any clamping since uclamp aggregation at the rq level in the fast
- * path is disabled, rendering this operation a NOP.
- *
- * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
- * will return the correct effective uclamp value of the task even if the
- * static key is disabled.
- */
-static __always_inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- unsigned long min_util = 0;
- unsigned long max_util = 0;
-
- if (!static_branch_likely(&sched_uclamp_used))
- return util;
-
- if (p) {
- min_util = uclamp_eff_value(p, UCLAMP_MIN);
- max_util = uclamp_eff_value(p, UCLAMP_MAX);
-
- /*
- * Ignore last runnable task's max clamp, as this task will
- * reset it. Similarly, no need to read the rq's min clamp.
- */
- if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
- goto out;
- }
-
- min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
- max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
-out:
- /*
- * Since CPU's {min,max}_util clamps are MAX aggregated considering
- * RUNNABLE tasks with _different_ clamps, we can end up with an
- * inversion. Fix it now when the clamps are applied.
- */
- if (unlikely(min_util >= max_util))
- return min_util;
-
- return clamp(util, min_util, max_util);
-}
-
-/*
- * When uclamp is compiled in, the aggregation at rq level is 'turned off'
- * by default in the fast path and only gets turned on once userspace performs
- * an operation that requires it.
- *
- * Returns true if userspace opted-in to use uclamp and aggregation at rq level
- * hence is active.
- */
-static inline bool uclamp_is_used(void)
-{
- return static_branch_likely(&sched_uclamp_used);
-}
-#else /* CONFIG_UCLAMP_TASK */
-static inline
-unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
- struct task_struct *p)
-{
- return util;
-}
-
-static inline bool uclamp_is_used(void)
-{
- return false;
-}
-#endif /* CONFIG_UCLAMP_TASK */
-
#ifdef arch_scale_freq_capacity
# ifndef arch_scale_freq_invariant
# define arch_scale_freq_invariant() true
@@ -3020,6 +2896,105 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
}
#endif
+#ifdef CONFIG_UCLAMP_TASK
+unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+
+/**
+ * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
+ * @rq: The rq to clamp against. Must not be NULL.
+ * @util: The util value to clamp.
+ * @p: The task to clamp against. Can be NULL if you want to clamp
+ * against @rq only.
+ *
+ * Clamps the passed @util to the max(@rq, @p) effective uclamp values.
+ *
+ * If sched_uclamp_used static key is disabled, then just return the util
+ * without any clamping since uclamp aggregation at the rq level in the fast
+ * path is disabled, rendering this operation a NOP.
+ *
+ * Use uclamp_eff_value() if you don't care about uclamp values at rq level. It
+ * will return the correct effective uclamp value of the task even if the
+ * static key is disabled.
+ */
+static __always_inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
+{
+ unsigned long min_util = 0;
+ unsigned long max_util = 0;
+
+ if (!static_branch_likely(&sched_uclamp_used))
+ return util;
+
+ if (p) {
+ min_util = uclamp_eff_value(p, UCLAMP_MIN);
+ max_util = uclamp_eff_value(p, UCLAMP_MAX);
+
+ /*
+ * Ignore last runnable task's max clamp, as this task will
+ * reset it. Similarly, no need to read the rq's min clamp.
+ */
+ if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+ goto out;
+ }
+
+ min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
+ max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+out:
+ /*
+ * Since CPU's {min,max}_util clamps are MAX aggregated considering
+ * RUNNABLE tasks with _different_ clamps, we can end up with an
+ * inversion. Fix it now when the clamps are applied.
+ */
+ if (unlikely(min_util >= max_util))
+ return min_util;
+
+ return clamp(util, min_util, max_util);
+}
+
+/* Is the rq being capped/throttled by uclamp_max? */
+static inline bool uclamp_rq_is_capped(struct rq *rq)
+{
+ unsigned long rq_util;
+ unsigned long max_util;
+
+ if (!static_branch_likely(&sched_uclamp_used))
+ return false;
+
+ rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
+ max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
+
+ return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
+}
+
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
+#else /* CONFIG_UCLAMP_TASK */
+static inline
+unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
+ struct task_struct *p)
+{
+ return util;
+}
+
+static inline bool uclamp_rq_is_capped(struct rq *rq) { return false; }
+
+static inline bool uclamp_is_used(void)
+{
+ return false;
+}
+#endif /* CONFIG_UCLAMP_TASK */
+
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
static inline unsigned long cpu_util_irq(struct rq *rq)
{
@@ -3118,3 +3093,4 @@ extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
+#endif /* _KERNEL_SCHED_SCHED_H */
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 07dde2928c79..857f837f52cb 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -2,7 +2,6 @@
/*
* /proc/schedstat implementation
*/
-#include "sched.h"
void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 3a3c826dd83a..baa839c1ba96 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,4 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _KERNEL_STATS_H
+#define _KERNEL_STATS_H
#ifdef CONFIG_SCHEDSTATS
@@ -298,3 +300,5 @@ sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *n
# define sched_info_dequeue(rq, t) do { } while (0)
# define sched_info_switch(rq, t, next) do { } while (0)
#endif /* CONFIG_SCHED_INFO */
+
+#endif /* _KERNEL_STATS_H */
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 0b165a25f22f..d04073a93eb4 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -7,7 +7,6 @@
*
* See kernel/stop_machine.c
*/
-#include "sched.h"
#ifdef CONFIG_SMP
static int
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index e1c655f928c7..76b9b796e695 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -2,7 +2,6 @@
/*
* <linux/swait.h> (simple wait queues ) implementation:
*/
-#include "sched.h"
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a7052a29..810750e62118 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,7 +2,6 @@
/*
* Scheduler topology setup/handling methods
*/
-#include "sched.h"
DEFINE_MUTEX(sched_domains_mutex);
@@ -74,7 +73,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
break;
}
- if (!cpumask_weight(sched_group_span(group))) {
+ if (cpumask_empty(sched_group_span(group))) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: empty group\n");
break;
@@ -1366,7 +1365,7 @@ static void asym_cpu_capacity_scan(void)
list_for_each_entry(entry, &asym_cap_list, link)
cpumask_clear(cpu_capacity_span(entry));
- for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_FLAG_DOMAIN))
+ for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN))
asym_cpu_capacity_update_data(cpu);
list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
@@ -1492,8 +1491,6 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-
-static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
/*
@@ -1651,6 +1648,7 @@ static struct sched_domain_topology_level default_topology[] = {
static struct sched_domain_topology_level *sched_domain_topology =
default_topology;
+static struct sched_domain_topology_level *sched_domain_topology_saved;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
@@ -1661,6 +1659,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl)
return;
sched_domain_topology = tl;
+ sched_domain_topology_saved = NULL;
}
#ifdef CONFIG_NUMA
@@ -1684,8 +1683,12 @@ static void sched_numa_warn(const char *str)
for (i = 0; i < nr_node_ids; i++) {
printk(KERN_WARNING " ");
- for (j = 0; j < nr_node_ids; j++)
- printk(KERN_CONT "%02d ", node_distance(i,j));
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_state(i, N_CPU) || !node_state(j, N_CPU))
+ printk(KERN_CONT "(%02d) ", node_distance(i,j));
+ else
+ printk(KERN_CONT " %02d ", node_distance(i,j));
+ }
printk(KERN_CONT "\n");
}
printk(KERN_WARNING "\n");
@@ -1693,19 +1696,34 @@ static void sched_numa_warn(const char *str)
bool find_numa_distance(int distance)
{
- int i;
+ bool found = false;
+ int i, *distances;
if (distance == node_distance(0, 0))
return true;
+ rcu_read_lock();
+ distances = rcu_dereference(sched_domains_numa_distance);
+ if (!distances)
+ goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
- if (sched_domains_numa_distance[i] == distance)
- return true;
+ if (distances[i] == distance) {
+ found = true;
+ break;
+ }
}
+unlock:
+ rcu_read_unlock();
- return false;
+ return found;
}
+#define for_each_cpu_node_but(n, nbut) \
+ for_each_node_state(n, N_CPU) \
+ if (n == nbut) \
+ continue; \
+ else
+
/*
* A system can have three types of NUMA topology:
* NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
@@ -1725,7 +1743,7 @@ bool find_numa_distance(int distance)
* there is an intermediary node C, which is < N hops away from both
* nodes A and B, the system is a glueless mesh.
*/
-static void init_numa_topology_type(void)
+static void init_numa_topology_type(int offline_node)
{
int a, b, c, n;
@@ -1736,14 +1754,14 @@ static void init_numa_topology_type(void)
return;
}
- for_each_online_node(a) {
- for_each_online_node(b) {
+ for_each_cpu_node_but(a, offline_node) {
+ for_each_cpu_node_but(b, offline_node) {
/* Find two nodes furthest removed from each other. */
if (node_distance(a, b) < n)
continue;
/* Is there an intermediary node between a and b? */
- for_each_online_node(c) {
+ for_each_cpu_node_but(c, offline_node) {
if (node_distance(a, c) < n &&
node_distance(b, c) < n) {
sched_numa_topology_type =
@@ -1756,17 +1774,22 @@ static void init_numa_topology_type(void)
return;
}
}
+
+ pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n");
+ sched_numa_topology_type = NUMA_DIRECT;
}
#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
-void sched_init_numa(void)
+void sched_init_numa(int offline_node)
{
struct sched_domain_topology_level *tl;
unsigned long *distance_map;
int nr_levels = 0;
int i, j;
+ int *distances;
+ struct cpumask ***masks;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
@@ -1777,12 +1800,13 @@ void sched_init_numa(void)
return;
bitmap_zero(distance_map, NR_DISTANCE_VALUES);
- for (i = 0; i < nr_node_ids; i++) {
- for (j = 0; j < nr_node_ids; j++) {
+ for_each_cpu_node_but(i, offline_node) {
+ for_each_cpu_node_but(j, offline_node) {
int distance = node_distance(i, j);
if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
sched_numa_warn("Invalid distance value range");
+ bitmap_free(distance_map);
return;
}
@@ -1795,16 +1819,17 @@ void sched_init_numa(void)
*/
nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
- sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
- if (!sched_domains_numa_distance) {
+ distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!distances) {
bitmap_free(distance_map);
return;
}
for (i = 0, j = 0; i < nr_levels; i++, j++) {
j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
- sched_domains_numa_distance[i] = j;
+ distances[i] = j;
}
+ rcu_assign_pointer(sched_domains_numa_distance, distances);
bitmap_free(distance_map);
@@ -1826,8 +1851,8 @@ void sched_init_numa(void)
*/
sched_domains_numa_levels = 0;
- sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
- if (!sched_domains_numa_masks)
+ masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
+ if (!masks)
return;
/*
@@ -1835,31 +1860,20 @@ void sched_init_numa(void)
* CPUs of nodes that are that many hops away from us.
*/
for (i = 0; i < nr_levels; i++) {
- sched_domains_numa_masks[i] =
- kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
- if (!sched_domains_numa_masks[i])
+ masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+ if (!masks[i])
return;
- for (j = 0; j < nr_node_ids; j++) {
+ for_each_cpu_node_but(j, offline_node) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
int k;
if (!mask)
return;
- sched_domains_numa_masks[i][j] = mask;
-
- for_each_node(k) {
- /*
- * Distance information can be unreliable for
- * offline nodes, defer building the node
- * masks to its bringup.
- * This relies on all unique distance values
- * still being visible at init time.
- */
- if (!node_online(j))
- continue;
+ masks[i][j] = mask;
+ for_each_cpu_node_but(k, offline_node) {
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
@@ -1870,6 +1884,7 @@ void sched_init_numa(void)
}
}
}
+ rcu_assign_pointer(sched_domains_numa_masks, masks);
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
@@ -1907,59 +1922,67 @@ void sched_init_numa(void)
};
}
+ sched_domain_topology_saved = sched_domain_topology;
sched_domain_topology = tl;
sched_domains_numa_levels = nr_levels;
- sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
+ WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]);
- init_numa_topology_type();
-
- sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
- if (!sched_numa_onlined_nodes)
- return;
-
- bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
- for_each_online_node(i)
- bitmap_set(sched_numa_onlined_nodes, i, 1);
+ init_numa_topology_type(offline_node);
}
-static void __sched_domains_numa_masks_set(unsigned int node)
-{
- int i, j;
-
- /*
- * NUMA masks are not built for offline nodes in sched_init_numa().
- * Thus, when a CPU of a never-onlined-before node gets plugged in,
- * adding that new CPU to the right NUMA masks is not sufficient: the
- * masks of that CPU's node must also be updated.
- */
- if (test_bit(node, sched_numa_onlined_nodes))
- return;
- bitmap_set(sched_numa_onlined_nodes, node, 1);
-
- for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++) {
- if (!node_online(j) || node == j)
- continue;
+static void sched_reset_numa(void)
+{
+ int nr_levels, *distances;
+ struct cpumask ***masks;
- if (node_distance(j, node) > sched_domains_numa_distance[i])
+ nr_levels = sched_domains_numa_levels;
+ sched_domains_numa_levels = 0;
+ sched_max_numa_distance = 0;
+ sched_numa_topology_type = NUMA_DIRECT;
+ distances = sched_domains_numa_distance;
+ rcu_assign_pointer(sched_domains_numa_distance, NULL);
+ masks = sched_domains_numa_masks;
+ rcu_assign_pointer(sched_domains_numa_masks, NULL);
+ if (distances || masks) {
+ int i, j;
+
+ synchronize_rcu();
+ kfree(distances);
+ for (i = 0; i < nr_levels && masks; i++) {
+ if (!masks[i])
continue;
-
- /* Add remote nodes in our masks */
- cpumask_or(sched_domains_numa_masks[i][node],
- sched_domains_numa_masks[i][node],
- sched_domains_numa_masks[0][j]);
+ for_each_node(j)
+ kfree(masks[i][j]);
+ kfree(masks[i]);
}
+ kfree(masks);
+ }
+ if (sched_domain_topology_saved) {
+ kfree(sched_domain_topology);
+ sched_domain_topology = sched_domain_topology_saved;
+ sched_domain_topology_saved = NULL;
}
+}
+/*
+ * Call with hotplug lock held
+ */
+void sched_update_numa(int cpu, bool online)
+{
+ int node;
+
+ node = cpu_to_node(cpu);
/*
- * A new node has been brought up, potentially changing the topology
- * classification.
- *
- * Note that this is racy vs any use of sched_numa_topology_type :/
+ * Scheduler NUMA topology is updated when the first CPU of a
+ * node is onlined or the last CPU of a node is offlined.
*/
- init_numa_topology_type();
+ if (cpumask_weight(cpumask_of_node(node)) != 1)
+ return;
+
+ sched_reset_numa();
+ sched_init_numa(online ? NUMA_NO_NODE : node);
}
void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1967,11 +1990,9 @@ void sched_domains_numa_masks_set(unsigned int cpu)
int node = cpu_to_node(cpu);
int i, j;
- __sched_domains_numa_masks_set(node);
-
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
- if (!node_online(j))
+ if (!node_state(j, N_CPU))
continue;
/* Set ourselves in the remote node's masks */
@@ -1986,8 +2007,10 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
int i, j;
for (i = 0; i < sched_domains_numa_levels; i++) {
- for (j = 0; j < nr_node_ids; j++)
- cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ for (j = 0; j < nr_node_ids; j++) {
+ if (sched_domains_numa_masks[i][j])
+ cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+ }
}
}
@@ -2001,14 +2024,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
*/
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
- int i, j = cpu_to_node(cpu);
+ int i, j = cpu_to_node(cpu), found = nr_cpu_ids;
+ struct cpumask ***masks;
+ rcu_read_lock();
+ masks = rcu_dereference(sched_domains_numa_masks);
+ if (!masks)
+ goto unlock;
for (i = 0; i < sched_domains_numa_levels; i++) {
- cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
- if (cpu < nr_cpu_ids)
- return cpu;
+ if (!masks[i][j])
+ break;
+ cpu = cpumask_any_and(cpus, masks[i][j]);
+ if (cpu < nr_cpu_ids) {
+ found = cpu;
+ break;
+ }
}
- return nr_cpu_ids;
+unlock:
+ rcu_read_unlock();
+
+ return found;
}
#endif /* CONFIG_NUMA */
@@ -2242,6 +2277,57 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
}
}
+ /*
+ * Calculate an allowed NUMA imbalance such that LLCs do not get
+ * imbalanced.
+ */
+ for_each_cpu(i, cpu_map) {
+ unsigned int imb = 0;
+ unsigned int imb_span = 1;
+
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ struct sched_domain *child = sd->child;
+
+ if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child &&
+ (child->flags & SD_SHARE_PKG_RESOURCES)) {
+ struct sched_domain __rcu *top_p;
+ unsigned int nr_llcs;
+
+ /*
+ * For a single LLC per node, allow an
+ * imbalance up to 25% of the node. This is an
+ * arbitrary cutoff based on SMT-2 to balance
+ * between memory bandwidth and avoiding
+ * premature sharing of HT resources and SMT-4
+ * or SMT-8 *may* benefit from a different
+ * cutoff.
+ *
+ * For multiple LLCs, allow an imbalance
+ * until multiple tasks would share an LLC
+ * on one node while LLCs on another node
+ * remain idle.
+ */
+ nr_llcs = sd->span_weight / child->span_weight;
+ if (nr_llcs == 1)
+ imb = sd->span_weight >> 2;
+ else
+ imb = nr_llcs;
+ sd->imb_numa_nr = imb;
+
+ /* Set span based on the first NUMA domain. */
+ top_p = sd->parent;
+ while (top_p && !(top_p->flags & SD_NUMA)) {
+ top_p = top_p->parent;
+ }
+ imb_span = top_p ? top_p->span_weight : sd->span_weight;
+ } else {
+ int factor = max(1U, (sd->span_weight / imb_span));
+
+ sd->imb_numa_nr = imb * factor;
+ }
+ }
+ }
+
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
@@ -2351,7 +2437,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
- cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_TYPE_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
return err;
@@ -2440,7 +2526,7 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
if (doms_new) {
n = 1;
cpumask_and(doms_new[0], cpu_active_mask,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
}
} else {
n = ndoms_new;
@@ -2475,7 +2561,7 @@ match1:
n = 0;
doms_new = &fallback_doms;
cpumask_and(doms_new[0], cpu_active_mask,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
}
/* Build new domains: */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index eca38107b32f..9860bb9a847c 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -4,7 +4,6 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
-#include "sched.h"
void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 02ce292b9bc0..d4788f810b55 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
+
/*
* The implementation of the wait_bit*() and related waiting APIs:
*/
-#include "sched.h"
#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
diff --git a/kernel/scs.c b/kernel/scs.c
index 579841be8864..b7e1b096d906 100644
--- a/kernel/scs.c
+++ b/kernel/scs.c
@@ -32,15 +32,19 @@ static void *__scs_alloc(int node)
for (i = 0; i < NR_CACHED_SCS; i++) {
s = this_cpu_xchg(scs_cache[i], NULL);
if (s) {
- kasan_unpoison_vmalloc(s, SCS_SIZE);
+ s = kasan_unpoison_vmalloc(s, SCS_SIZE,
+ KASAN_VMALLOC_PROT_NORMAL);
memset(s, 0, SCS_SIZE);
- return s;
+ goto out;
}
}
- return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
+ s = __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END,
GFP_SCS, PAGE_KERNEL, 0, node,
__builtin_return_address(0));
+
+out:
+ return kasan_reset_tag(s);
}
void *scs_alloc(int node)
@@ -78,7 +82,7 @@ void scs_free(void *s)
if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL)
return;
- kasan_unpoison_vmalloc(s, SCS_SIZE);
+ kasan_unpoison_vmalloc(s, SCS_SIZE, KASAN_VMALLOC_PROT_NORMAL);
vfree_atomic(s);
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 4d8f44a17727..b5ac87f6dbd4 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -29,6 +29,9 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
+/* Not exposed in headers: strictly internal use only. */
+#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
+
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif
@@ -39,7 +42,6 @@
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/capability.h>
-#include <linux/tracehook.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
#include <linux/lockdep.h>
@@ -1010,6 +1012,7 @@ static void __secure_computing_strict(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
+ current->seccomp.mode = SECCOMP_MODE_DEAD;
seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
do_exit(SIGKILL);
}
@@ -1261,6 +1264,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
case SECCOMP_RET_KILL_THREAD:
case SECCOMP_RET_KILL_PROCESS:
default:
+ current->seccomp.mode = SECCOMP_MODE_DEAD;
seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
if (action != SECCOMP_RET_KILL_THREAD ||
@@ -1309,6 +1313,11 @@ int __secure_computing(const struct seccomp_data *sd)
return 0;
case SECCOMP_MODE_FILTER:
return __seccomp_filter(this_syscall, sd, false);
+ /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
+ case SECCOMP_MODE_DEAD:
+ WARN_ON_ONCE(1);
+ do_exit(SIGKILL);
+ return -1;
default:
BUG();
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 38602738866e..30cd1ca43bcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -32,7 +32,7 @@
#include <linux/signal.h>
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
-#include <linux/tracehook.h>
+#include <linux/task_work.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
@@ -1342,9 +1342,10 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
}
/*
* Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
- * debugging to leave init killable.
+ * debugging to leave init killable. But HANDLER_EXIT is always fatal.
*/
- if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
+ if (action->sa.sa_handler == SIG_DFL &&
+ (!t->ptrace || (handler == HANDLER_EXIT)))
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = send_signal(sig, info, t, PIDTYPE_PID);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -2188,14 +2189,17 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
* That makes it a way to test a stopped process for
* being ptrace-stopped vs being job-control-stopped.
*
- * If we actually decide not to stop at all because the tracer
- * is gone, we keep current->exit_code unless clear_code.
+ * Returns the signal the ptracer requested the code resume
+ * with. If the code did not stop because the tracer is gone,
+ * the stop signal remains unchanged unless clear_code.
*/
-static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t *info)
+static int ptrace_stop(int exit_code, int why, int clear_code,
+ unsigned long message, kernel_siginfo_t *info)
__releases(&current->sighand->siglock)
__acquires(&current->sighand->siglock)
{
bool gstop_done = false;
+ bool read_code = true;
if (arch_ptrace_stop_needed()) {
/*
@@ -2237,6 +2241,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
*/
smp_wmb();
+ current->ptrace_message = message;
current->last_siginfo = info;
current->exit_code = exit_code;
@@ -2303,8 +2308,9 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
/* tasklist protects us from ptrace_freeze_traced() */
__set_current_state(TASK_RUNNING);
+ read_code = false;
if (clear_code)
- current->exit_code = 0;
+ exit_code = 0;
read_unlock(&tasklist_lock);
}
@@ -2314,7 +2320,11 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* any signal-sending on another CPU that wants to examine it.
*/
spin_lock_irq(&current->sighand->siglock);
+ if (read_code)
+ exit_code = current->exit_code;
current->last_siginfo = NULL;
+ current->ptrace_message = 0;
+ current->exit_code = 0;
/* LISTENING can be set only during STOP traps, clear it */
current->jobctl &= ~JOBCTL_LISTENING;
@@ -2325,9 +2335,10 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* This sets TIF_SIGPENDING, but never clears it.
*/
recalc_sigpending_tsk(current);
+ return exit_code;
}
-static void ptrace_do_notify(int signr, int exit_code, int why)
+static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
{
kernel_siginfo_t info;
@@ -2338,18 +2349,21 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
/* Let the debugger run. */
- ptrace_stop(exit_code, why, 1, &info);
+ return ptrace_stop(exit_code, why, 1, message, &info);
}
-void ptrace_notify(int exit_code)
+int ptrace_notify(int exit_code, unsigned long message)
{
+ int signr;
+
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
- if (unlikely(current->task_works))
+ if (unlikely(task_work_pending(current)))
task_work_run();
spin_lock_irq(&current->sighand->siglock);
- ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
+ signr = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
spin_unlock_irq(&current->sighand->siglock);
+ return signr;
}
/**
@@ -2504,11 +2518,10 @@ static void do_jobctl_trap(void)
signr = SIGTRAP;
WARN_ON_ONCE(!signr);
ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
- CLD_STOPPED);
+ CLD_STOPPED, 0);
} else {
WARN_ON_ONCE(!signr);
- ptrace_stop(signr, CLD_STOPPED, 0, NULL);
- current->exit_code = 0;
+ ptrace_stop(signr, CLD_STOPPED, 0, 0, NULL);
}
}
@@ -2561,15 +2574,12 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
* comment in dequeue_signal().
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
- ptrace_stop(signr, CLD_TRAPPED, 0, info);
+ signr = ptrace_stop(signr, CLD_TRAPPED, 0, 0, info);
/* We're back. Did the debugger cancel the sig? */
- signr = current->exit_code;
if (signr == 0)
return signr;
- current->exit_code = 0;
-
/*
* Update the siginfo structure if the signal has
* changed. If the debugger wanted something
@@ -2626,20 +2636,12 @@ bool get_signal(struct ksignal *ksig)
struct signal_struct *signal = current->signal;
int signr;
- if (unlikely(current->task_works))
+ clear_notify_signal();
+ if (unlikely(task_work_pending(current)))
task_work_run();
- /*
- * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so
- * that the arch handlers don't all have to do it. If we get here
- * without TIF_SIGPENDING, just exit after running signal work.
- */
- if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
- if (test_thread_flag(TIF_NOTIFY_SIGNAL))
- tracehook_notify_signal();
- if (!task_sigpending(current))
- return false;
- }
+ if (!task_sigpending(current))
+ return false;
if (unlikely(uprobe_deny_signal()))
return false;
@@ -2898,7 +2900,8 @@ static void signal_delivered(struct ksignal *ksig, int stepping)
set_current_blocked(&blocked);
if (current->sas_ss_flags & SS_AUTODISARM)
sas_ss_reset(current);
- tracehook_signal_handler(stepping);
+ if (stepping)
+ ptrace_notify(SIGTRAP, 0);
}
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 41f470929e99..fac801815554 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -222,7 +222,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
u32 pending;
int curcnt;
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
lockdep_assert_irqs_enabled();
local_irq_save(flags);
@@ -305,7 +305,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
raw_local_irq_save(flags);
/*
@@ -352,14 +352,14 @@ static void __local_bh_enable(unsigned int cnt)
*/
void _local_bh_enable(void)
{
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
__local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(_local_bh_enable);
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
{
- WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(in_hardirq());
lockdep_assert_irqs_enabled();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_disable();
@@ -618,7 +618,7 @@ static inline void tick_irq_exit(void)
/* Make sure that timer wheel updates are propagated */
if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
- if (!in_irq())
+ if (!in_hardirq())
tick_nohz_irq_exit();
}
#endif
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index 66b8af394e58..ddb5a7f48d69 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -70,7 +70,7 @@ late_initcall(stackleak_sysctls_init);
#define skip_erasing() false
#endif /* CONFIG_STACKLEAK_RUNTIME_DISABLE */
-asmlinkage void notrace stackleak_erase(void)
+asmlinkage void noinstr stackleak_erase(void)
{
/* It would be nice not to have 'kstack_ptr' and 'boundary' on stack */
unsigned long kstack_ptr = current->lowest_stack;
@@ -124,9 +124,8 @@ asmlinkage void notrace stackleak_erase(void)
/* Reset the 'lowest_stack' value for the next syscall */
current->lowest_stack = current_top_of_stack() - THREAD_SIZE/64;
}
-NOKPROBE_SYMBOL(stackleak_erase);
-void __used __no_caller_saved_registers notrace stackleak_track_stack(void)
+void __used __no_caller_saved_registers noinstr stackleak_track_stack(void)
{
unsigned long sp = current_stack_pointer;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9c625257023d..9ed5ce989415 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -226,15 +226,12 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
.store = store,
.size = size,
};
- mm_segment_t fs;
/* Trace user stack if not a kernel thread */
if (current->flags & PF_KTHREAD)
return 0;
- fs = force_uaccess_begin();
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
- force_uaccess_end(fs);
return c.len;
}
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 43ba0b1e0edb..f2b8baea35d2 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -503,6 +503,7 @@ long __static_call_return0(void)
{
return 0;
}
+EXPORT_SYMBOL_GPL(__static_call_return0);
#ifdef CONFIG_STATIC_CALL_SELFTEST
diff --git a/kernel/sys.c b/kernel/sys.c
index ecc4cf019242..374f83e95239 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/utsname.h>
#include <linux/mman.h>
#include <linux/reboot.h>
@@ -472,6 +473,16 @@ static int set_user(struct cred *new)
if (!new_user)
return -EAGAIN;
+ free_uid(new->user);
+ new->user = new_user;
+ return 0;
+}
+
+static void flag_nproc_exceeded(struct cred *new)
+{
+ if (new->ucounts == current_ucounts())
+ return;
+
/*
* We don't fail in case of NPROC limit excess here because too many
* poorly written programs don't check set*uid() return code, assuming
@@ -480,15 +491,10 @@ static int set_user(struct cred *new)
* failure to the execve() stage.
*/
if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
- new_user != INIT_USER &&
- !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
+ new->user != INIT_USER)
current->flags |= PF_NPROC_EXCEEDED;
else
current->flags &= ~PF_NPROC_EXCEEDED;
-
- free_uid(new->user);
- new->user = new_user;
- return 0;
}
/*
@@ -563,6 +569,7 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (retval < 0)
goto error;
+ flag_nproc_exceeded(new);
return commit_creds(new);
error:
@@ -625,6 +632,7 @@ long __sys_setuid(uid_t uid)
if (retval < 0)
goto error;
+ flag_nproc_exceeded(new);
return commit_creds(new);
error:
@@ -704,6 +712,7 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if (retval < 0)
goto error;
+ flag_nproc_exceeded(new);
return commit_creds(new);
error:
@@ -1415,6 +1424,68 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
return errno;
}
+/* make sure you are allowed to change @tsk limits before calling this */
+static int do_prlimit(struct task_struct *tsk, unsigned int resource,
+ struct rlimit *new_rlim, struct rlimit *old_rlim)
+{
+ struct rlimit *rlim;
+ int retval = 0;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+ if (new_rlim) {
+ if (new_rlim->rlim_cur > new_rlim->rlim_max)
+ return -EINVAL;
+ if (resource == RLIMIT_NOFILE &&
+ new_rlim->rlim_max > sysctl_nr_open)
+ return -EPERM;
+ }
+
+ /* Holding a refcount on tsk protects tsk->signal from disappearing. */
+ rlim = tsk->signal->rlim + resource;
+ task_lock(tsk->group_leader);
+ if (new_rlim) {
+ /*
+ * Keep the capable check against init_user_ns until cgroups can
+ * contain all limits.
+ */
+ if (new_rlim->rlim_max > rlim->rlim_max &&
+ !capable(CAP_SYS_RESOURCE))
+ retval = -EPERM;
+ if (!retval)
+ retval = security_task_setrlimit(tsk, resource, new_rlim);
+ }
+ if (!retval) {
+ if (old_rlim)
+ *old_rlim = *rlim;
+ if (new_rlim)
+ *rlim = *new_rlim;
+ }
+ task_unlock(tsk->group_leader);
+
+ /*
+ * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
+ * infinite. In case of RLIM_INFINITY the posix CPU timer code
+ * ignores the rlimit.
+ */
+ if (!retval && new_rlim && resource == RLIMIT_CPU &&
+ new_rlim->rlim_cur != RLIM_INFINITY &&
+ IS_ENABLED(CONFIG_POSIX_TIMERS)) {
+ /*
+ * update_rlimit_cpu can fail if the task is exiting, but there
+ * may be other tasks in the thread group that are not exiting,
+ * and they need their cpu timers adjusted.
+ *
+ * The group_leader is the last task to be released, so if we
+ * cannot update_rlimit_cpu on it, then the entire process is
+ * exiting and we do not need to update at all.
+ */
+ update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
+ }
+
+ return retval;
+}
+
SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
{
struct rlimit value;
@@ -1558,63 +1629,6 @@ static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim)
rlim->rlim_max = (unsigned long)rlim64->rlim_max;
}
-/* make sure you are allowed to change @tsk limits before calling this */
-int do_prlimit(struct task_struct *tsk, unsigned int resource,
- struct rlimit *new_rlim, struct rlimit *old_rlim)
-{
- struct rlimit *rlim;
- int retval = 0;
-
- if (resource >= RLIM_NLIMITS)
- return -EINVAL;
- if (new_rlim) {
- if (new_rlim->rlim_cur > new_rlim->rlim_max)
- return -EINVAL;
- if (resource == RLIMIT_NOFILE &&
- new_rlim->rlim_max > sysctl_nr_open)
- return -EPERM;
- }
-
- /* protect tsk->signal and tsk->sighand from disappearing */
- read_lock(&tasklist_lock);
- if (!tsk->sighand) {
- retval = -ESRCH;
- goto out;
- }
-
- rlim = tsk->signal->rlim + resource;
- task_lock(tsk->group_leader);
- if (new_rlim) {
- /* Keep the capable check against init_user_ns until
- cgroups can contain all limits */
- if (new_rlim->rlim_max > rlim->rlim_max &&
- !capable(CAP_SYS_RESOURCE))
- retval = -EPERM;
- if (!retval)
- retval = security_task_setrlimit(tsk, resource, new_rlim);
- }
- if (!retval) {
- if (old_rlim)
- *old_rlim = *rlim;
- if (new_rlim)
- *rlim = *new_rlim;
- }
- task_unlock(tsk->group_leader);
-
- /*
- * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not
- * infinite. In case of RLIM_INFINITY the posix CPU timer code
- * ignores the rlimit.
- */
- if (!retval && new_rlim && resource == RLIMIT_CPU &&
- new_rlim->rlim_cur != RLIM_INFINITY &&
- IS_ENABLED(CONFIG_POSIX_TIMERS))
- update_rlimit_cpu(tsk, new_rlim->rlim_cur);
-out:
- read_unlock(&tasklist_lock);
- return retval;
-}
-
/* rcu lock must be held */
static int check_prlimit_permission(struct task_struct *task,
unsigned int flags)
@@ -2278,15 +2292,16 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
{
struct mm_struct *mm = current->mm;
const char __user *uname;
- char *name, *pch;
+ struct anon_vma_name *anon_name = NULL;
int error;
switch (opt) {
case PR_SET_VMA_ANON_NAME:
uname = (const char __user *)arg;
if (uname) {
- name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
+ char *name, *pch;
+ name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
if (IS_ERR(name))
return PTR_ERR(name);
@@ -2296,15 +2311,18 @@ static int prctl_set_vma(unsigned long opt, unsigned long addr,
return -EINVAL;
}
}
- } else {
- /* Reset the name */
- name = NULL;
+ /* anon_vma has its own copy */
+ anon_name = anon_vma_name_alloc(name);
+ kfree(name);
+ if (!anon_name)
+ return -ENOMEM;
+
}
mmap_write_lock(mm);
- error = madvise_set_anon_name(mm, addr, size, name);
+ error = madvise_set_anon_name(mm, addr, size, anon_name);
mmap_write_unlock(mm);
- kfree(name);
+ anon_vma_name_put(anon_name);
break;
default:
error = -EINVAL;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ae443b2882e..830aaf8ca08e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -180,6 +180,10 @@ static int bpf_stats_handler(struct ctl_table *table, int write,
return ret;
}
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
static int bpf_unpriv_handler(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -197,6 +201,9 @@ static int bpf_unpriv_handler(struct ctl_table *table, int write,
return -EPERM;
*(int *)table->data = unpriv_enable;
}
+
+ unpriv_ebpf_notify(unpriv_enable);
+
return ret;
}
#endif /* CONFIG_BPF_SYSCALL && CONFIG_SYSCTL */
@@ -1689,7 +1696,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sysctl_numa_balancing,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_FOUR,
},
#endif /* CONFIG_NUMA_BALANCING */
{
@@ -1750,17 +1757,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = sysctl_sched_uclamp_handler,
},
#endif
-#ifdef CONFIG_SCHED_AUTOGROUP
- {
- .procname = "sched_autogroup_enabled",
- .data = &sysctl_sched_autogroup_enabled,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
- },
-#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
.procname = "sched_cfs_bandwidth_slice_us",
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 1698fbe6f0e1..c59e1a49bc40 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/spinlock.h>
#include <linux/task_work.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
static struct callback_head work_exited; /* all we need is ->next == NULL */
@@ -78,7 +78,7 @@ task_work_cancel_match(struct task_struct *task,
struct callback_head *work;
unsigned long flags;
- if (likely(!task->task_works))
+ if (likely(!task_work_pending(task)))
return NULL;
/*
* If cmpxchg() fails we continue without updating pprev.
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 2b4898b4752e..bcac5a9043aa 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -113,13 +113,14 @@ static void send_cpu_listeners(struct sk_buff *skb,
struct listener *s, *tmp;
struct sk_buff *skb_next, *skb_cur = skb;
void *reply = genlmsg_data(genlhdr);
- int rc, delcount = 0;
+ int delcount = 0;
genlmsg_end(skb, reply);
- rc = 0;
down_read(&listeners->sem);
list_for_each_entry(s, &listeners->list, list) {
+ int rc;
+
skb_next = NULL;
if (!list_is_last(&s->list, &listeners->list)) {
skb_next = skb_clone(skb_cur, GFP_KERNEL);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 04bfd62f5e5c..27b7868b5c30 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -181,5 +181,14 @@ config HIGH_RES_TIMERS
hardware is not capable then this option only increases
the size of the kernel image.
+config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+ int "Clocksource watchdog maximum allowable skew (in μs)"
+ depends on CLOCKSOURCE_WATCHDOG
+ range 50 1000
+ default 100
+ help
+ Specify the maximum amount of allowable watchdog skew in
+ microseconds before reporting the clocksource to be unstable.
+
endmenu
endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1cf73807b450..95d7ca35bdf2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,7 +107,13 @@ static u64 suspend_start;
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
*/
-#define WATCHDOG_MAX_SKEW (100 * NSEC_PER_USEC)
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
+#else
+#define MAX_SKEW_USEC 100
+#endif
+
+#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 96b4e7810426..0a97193984db 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -15,6 +15,7 @@
#include <linux/workqueue.h>
#include <linux/compat.h>
#include <linux/sched/deadline.h>
+#include <linux/task_work.h>
#include "posix-timers.h"
@@ -34,14 +35,20 @@ void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit)
* tsk->signal->posix_cputimers.bases[clock].nextevt expiration cache if
* necessary. Needs siglock protection since other code may update the
* expiration cache as well.
+ *
+ * Returns 0 on success, -ESRCH on failure. Can fail if the task is exiting and
+ * we cannot lock_task_sighand. Cannot fail if task is current.
*/
-void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
+int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new)
{
u64 nsecs = rlim_new * NSEC_PER_SEC;
+ unsigned long irq_fl;
- spin_lock_irq(&task->sighand->siglock);
+ if (!lock_task_sighand(task, &irq_fl))
+ return -ESRCH;
set_process_cpu_timer(task, CPUCLOCK_PROF, &nsecs, NULL);
- spin_unlock_irq(&task->sighand->siglock);
+ unlock_task_sighand(task, &irq_fl);
+ return 0;
}
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 17a283ce2b20..2d76c91b85de 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -169,6 +169,8 @@ static ktime_t tick_init_jiffy_update(void)
return period;
}
+#define MAX_STALLED_JIFFIES 5
+
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
int cpu = smp_processor_id();
@@ -196,6 +198,21 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
if (tick_do_timer_cpu == cpu)
tick_do_update_jiffies64(now);
+ /*
+ * If jiffies update stalled for too long (timekeeper in stop_machine()
+ * or VMEXIT'ed for several msecs), force an update.
+ */
+ if (ts->last_tick_jiffies != jiffies) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ } else {
+ if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
+ tick_do_update_jiffies64(now);
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
+ }
+
if (ts->inidle)
ts->got_idle_tick = 1;
}
@@ -768,7 +785,7 @@ static inline bool local_timer_softirq_pending(void)
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
- u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+ u64 basemono, next_tick, delta, expires;
unsigned long basejiff;
unsigned int seq;
@@ -791,7 +808,7 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
* minimal delta which brings us back to this place
* immediately. Lather, rinse and repeat...
*/
- if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ if (rcu_needs_cpu() || arch_needs_cpu() ||
irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
@@ -802,10 +819,8 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
* disabled this also looks at the next expiring
* hrtimer.
*/
- next_tmr = get_next_timer_interrupt(basejiff, basemono);
- ts->next_timer = next_tmr;
- /* Take the next rcu event into account */
- next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
+ next_tick = get_next_timer_interrupt(basejiff, basemono);
+ ts->next_timer = next_tick;
}
/*
@@ -984,6 +999,45 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
__tick_nohz_full_update_tick(ts, ktime_get());
}
+/*
+ * A pending softirq outside an IRQ (or softirq disabled section) context
+ * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
+ * reach here due to the need_resched() early check in can_stop_idle_tick().
+ *
+ * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
+ * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
+ * triggering the below since wakep_softirqd() is ignored.
+ *
+ */
+static bool report_idle_softirq(void)
+{
+ static int ratelimit;
+ unsigned int pending = local_softirq_pending();
+
+ if (likely(!pending))
+ return false;
+
+ /* Some softirqs claim to be safe against hotplug and ksoftirqd parking */
+ if (!cpu_active(smp_processor_id())) {
+ pending &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
+ if (!pending)
+ return false;
+ }
+
+ if (ratelimit < 10)
+ return false;
+
+ /* On RT, softirqs handling may be waiting on some lock */
+ if (!local_bh_blocked())
+ return false;
+
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+
+ return true;
+}
+
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*
@@ -1010,17 +1064,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (need_resched())
return false;
- if (unlikely(local_softirq_pending())) {
- static int ratelimit;
-
- if (ratelimit < 10 && !local_bh_blocked() &&
- (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
- pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n",
- (unsigned int) local_softirq_pending());
- ratelimit++;
- }
+ if (unlikely(report_idle_softirq()))
return false;
- }
if (tick_nohz_full_enabled()) {
/*
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index d952ae393423..504649513399 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -49,6 +49,8 @@ enum tick_nohz_mode {
* @timer_expires_base: Base time clock monotonic for @timer_expires
* @next_timer: Expiry time of next expiring timer for debugging purpose only
* @tick_dep_mask: Tick dependency mask - is set, if someone needs the tick
+ * @last_tick_jiffies: Value of jiffies seen on last tick
+ * @stalled_jiffies: Number of stalled jiffies detected across ticks
*/
struct tick_sched {
struct hrtimer sched_timer;
@@ -77,6 +79,8 @@ struct tick_sched {
u64 next_timer;
ktime_t idle_expires;
atomic_t tick_dep_mask;
+ unsigned long last_tick_jiffies;
+ unsigned int stalled_jiffies;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);
diff --git a/kernel/torture.c b/kernel/torture.c
index ef27a6c82451..789aeb0e1159 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -911,7 +911,7 @@ void torture_kthread_stopping(char *title)
{
char buf[128];
- snprintf(buf, sizeof(buf), "Stopping %s", title);
+ snprintf(buf, sizeof(buf), "%s is stopping", title);
VERBOSE_TOROUT_STRING(buf);
while (!kthread_should_stop()) {
torture_shutdown_absorb(title);
@@ -931,12 +931,14 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
int ret = 0;
VERBOSE_TOROUT_STRING(m);
- *tp = kthread_run(fn, arg, "%s", s);
+ *tp = kthread_create(fn, arg, "%s", s);
if (IS_ERR(*tp)) {
ret = PTR_ERR(*tp);
TOROUT_ERRSTRING(f);
*tp = NULL;
+ return ret;
}
+ wake_up_process(*tp); // Process is sleeping, so ordering provided.
torture_shuffle_task_register(*tp);
return ret;
}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 752ed89a293b..2c43e327a619 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
+config HAVE_RETHOOK
+ bool
+
+config RETHOOK
+ bool
+ depends on HAVE_RETHOOK
+ help
+ Enable generic return hooking feature. This is an internal
+ API, which will be used by other function-entry hooking
+ features like fprobe and kprobes.
+
config HAVE_FUNCTION_TRACER
bool
help
@@ -70,10 +81,16 @@ config HAVE_C_RECORDMCOUNT
help
C version of recordmcount available?
+config HAVE_BUILDTIME_MCOUNT_SORT
+ bool
+ help
+ An architecture selects this if it sorts the mcount_loc section
+ at build time.
+
config BUILDTIME_MCOUNT_SORT
bool
default y
- depends on BUILDTIME_TABLE_SORT && !S390
+ depends on HAVE_BUILDTIME_MCOUNT_SORT && DYNAMIC_FTRACE
help
Sort the mcount_loc section at build time.
@@ -230,6 +247,21 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+config FPROBE
+ bool "Kernel Function Probe (fprobe)"
+ depends on FUNCTION_TRACER
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on HAVE_RETHOOK
+ select RETHOOK
+ default n
+ help
+ This option enables kernel function probe (fprobe) based on ftrace.
+ The fprobe is similar to kprobes, but probes only for kernel function
+ entries and exits. This also can probe multiple functions by one
+ fprobe.
+
+ If unsure, say N.
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
@@ -731,6 +763,21 @@ config SYNTH_EVENTS
If in doubt, say N.
+config USER_EVENTS
+ bool "User trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ depends on BROKEN || COMPILE_TEST # API needs to be straighten out
+ help
+ User trace events are user-defined trace events that
+ can be used like an existing kernel trace event. User trace
+ events are generated by writing to a tracefs file. User
+ processes can determine if their tracing events should be
+ generated by memory mapping a tracefs file and checking for
+ an associated byte being non-zero.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bedc5caceec7..d77cd8032213 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
+obj-$(CONFIG_USER_EVENTS) += trace_events_user.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
@@ -97,6 +98,8 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
+obj-$(CONFIG_FPROBE) += fprobe.o
+obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index af68a67179b4..4d5629196d01 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -310,10 +310,20 @@ record_it:
local_irq_restore(flags);
}
-static void blk_trace_free(struct blk_trace *bt)
+static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
{
relay_close(bt->rchan);
- debugfs_remove(bt->dir);
+
+ /*
+ * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
+ * under 'q->debugfs_dir', thus lookup and remove them.
+ */
+ if (!bt->dir) {
+ debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
+ debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
+ } else {
+ debugfs_remove(bt->dir);
+ }
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -335,10 +345,10 @@ static void put_probe_ref(void)
mutex_unlock(&blk_probe_mutex);
}
-static void blk_trace_cleanup(struct blk_trace *bt)
+static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
{
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
put_probe_ref();
}
@@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q)
return -EINVAL;
if (bt->trace_state != Blktrace_running)
- blk_trace_cleanup(bt);
+ blk_trace_cleanup(q, bt);
return 0;
}
@@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = 0;
err:
if (ret)
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -1616,7 +1626,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
put_probe_ref();
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return 0;
}
@@ -1647,7 +1657,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
return 0;
free_bt:
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -1892,7 +1902,6 @@ void blk_fill_rwbs(char *rwbs, unsigned int op)
switch (op & REQ_OP_MASK) {
case REQ_OP_WRITE:
- case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
break;
case REQ_OP_DISCARD:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 21aa30644219..7fa2ebc07f60 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,6 +17,9 @@
#include <linux/error-injection.h>
#include <linux/btf_ids.h>
#include <linux/bpf_lsm.h>
+#include <linux/fprobe.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include <net/bpf_sk_storage.h>
@@ -77,6 +80,8 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
u64 flags, const struct btf **btf,
s32 *btf_id);
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
/**
* trace_call_bpf - invoke BPF program
@@ -332,8 +337,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -835,8 +838,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -1036,6 +1037,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_entry_ip(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
+ .func = bpf_get_func_ip_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_cookie(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
+ .func = bpf_get_attach_cookie_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
{
struct bpf_trace_run_ctx *run_ctx;
@@ -1235,6 +1260,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+ case BPF_FUNC_copy_from_user_task:
+ return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_per_cpu_ptr:
@@ -1277,9 +1304,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- return &bpf_get_func_ip_proto_kprobe;
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+ &bpf_get_func_ip_proto_kprobe_multi :
+ &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- return &bpf_get_attach_cookie_proto_trace;
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+ &bpf_get_attach_cookie_proto_kmulti :
+ &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1562,6 +1593,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
extern const struct bpf_func_proto bpf_skb_output_proto;
extern const struct bpf_func_proto bpf_xdp_output_proto;
+extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
@@ -1661,6 +1693,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_from_file_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_ptr_cookie_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_trace_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
@@ -2176,3 +2210,314 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_FPROBE
+struct bpf_kprobe_multi_link {
+ struct bpf_link link;
+ struct fprobe fp;
+ unsigned long *addrs;
+ u64 *cookies;
+ u32 cnt;
+};
+
+struct bpf_kprobe_multi_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ unsigned long entry_ip;
+};
+
+static void bpf_kprobe_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ unregister_fprobe(&kmulti_link->fp);
+}
+
+static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ kvfree(kmulti_link->addrs);
+ kvfree(kmulti_link->cookies);
+ kfree(kmulti_link);
+}
+
+static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
+ .release = bpf_kprobe_multi_link_release,
+ .dealloc = bpf_kprobe_multi_link_dealloc,
+};
+
+static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
+{
+ const struct bpf_kprobe_multi_link *link = priv;
+ unsigned long *addr_a = a, *addr_b = b;
+ u64 *cookie_a, *cookie_b;
+ unsigned long tmp1;
+ u64 tmp2;
+
+ cookie_a = link->cookies + (addr_a - link->addrs);
+ cookie_b = link->cookies + (addr_b - link->addrs);
+
+ /* swap addr_a/addr_b and cookie_a/cookie_b values */
+ tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1;
+ tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2;
+}
+
+static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
+{
+ const unsigned long *addr_a = a, *addr_b = b;
+
+ if (*addr_a == *addr_b)
+ return 0;
+ return *addr_a < *addr_b ? -1 : 1;
+}
+
+static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
+{
+ return __bpf_kprobe_multi_cookie_cmp(a, b);
+}
+
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ u64 *cookie, entry_ip;
+ unsigned long *addr;
+
+ if (WARN_ON_ONCE(!ctx))
+ return 0;
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ link = run_ctx->link;
+ if (!link->cookies)
+ return 0;
+ entry_ip = run_ctx->entry_ip;
+ addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
+ __bpf_kprobe_multi_cookie_cmp);
+ if (!addr)
+ return 0;
+ cookie = link->cookies + (addr - link->addrs);
+ return *cookie;
+}
+
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ return run_ctx->entry_ip;
+}
+
+static int
+kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
+ unsigned long entry_ip, struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .link = link,
+ .entry_ip = entry_ip,
+ };
+ struct bpf_run_ctx *old_run_ctx;
+ int err;
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ err = 0;
+ goto out;
+ }
+
+ migrate_disable();
+ rcu_read_lock();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ err = bpf_prog_run(link->link.prog, regs);
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock();
+ migrate_enable();
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ return err;
+}
+
+static void
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, entry_ip, regs);
+}
+
+static int
+kprobe_multi_resolve_syms(const void *usyms, u32 cnt,
+ unsigned long *addrs)
+{
+ unsigned long addr, size;
+ const char **syms;
+ int err = -ENOMEM;
+ unsigned int i;
+ char *func;
+
+ size = cnt * sizeof(*syms);
+ syms = kvzalloc(size, GFP_KERNEL);
+ if (!syms)
+ return -ENOMEM;
+
+ func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
+ if (!func)
+ goto error;
+
+ if (copy_from_user(syms, usyms, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN);
+ if (err == KSYM_NAME_LEN)
+ err = -E2BIG;
+ if (err < 0)
+ goto error;
+ err = -EINVAL;
+ addr = kallsyms_lookup_name(func);
+ if (!addr)
+ goto error;
+ if (!kallsyms_lookup_size_offset(addr, &size, NULL))
+ goto error;
+ addr = ftrace_location_range(addr, addr + size - 1);
+ if (!addr)
+ goto error;
+ addrs[i] = addr;
+ }
+
+ err = 0;
+error:
+ kvfree(syms);
+ kfree(func);
+ return err;
+}
+
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_kprobe_multi_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ void __user *ucookies;
+ unsigned long *addrs;
+ u32 flags, cnt, size;
+ void __user *uaddrs;
+ u64 *cookies = NULL;
+ void __user *usyms;
+ int err;
+
+ /* no support for 32bit archs yet */
+ if (sizeof(u64) != sizeof(void *))
+ return -EOPNOTSUPP;
+
+ if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+
+ flags = attr->link_create.kprobe_multi.flags;
+ if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
+ return -EINVAL;
+
+ uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
+ usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
+ if (!!uaddrs == !!usyms)
+ return -EINVAL;
+
+ cnt = attr->link_create.kprobe_multi.cnt;
+ if (!cnt)
+ return -EINVAL;
+
+ size = cnt * sizeof(*addrs);
+ addrs = kvmalloc(size, GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+
+ if (uaddrs) {
+ if (copy_from_user(addrs, uaddrs, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ err = kprobe_multi_resolve_syms(usyms, cnt, addrs);
+ if (err)
+ goto error;
+ }
+
+ ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc(size, GFP_KERNEL);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
+ &bpf_kprobe_multi_link_lops, prog);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ if (flags & BPF_F_KPROBE_MULTI_RETURN)
+ link->fp.exit_handler = kprobe_multi_link_handler;
+ else
+ link->fp.entry_handler = kprobe_multi_link_handler;
+
+ link->addrs = addrs;
+ link->cookies = cookies;
+ link->cnt = cnt;
+
+ if (cookies) {
+ /*
+ * Sorting addresses will trigger sorting cookies as well
+ * (check bpf_kprobe_multi_cookie_swap). This way we can
+ * find cookie based on the address in bpf_get_attach_cookie
+ * helper.
+ */
+ sort_r(addrs, cnt, sizeof(*addrs),
+ bpf_kprobe_multi_cookie_cmp,
+ bpf_kprobe_multi_cookie_swap,
+ link);
+ }
+
+ err = register_fprobe_ips(&link->fp, addrs, cnt);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+error:
+ kfree(link);
+ kvfree(addrs);
+ kvfree(cookies);
+ return err;
+}
+#else /* !CONFIG_FPROBE */
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+#endif
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d38fc00..8f4fb328133a 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,6 +7,7 @@
*
* Highly modified by Steven Rostedt (VMware).
*/
+#include <linux/jump_label.h>
#include <linux/suspend.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
@@ -23,25 +24,13 @@
#define ASSIGN_OPS_HASH(opsname, val)
#endif
-static bool kill_ftrace_graph;
+DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
int ftrace_graph_active;
/* Both enabled by default (can be cleared by function_graph tracer flags */
static bool fgraph_sleep_time = true;
/**
- * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
- *
- * ftrace_graph_stop() is called when a severe error is detected in
- * the function graph tracing. This function is called by the critical
- * paths of function graph to keep those paths from doing any more harm.
- */
-bool ftrace_graph_is_dead(void)
-{
- return kill_ftrace_graph;
-}
-
-/**
* ftrace_graph_stop - set to permanently disable function graph tracing
*
* In case of an error int function graph tracing, this is called
@@ -51,7 +40,7 @@ bool ftrace_graph_is_dead(void)
*/
void ftrace_graph_stop(void)
{
- kill_ftrace_graph = true;
+ static_branch_enable(&kill_ftrace_graph);
}
/* Add a function return address to the trace stack on thread info.*/
@@ -415,7 +404,9 @@ free:
static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
new file mode 100644
index 000000000000..89d9f994ebb0
--- /dev/null
+++ b/kernel/trace/fprobe.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fprobe - Simple ftrace probe wrapper for function entry.
+ */
+#define pr_fmt(fmt) "fprobe: " fmt
+
+#include <linux/err.h>
+#include <linux/fprobe.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "trace.h"
+
+struct fprobe_rethook_node {
+ struct rethook_node node;
+ unsigned long entry_ip;
+};
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe_rethook_node *fpr;
+ struct rethook_node *rh;
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
+ if (fp->entry_handler)
+ fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
+
+ if (fp->exit_handler) {
+ rh = rethook_try_get(fp->rethook);
+ if (!rh) {
+ fp->nmissed++;
+ goto out;
+ }
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+ fpr->entry_ip = ip;
+ rethook_hook(rh, ftrace_get_regs(fregs), true);
+ }
+
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_handler);
+
+static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp = container_of(ops, struct fprobe, ops);
+
+ if (unlikely(kprobe_running())) {
+ fp->nmissed++;
+ return;
+ }
+ kprobe_busy_begin();
+ fprobe_handler(ip, parent_ip, ops, fregs);
+ kprobe_busy_end();
+}
+
+static void fprobe_exit_handler(struct rethook_node *rh, void *data,
+ struct pt_regs *regs)
+{
+ struct fprobe *fp = (struct fprobe *)data;
+ struct fprobe_rethook_node *fpr;
+
+ if (!fp || fprobe_disabled(fp))
+ return;
+
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+
+ fp->exit_handler(fp, fpr->entry_ip, regs);
+}
+NOKPROBE_SYMBOL(fprobe_exit_handler);
+
+/* Convert ftrace location address from symbols */
+static unsigned long *get_ftrace_locations(const char **syms, int num)
+{
+ unsigned long addr, size;
+ unsigned long *addrs;
+ int i;
+
+ /* Convert symbols to symbol address */
+ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < num; i++) {
+ addr = kallsyms_lookup_name(syms[i]);
+ if (!addr) /* Maybe wrong symbol */
+ goto error;
+
+ /* Convert symbol address to ftrace location. */
+ if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size)
+ goto error;
+
+ addr = ftrace_location_range(addr, addr + size - 1);
+ if (!addr) /* No dynamic ftrace there. */
+ goto error;
+
+ addrs[i] = addr;
+ }
+
+ return addrs;
+
+error:
+ kfree(addrs);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static void fprobe_init(struct fprobe *fp)
+{
+ fp->nmissed = 0;
+ if (fprobe_shared_with_kprobes(fp))
+ fp->ops.func = fprobe_kprobe_handler;
+ else
+ fp->ops.func = fprobe_handler;
+ fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
+}
+
+static int fprobe_init_rethook(struct fprobe *fp, int num)
+{
+ int i, size;
+
+ if (num < 0)
+ return -EINVAL;
+
+ if (!fp->exit_handler) {
+ fp->rethook = NULL;
+ return 0;
+ }
+
+ /* Initialize rethook if needed */
+ size = num * num_possible_cpus() * 2;
+ if (size < 0)
+ return -E2BIG;
+
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
+ for (i = 0; i < size; i++) {
+ struct fprobe_rethook_node *node;
+
+ node = kzalloc(sizeof(*node), GFP_KERNEL);
+ if (!node) {
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ return -ENOMEM;
+ }
+ rethook_add_node(fp->rethook, &node->node);
+ }
+ return 0;
+}
+
+static void fprobe_fail_cleanup(struct fprobe *fp)
+{
+ if (fp->rethook) {
+ /* Don't need to cleanup rethook->handler because this is not used. */
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ }
+ ftrace_free_filter(&fp->ops);
+}
+
+/**
+ * register_fprobe() - Register fprobe to ftrace by pattern.
+ * @fp: A fprobe data structure to be registered.
+ * @filter: A wildcard pattern of probed symbols.
+ * @notfilter: A wildcard pattern of NOT probed symbols.
+ *
+ * Register @fp to ftrace for enabling the probe on the symbols matched to @filter.
+ * If @notfilter is not NULL, the symbols matched the @notfilter are not probed.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
+{
+ struct ftrace_hash *hash;
+ unsigned char *str;
+ int ret, len;
+
+ if (!fp || !filter)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ len = strlen(filter);
+ str = kstrdup(filter, GFP_KERNEL);
+ ret = ftrace_set_filter(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ return ret;
+
+ if (notfilter) {
+ len = strlen(notfilter);
+ str = kstrdup(notfilter, GFP_KERNEL);
+ ret = ftrace_set_notrace(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ goto out;
+ }
+
+ /* TODO:
+ * correctly calculate the total number of filtered symbols
+ * from both filter and notfilter.
+ */
+ hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
+ if (WARN_ON_ONCE(!hash))
+ goto out;
+
+ ret = fprobe_init_rethook(fp, (int)hash->count);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+out:
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe);
+
+/**
+ * register_fprobe_ips() - Register fprobe to ftrace by address.
+ * @fp: A fprobe data structure to be registered.
+ * @addrs: An array of target ftrace location addresses.
+ * @num: The number of entries of @addrs.
+ *
+ * Register @fp to ftrace for enabling the probe on the address given by @addrs.
+ * The @addrs must be the addresses of ftrace location address, which may be
+ * the symbol address + arch-dependent offset.
+ * If you unsure what this mean, please use other registration functions.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ int ret;
+
+ if (!fp || !addrs || num <= 0)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ ret = fprobe_init_rethook(fp, num);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_ips);
+
+/**
+ * register_fprobe_syms() - Register fprobe to ftrace by symbols.
+ * @fp: A fprobe data structure to be registered.
+ * @syms: An array of target symbols.
+ * @num: The number of entries of @syms.
+ *
+ * Register @fp to the symbols given by @syms array. This will be useful if
+ * you are sure the symbols exist in the kernel.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
+{
+ unsigned long *addrs;
+ int ret;
+
+ if (!fp || !syms || num <= 0)
+ return -EINVAL;
+
+ addrs = get_ftrace_locations(syms, num);
+ if (IS_ERR(addrs))
+ return PTR_ERR(addrs);
+
+ ret = register_fprobe_ips(fp, addrs, num);
+
+ kfree(addrs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_syms);
+
+/**
+ * unregister_fprobe() - Unregister fprobe from ftrace
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret;
+
+ if (!fp || fp->ops.func != fprobe_handler)
+ return -EINVAL;
+
+ /*
+ * rethook_free() starts disabling the rethook, but the rethook handlers
+ * may be running on other processors at this point. To make sure that all
+ * current running handlers are finished, call unregister_ftrace_function()
+ * after this.
+ */
+ if (fp->rethook)
+ rethook_free(fp->rethook);
+
+ ret = unregister_ftrace_function(&fp->ops);
+ if (ret < 0)
+ return ret;
+
+ ftrace_free_filter(&fp->ops);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f9feb197b2da..4f1d2f5e7263 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1568,17 +1568,34 @@ unsigned long ftrace_location_range(unsigned long start, unsigned long end)
}
/**
- * ftrace_location - return true if the ip giving is a traced location
+ * ftrace_location - return the ftrace location
* @ip: the instruction pointer to check
*
- * Returns rec->ip if @ip given is a pointer to a ftrace location.
- * That is, the instruction that is either a NOP or call to
- * the function tracer. It checks the ftrace internal tables to
- * determine if the address belongs or not.
+ * If @ip matches the ftrace location, return @ip.
+ * If @ip matches sym+0, return sym's ftrace location.
+ * Otherwise, return 0.
*/
unsigned long ftrace_location(unsigned long ip)
{
- return ftrace_location_range(ip, ip);
+ struct dyn_ftrace *rec;
+ unsigned long offset;
+ unsigned long size;
+
+ rec = lookup_rec(ip, ip);
+ if (!rec) {
+ if (!kallsyms_lookup_size_offset(ip, &size, &offset))
+ goto out;
+
+ /* map sym+0 to __fentry__ */
+ if (!offset)
+ rec = lookup_rec(ip, ip + size - 1);
+ }
+
+ if (rec)
+ return rec->ip;
+
+out:
+ return 0;
}
/**
@@ -4958,11 +4975,12 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
}
static int
-ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
{
struct ftrace_func_entry *entry;
- if (!ftrace_location(ip))
+ ip = ftrace_location(ip);
+ if (!ip)
return -EINVAL;
if (remove) {
@@ -4977,8 +4995,29 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
}
static int
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
+ unsigned int cnt, int remove)
+{
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < cnt; i++) {
+ err = __ftrace_match_addr(hash, ips[i], remove);
+ if (err) {
+ /*
+ * This expects the @hash is a temporary hash and if this
+ * fails the caller must free the @hash.
+ */
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
- unsigned long ip, int remove, int reset, int enable)
+ unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5008,8 +5047,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = -EINVAL;
goto out_regex_unlock;
}
- if (ip) {
- ret = ftrace_match_addr(hash, ip, remove);
+ if (ips) {
+ ret = ftrace_match_addr(hash, ips, cnt, remove);
if (ret < 0)
goto out_regex_unlock;
}
@@ -5026,10 +5065,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
static int
-ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
- int reset, int enable)
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -5110,11 +5149,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
struct ftrace_func_entry *entry;
struct ftrace_hash *free_hash = NULL;
struct dyn_ftrace *rec;
- int ret = -EBUSY;
+ int ret = -ENODEV;
mutex_lock(&direct_mutex);
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
/* See if there's a direct function at @ip already */
+ ret = -EBUSY;
if (ftrace_find_rec_direct(ip))
goto out_unlock;
@@ -5222,6 +5266,10 @@ int unregister_ftrace_direct(unsigned long ip, unsigned long addr)
mutex_lock(&direct_mutex);
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
entry = find_direct_entry(&ip, NULL);
if (!entry)
goto out_unlock;
@@ -5354,6 +5402,11 @@ int modify_ftrace_direct(unsigned long ip,
mutex_lock(&direct_mutex);
mutex_lock(&ftrace_lock);
+
+ ip = ftrace_location(ip);
+ if (!ip)
+ goto out_unlock;
+
entry = find_direct_entry(&ip, &rec);
if (!entry)
goto out_unlock;
@@ -5634,11 +5687,30 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
{
ftrace_ops_init(ops);
- return ftrace_set_addr(ops, ip, remove, reset, 1);
+ return ftrace_set_addr(ops, &ip, 1, remove, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
/**
+ * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
+ * @ops - the ops to set the filter with
+ * @ips - the array of addresses to add to or remove from the filter.
+ * @cnt - the number of addresses in @ips
+ * @remove - non zero to remove ips from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ips array or any ip specified within is NULL , it fails to update filter.
+ */
+int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt, int remove, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_addr(ops, ips, cnt, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
+
+/**
* ftrace_ops_set_global_filter - setup ops to use global filters
* @ops - the ops which will use the global filters
*
@@ -5659,7 +5731,7 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+ return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
}
/**
@@ -7096,6 +7168,8 @@ void __init ftrace_free_init_mem(void)
void *start = (void *)(&__init_begin);
void *end = (void *)(&__init_end);
+ ftrace_boot_snapshot();
+
ftrace_free_mem(NULL, start, end);
}
@@ -7191,7 +7265,6 @@ static int __init ftrace_nodyn_init(void)
core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; }
-static inline void ftrace_startup_enable(int command) { }
static inline void ftrace_startup_all(int command) { }
# define ftrace_startup_sysctl() do { } while (0)
@@ -7347,7 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
@@ -7791,7 +7866,7 @@ int ftrace_is_dead(void)
/**
* register_ftrace_function - register a function for profiling
- * @ops - ops structure that holds the function for profiling.
+ * @ops: ops structure that holds the function for profiling.
*
* Register a function to be called by all functions in the
* kernel.
@@ -7818,7 +7893,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_function);
/**
* unregister_ftrace_function - unregister a function for profiling.
- * @ops - ops structure that holds the function to unregister
+ * @ops: ops structure that holds the function to unregister
*
* Unregister a function that was added to be called by ftrace profiling.
*/
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
new file mode 100644
index 000000000000..ab463a4d2b23
--- /dev/null
+++ b/kernel/trace/rethook.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "rethook: " fmt
+
+#include <linux/bug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/preempt.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+/* Return hook list (shadow stack by list) */
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void rethook_flush_task(struct task_struct *tk)
+{
+ struct rethook_node *rhn;
+ struct llist_node *node;
+
+ node = __llist_del_all(&tk->rethooks);
+ while (node) {
+ rhn = container_of(node, struct rethook_node, llist);
+ node = node->next;
+ preempt_disable();
+ rethook_recycle(rhn);
+ preempt_enable();
+ }
+}
+
+static void rethook_free_rcu(struct rcu_head *head)
+{
+ struct rethook *rh = container_of(head, struct rethook, rcu);
+ struct rethook_node *rhn;
+ struct freelist_node *node;
+ int count = 1;
+
+ node = rh->pool.head;
+ while (node) {
+ rhn = container_of(node, struct rethook_node, freelist);
+ node = node->next;
+ kfree(rhn);
+ count++;
+ }
+
+ /* The rh->ref is the number of pooled node + 1 */
+ if (refcount_sub_and_test(count, &rh->ref))
+ kfree(rh);
+}
+
+/**
+ * rethook_free() - Free struct rethook.
+ * @rh: the struct rethook to be freed.
+ *
+ * Free the rethook. Before calling this function, user must ensure the
+ * @rh::data is cleaned if needed (or, the handler can access it after
+ * calling this function.) This function will set the @rh to be freed
+ * after all rethook_node are freed (not soon). And the caller must
+ * not touch @rh after calling this.
+ */
+void rethook_free(struct rethook *rh)
+{
+ rcu_assign_pointer(rh->handler, NULL);
+
+ call_rcu(&rh->rcu, rethook_free_rcu);
+}
+
+/**
+ * rethook_alloc() - Allocate struct rethook.
+ * @data: a data to pass the @handler when hooking the return.
+ * @handler: the return hook callback function.
+ *
+ * Allocate and initialize a new rethook with @data and @handler.
+ * Return NULL if memory allocation fails or @handler is NULL.
+ * Note that @handler == NULL means this rethook is going to be freed.
+ */
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+{
+ struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+
+ if (!rh || !handler)
+ return NULL;
+
+ rh->data = data;
+ rh->handler = handler;
+ rh->pool.head = NULL;
+ refcount_set(&rh->ref, 1);
+
+ return rh;
+}
+
+/**
+ * rethook_add_node() - Add a new node to the rethook.
+ * @rh: the struct rethook.
+ * @node: the struct rethook_node to be added.
+ *
+ * Add @node to @rh. User must allocate @node (as a part of user's
+ * data structure.) The @node fields are initialized in this function.
+ */
+void rethook_add_node(struct rethook *rh, struct rethook_node *node)
+{
+ node->rethook = rh;
+ freelist_add(&node->freelist, &rh->pool);
+ refcount_inc(&rh->ref);
+}
+
+static void free_rethook_node_rcu(struct rcu_head *head)
+{
+ struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+
+ if (refcount_dec_and_test(&node->rethook->ref))
+ kfree(node->rethook);
+ kfree(node);
+}
+
+/**
+ * rethook_recycle() - return the node to rethook.
+ * @node: The struct rethook_node to be returned.
+ *
+ * Return back the @node to @node::rethook. If the @node::rethook is already
+ * marked as freed, this will free the @node.
+ */
+void rethook_recycle(struct rethook_node *node)
+{
+ lockdep_assert_preemption_disabled();
+
+ if (likely(READ_ONCE(node->rethook->handler)))
+ freelist_add(&node->freelist, &node->rethook->pool);
+ else
+ call_rcu(&node->rcu, free_rethook_node_rcu);
+}
+NOKPROBE_SYMBOL(rethook_recycle);
+
+/**
+ * rethook_try_get() - get an unused rethook node.
+ * @rh: The struct rethook which pools the nodes.
+ *
+ * Get an unused rethook node from @rh. If the node pool is empty, this
+ * will return NULL. Caller must disable preemption.
+ */
+struct rethook_node *rethook_try_get(struct rethook *rh)
+{
+ rethook_handler_t handler = READ_ONCE(rh->handler);
+ struct freelist_node *fn;
+
+ lockdep_assert_preemption_disabled();
+
+ /* Check whether @rh is going to be freed. */
+ if (unlikely(!handler))
+ return NULL;
+
+ fn = freelist_try_get(&rh->pool);
+ if (!fn)
+ return NULL;
+
+ return container_of(fn, struct rethook_node, freelist);
+}
+NOKPROBE_SYMBOL(rethook_try_get);
+
+/**
+ * rethook_hook() - Hook the current function return.
+ * @node: The struct rethook node to hook the function return.
+ * @regs: The struct pt_regs for the function entry.
+ * @mcount: True if this is called from mcount(ftrace) context.
+ *
+ * Hook the current running function return. This must be called when the
+ * function entry (or at least @regs must be the registers of the function
+ * entry.) @mcount is used for identifying the context. If this is called
+ * from ftrace (mcount) callback, @mcount must be set true. If this is called
+ * from the real function entry (e.g. kprobes) @mcount must be set false.
+ * This is because the way to hook the function return depends on the context.
+ */
+void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount)
+{
+ arch_rethook_prepare(node, regs, mcount);
+ __llist_add(&node->llist, &current->rethooks);
+}
+NOKPROBE_SYMBOL(rethook_hook);
+
+/* This assumes the 'tsk' is the current task or is not running. */
+static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct rethook_node *rh = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->rethooks.first;
+ else
+ node = node->next;
+
+ while (node) {
+ rh = container_of(node, struct rethook_node, llist);
+ if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
+ *cur = node;
+ return rh->ret_addr;
+ }
+ node = node->next;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(__rethook_find_ret_addr);
+
+/**
+ * rethook_find_ret_addr -- Find correct return address modified by rethook
+ * @tsk: Target task
+ * @frame: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a rethook on @tsk in unsigned
+ * long type.
+ * The @tsk must be 'current' or a task which is not running. @frame is a hint
+ * to get the currect return address - which is compared with the
+ * rethook::frame field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ *
+ * Returns found address value or zero if not found.
+ */
+unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
+ struct llist_node **cur)
+{
+ struct rethook_node *rhn = NULL;
+ unsigned long ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+ return 0;
+
+ do {
+ ret = __rethook_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ rhn = container_of(*cur, struct rethook_node, llist);
+ } while (rhn->frame != frame);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(rethook_find_ret_addr);
+
+void __weak arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ /*
+ * Do nothing by default. If the architecture which uses a
+ * frame pointer to record real return address on the stack,
+ * it should fill this function to fixup the return address
+ * so that stacktrace works from the rethook handler.
+ */
+}
+
+/* This function will be called from each arch-defined trampoline. */
+unsigned long rethook_trampoline_handler(struct pt_regs *regs,
+ unsigned long frame)
+{
+ struct llist_node *first, *node = NULL;
+ unsigned long correct_ret_addr;
+ rethook_handler_t handler;
+ struct rethook_node *rhn;
+
+ correct_ret_addr = __rethook_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
+ BUG_ON(1);
+ }
+
+ instruction_pointer_set(regs, correct_ret_addr);
+
+ /*
+ * These loops must be protected from rethook_free_rcu() because those
+ * are accessing 'rhn->rethook'.
+ */
+ preempt_disable();
+
+ /*
+ * Run the handler on the shadow stack. Do not unlink the list here because
+ * stackdump inside the handlers needs to decode it.
+ */
+ first = current->rethooks.first;
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ if (WARN_ON_ONCE(rhn->frame != frame))
+ break;
+ handler = READ_ONCE(rhn->rethook->handler);
+ if (handler)
+ handler(rhn, rhn->rethook->data, regs);
+
+ if (first == node)
+ break;
+ first = first->next;
+ }
+
+ /* Fixup registers for returning to correct address. */
+ arch_rethook_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink used shadow stack */
+ first = current->rethooks.first;
+ current->rethooks.first = node->next;
+ node->next = NULL;
+
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ first = first->next;
+ rethook_recycle(rhn);
+ }
+ preempt_enable();
+
+ return correct_ret_addr;
+}
+NOKPROBE_SYMBOL(rethook_trampoline_handler);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a569a0cb81ee..f4de111fa18f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static bool allocate_snapshot;
+static bool snapshot_at_boot;
static int __init set_cmdline_ftrace(char *str)
{
@@ -230,12 +231,21 @@ static int __init boot_alloc_snapshot(char *str)
__setup("alloc_snapshot", boot_alloc_snapshot);
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
- return 0;
+ return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -246,12 +256,16 @@ static int __init set_trace_boot_clock(char *str)
{
strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
trace_boot_clock = trace_boot_clock_buf;
- return 0;
+ return 1;
}
__setup("trace_clock=", set_trace_boot_clock);
static int __init set_tracepoint_printk(char *str)
{
+ /* Ignore the "tp_printk_stop_on_boot" param */
+ if (*str == '_')
+ return 0;
+
if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
tracepoint_printk = 1;
return 1;
@@ -1470,10 +1484,12 @@ static int __init set_buf_size(char *str)
if (!str)
return 0;
buf_size = memparse(str, &str);
- /* nr_entries can not be zero */
- if (buf_size == 0)
- return 0;
- trace_buf_size = buf_size;
+ /*
+ * nr_entries can not be zero and the startup
+ * tests require some buffer space. Therefore
+ * ensure we have at least 4096 bytes of buffer.
+ */
+ trace_buf_size = max(4096UL, buf_size);
return 1;
}
__setup("trace_buf_size=", set_buf_size);
@@ -3657,12 +3673,17 @@ static char *trace_iter_expand_format(struct trace_iterator *iter)
}
/* Returns true if the string is safe to dereference from an event */
-static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+static bool trace_safe_str(struct trace_iterator *iter, const char *str,
+ bool star, int len)
{
unsigned long addr = (unsigned long)str;
struct trace_event *trace_event;
struct trace_event_call *event;
+ /* Ignore strings with no length */
+ if (star && !len)
+ return true;
+
/* OK if part of the event data */
if ((addr >= (unsigned long)iter->ent) &&
(addr < (unsigned long)iter->ent + iter->ent_size))
@@ -3848,7 +3869,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!trace_safe_str(iter, str),
+ if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
"fmt: '%s' current_buffer: '%s'",
fmt, show_buffer(&iter->seq))) {
int ret;
@@ -7719,7 +7740,7 @@ const struct file_operations trace_min_max_fops = {
struct err_info {
const char **errs; /* ptr to loc-specific array of err strings */
u8 type; /* index into errs -> specific err string */
- u8 pos; /* MAX_FILTER_STR_VAL = 256 */
+ u16 pos; /* caret position */
u64 ts;
};
@@ -7727,25 +7748,52 @@ struct tracing_log_err {
struct list_head list;
struct err_info info;
char loc[TRACING_LOG_LOC_MAX]; /* err location */
- char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+ char *cmd; /* what caused err */
};
static DEFINE_MUTEX(tracing_err_log_lock);
-static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *alloc_tracing_log_err(int len)
+{
+ struct tracing_log_err *err;
+
+ err = kzalloc(sizeof(*err), GFP_KERNEL);
+ if (!err)
+ return ERR_PTR(-ENOMEM);
+
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd) {
+ kfree(err);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return err;
+}
+
+static void free_tracing_log_err(struct tracing_log_err *err)
+{
+ kfree(err->cmd);
+ kfree(err);
+}
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
+ int len)
{
struct tracing_log_err *err;
if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
- err = kzalloc(sizeof(*err), GFP_KERNEL);
- if (!err)
- err = ERR_PTR(-ENOMEM);
- tr->n_err_log_entries++;
+ err = alloc_tracing_log_err(len);
+ if (PTR_ERR(err) != -ENOMEM)
+ tr->n_err_log_entries++;
return err;
}
err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+ kfree(err->cmd);
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd)
+ return ERR_PTR(-ENOMEM);
list_del(&err->list);
return err;
@@ -7806,22 +7854,25 @@ unsigned int err_pos(char *cmd, const char *str)
*/
void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos)
+ const char **errs, u8 type, u16 pos)
{
struct tracing_log_err *err;
+ int len = 0;
if (!tr)
tr = &global_trace;
+ len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
+
mutex_lock(&tracing_err_log_lock);
- err = get_tracing_log_err(tr);
+ err = get_tracing_log_err(tr, len);
if (PTR_ERR(err) == -ENOMEM) {
mutex_unlock(&tracing_err_log_lock);
return;
}
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
- snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+ snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
err->info.errs = errs;
err->info.type = type;
@@ -7839,7 +7890,7 @@ static void clear_tracing_err_log(struct trace_array *tr)
mutex_lock(&tracing_err_log_lock);
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
- kfree(err);
+ free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
@@ -7867,9 +7918,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
mutex_unlock(&tracing_err_log_lock);
}
-static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
{
- u8 i;
+ u16 i;
for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
seq_putc(m, ' ');
@@ -10115,6 +10166,14 @@ out:
return ret;
}
+void __init ftrace_boot_snapshot(void)
+{
+ if (snapshot_at_boot) {
+ tracing_snapshot();
+ internal_trace_puts("** Boot snapshot taken **\n");
+ }
+}
+
void __init early_trace_init(void)
{
if (tracepoint_printk) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d038ddbf1bea..07d990270e2a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -136,7 +136,6 @@ struct kprobe_trace_entry_head {
struct eprobe_trace_entry_head {
struct trace_entry ent;
- unsigned int type;
};
struct kretprobe_trace_entry_head {
@@ -1878,7 +1877,7 @@ extern ssize_t trace_parse_run_command(struct file *file,
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos);
+ const char **errs, u8 type, u16 pos);
/*
* Normal trace_printk() and friends allocates special buffers
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 191db32dec46..541aa13581b9 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -242,7 +242,6 @@ static int trace_eprobe_tp_arg_update(struct trace_eprobe *ep, int i)
static int eprobe_event_define_fields(struct trace_event_call *event_call)
{
- int ret;
struct eprobe_trace_entry_head field;
struct trace_probe *tp;
@@ -250,8 +249,6 @@ static int eprobe_event_define_fields(struct trace_event_call *event_call)
if (WARN_ON_ONCE(!tp))
return -ENOENT;
- DEFINE_FIELD(unsigned int, type, FIELD_STRING_TYPE, 0);
-
return traceprobe_define_arg_fields(event_call, sizeof(field), tp);
}
@@ -270,7 +267,9 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
struct trace_event_call *pevent;
struct trace_event *probed_event;
struct trace_seq *s = &iter->seq;
+ struct trace_eprobe *ep;
struct trace_probe *tp;
+ unsigned int type;
field = (struct eprobe_trace_entry_head *)iter->ent;
tp = trace_probe_primary_from_call(
@@ -278,15 +277,18 @@ print_eprobe_event(struct trace_iterator *iter, int flags,
if (WARN_ON_ONCE(!tp))
goto out;
+ ep = container_of(tp, struct trace_eprobe, tp);
+ type = ep->event->event.type;
+
trace_seq_printf(s, "%s: (", trace_probe_name(tp));
- probed_event = ftrace_find_event(field->type);
+ probed_event = ftrace_find_event(type);
if (probed_event) {
pevent = container_of(probed_event, struct trace_event_call, event);
trace_seq_printf(s, "%s.%s", pevent->class->system,
trace_event_name(pevent));
} else {
- trace_seq_printf(s, "%u", field->type);
+ trace_seq_printf(s, "%u", type);
}
trace_seq_putc(s, ')');
@@ -498,10 +500,6 @@ __eprobe_trace_func(struct eprobe_data *edata, void *rec)
return;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
- if (edata->ep->event)
- entry->type = edata->ep->event->event.type;
- else
- entry->type = 0;
store_trace_args(&entry[1], &edata->ep->tp, rec, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614c1812..e11e167b7809 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
static bool eventdir_initialized;
+static LIST_HEAD(module_strings);
+
+struct module_string {
+ struct list_head next;
+ struct module *module;
+ char *str;
+};
+
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
static struct kmem_cache *field_cachep;
@@ -384,6 +392,12 @@ static void test_event_printk(struct trace_event_call *call)
if (!(dereference_flags & (1ULL << arg)))
goto next_arg;
+ /* Check for __get_sockaddr */;
+ if (str_has_prefix(fmt + i, "__get_sockaddr(")) {
+ dereference_flags &= ~(1ULL << arg);
+ goto next_arg;
+ }
+
/* Find the REC-> in the argument */
c = strchr(fmt + i, ',');
r = strstr(fmt + i, "REC->");
@@ -759,7 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -783,7 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -2633,6 +2651,76 @@ static void update_event_printk(struct trace_event_call *call,
}
}
+static void add_str_to_module(struct module *module, char *str)
+{
+ struct module_string *modstr;
+
+ modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+
+ /*
+ * If we failed to allocate memory here, then we'll just
+ * let the str memory leak when the module is removed.
+ * If this fails to allocate, there's worse problems than
+ * a leaked string on module removal.
+ */
+ if (WARN_ON_ONCE(!modstr))
+ return;
+
+ modstr->module = module;
+ modstr->str = str;
+
+ list_add(&modstr->next, &module_strings);
+}
+
+static void update_event_fields(struct trace_event_call *call,
+ struct trace_eval_map *map)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ char *ptr;
+ char *str;
+ int len = strlen(map->eval_string);
+
+ /* Dynamic events should never have field maps */
+ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ return;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ ptr = strchr(field->type, '[');
+ if (!ptr)
+ continue;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ continue;
+
+ if (strncmp(map->eval_string, ptr, len) != 0)
+ continue;
+
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
+
+ /*
+ * If the event is part of a module, then we need to free the string
+ * when the module is removed. Otherwise, it will stay allocated
+ * until a reboot.
+ */
+ if (call->module)
+ add_str_to_module(call->module, str);
+
+ field->type = str;
+ }
+}
+
void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
@@ -2668,6 +2756,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
first = false;
}
update_event_printk(call, map[i]);
+ update_event_fields(call, map[i]);
}
}
}
@@ -2758,6 +2847,7 @@ int trace_add_event_call(struct trace_event_call *call)
mutex_unlock(&trace_types_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(trace_add_event_call);
/*
* Must be called under locking of trace_types_lock, event_mutex and
@@ -2819,6 +2909,7 @@ int trace_remove_event_call(struct trace_event_call *call)
return ret;
}
+EXPORT_SYMBOL_GPL(trace_remove_event_call);
#define for_each_event(event, start, end) \
for (event = start; \
@@ -2853,6 +2944,7 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
+ struct module_string *modstr, *m;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -2861,6 +2953,14 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
+ /* Check for any strings allocade for this module */
+ list_for_each_entry_safe(modstr, m, &module_strings, next) {
+ if (modstr->module != mod)
+ continue;
+ list_del(&modstr->next);
+ kfree(modstr->str);
+ kfree(modstr);
+ }
up_write(&trace_event_sem);
/*
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5e6a988a8a51..44db5ba9cabb 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -727,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len,
return data;
}
-static char last_cmd[MAX_FILTER_STR_VAL];
+#define HIST_PREFIX "hist:"
+
+static char *last_cmd;
static char last_cmd_loc[MAX_FILTER_STR_VAL];
static int errpos(char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -739,12 +744,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
{
const char *system = NULL, *name = NULL;
struct trace_event_call *call;
+ int len;
if (!str)
return;
- strcpy(last_cmd, "hist:");
- strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
+ /* sizeof() contains the nul byte */
+ len = sizeof(HIST_PREFIX) + strlen(str);
+ kfree(last_cmd);
+ last_cmd = kzalloc(len, GFP_KERNEL);
+ if (!last_cmd)
+ return;
+
+ strcpy(last_cmd, HIST_PREFIX);
+ /* Again, sizeof() contains the nul byte */
+ len -= sizeof(HIST_PREFIX);
+ strncat(last_cmd, str, len);
if (file) {
call = file->event_call;
@@ -757,18 +772,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
}
if (system)
- snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
+ snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name);
}
-static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
+static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
err_type, err_pos);
}
static void hist_err_clear(void)
{
- last_cmd[0] = '\0';
+ if (last_cmd)
+ last_cmd[0] = '\0';
last_cmd_loc[0] = '\0';
}
@@ -2289,9 +2308,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
/*
* For backward compatibility, if field_name
* was "cpu", then we treat this the same as
- * common_cpu.
+ * common_cpu. This also works for "CPU".
*/
- if (strcmp(field_name, "cpu") == 0) {
+ if (field && field->filter_type == FILTER_CPU) {
*flags |= HIST_FIELD_FL_CPU;
} else {
hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
@@ -2503,6 +2522,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
expr->fn = hist_field_unary_minus;
expr->operands[0] = operand1;
+ expr->size = operand1->size;
+ expr->is_signed = operand1->is_signed;
expr->operator = FIELD_OP_UNARY_MINUS;
expr->name = expr_str(expr, 0);
expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
@@ -2719,6 +2740,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
/* The operand sizes should be the same, so just pick one */
expr->size = operand1->size;
+ expr->is_signed = operand1->is_signed;
expr->operator = field_op;
expr->type = kstrdup_const(operand1->type, GFP_KERNEL);
@@ -3935,6 +3957,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
var_ref_idx = find_var_ref_idx(hist_data, var_ref);
if (WARN_ON(var_ref_idx < 0)) {
+ kfree(p);
ret = var_ref_idx;
goto err;
}
@@ -4828,7 +4851,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
cmp_fn = tracing_map_cmp_none;
- else if (!field)
+ else if (!field || hist_field->flags & HIST_FIELD_FL_CPU)
cmp_fn = tracing_map_cmp_num(hist_field->size,
hist_field->is_signed);
else if (is_string_field(field))
@@ -5606,7 +5629,7 @@ static int event_hist_trigger_print(struct seq_file *m,
bool have_var = false;
unsigned int i;
- seq_puts(m, "hist:");
+ seq_puts(m, HIST_PREFIX);
if (data->name)
seq_printf(m, "%s:", data->name);
@@ -6163,7 +6186,9 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
lockdep_assert_held(&event_mutex);
- if (glob && strlen(glob)) {
+ WARN_ON(!glob);
+
+ if (strlen(glob)) {
hist_err_clear();
last_cmd_set(file, param);
}
@@ -6196,7 +6221,7 @@ static int event_hist_trigger_parse(struct event_command *cmd_ops,
continue;
}
break;
- } while (p);
+ } while (1);
if (!p)
param = NULL;
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 154db74dadbc..5e8c07aef071 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -42,10 +42,13 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
-static char last_cmd[MAX_FILTER_STR_VAL];
+static char *last_cmd;
static int errpos(const char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -54,11 +57,16 @@ static void last_cmd_set(const char *str)
if (!str)
return;
- strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+ kfree(last_cmd);
+
+ last_cmd = kstrdup(str, GFP_KERNEL);
}
-static void synth_err(u8 err_type, u8 err_pos)
+static void synth_err(u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
}
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index d00fee705f9c..7eb9d04f1c2e 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -84,6 +84,20 @@ event_triggers_call(struct trace_event_file *file,
}
EXPORT_SYMBOL_GPL(event_triggers_call);
+bool __trace_trigger_soft_disabled(struct trace_event_file *file)
+{
+ unsigned long eflags = file->flags;
+
+ if (eflags & EVENT_FILE_FL_TRIGGER_MODE)
+ event_triggers_call(file, NULL, NULL, NULL);
+ if (eflags & EVENT_FILE_FL_SOFT_DISABLED)
+ return true;
+ if (eflags & EVENT_FILE_FL_PID_FILTER)
+ return trace_event_ignore_this_pid(file);
+ return false;
+}
+EXPORT_SYMBOL_GPL(__trace_trigger_soft_disabled);
+
/**
* event_triggers_post_call - Call 'post_triggers' for a trace event
* @file: The trace_event_file associated with the event
@@ -1295,6 +1309,16 @@ traceon_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (tracer_tracing_is_on(file->tr))
+ return;
+
+ tracer_tracing_on(file->tr);
+ return;
+ }
+
if (tracing_is_on())
return;
@@ -1306,8 +1330,15 @@ traceon_count_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- if (tracing_is_on())
- return;
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (tracer_tracing_is_on(file->tr))
+ return;
+ } else {
+ if (tracing_is_on())
+ return;
+ }
if (!data->count)
return;
@@ -1315,7 +1346,10 @@ traceon_count_trigger(struct event_trigger_data *data,
if (data->count != -1)
(data->count)--;
- tracing_on();
+ if (file)
+ tracer_tracing_on(file->tr);
+ else
+ tracing_on();
}
static void
@@ -1323,6 +1357,16 @@ traceoff_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (!tracer_tracing_is_on(file->tr))
+ return;
+
+ tracer_tracing_off(file->tr);
+ return;
+ }
+
if (!tracing_is_on())
return;
@@ -1334,8 +1378,15 @@ traceoff_count_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- if (!tracing_is_on())
- return;
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (!tracer_tracing_is_on(file->tr))
+ return;
+ } else {
+ if (!tracing_is_on())
+ return;
+ }
if (!data->count)
return;
@@ -1343,7 +1394,10 @@ traceoff_count_trigger(struct event_trigger_data *data,
if (data->count != -1)
(data->count)--;
- tracing_off();
+ if (file)
+ tracer_tracing_off(file->tr);
+ else
+ tracing_off();
}
static int
@@ -1540,7 +1594,12 @@ stacktrace_trigger(struct event_trigger_data *data,
struct trace_buffer *buffer, void *rec,
struct ring_buffer_event *event)
{
- trace_dump_stack(STACK_SKIP);
+ struct trace_event_file *file = data->private_data;
+
+ if (file)
+ __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP);
+ else
+ trace_dump_stack(STACK_SKIP);
}
static void
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
new file mode 100644
index 000000000000..706e1686b5eb
--- /dev/null
+++ b/kernel/trace/trace_events_user.c
@@ -0,0 +1,1628 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cdev.h>
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/ioctl.h>
+#include <linux/jhash.h>
+#include <linux/trace_events.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+/* Reminder to move to uapi when everything works */
+#ifdef CONFIG_COMPILE_TEST
+#include <linux/user_events.h>
+#else
+#include <uapi/linux/user_events.h>
+#endif
+#include "trace.h"
+#include "trace_dynevent.h"
+
+#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
+
+#define FIELD_DEPTH_TYPE 0
+#define FIELD_DEPTH_NAME 1
+#define FIELD_DEPTH_SIZE 2
+
+/*
+ * Limits how many trace_event calls user processes can create:
+ * Must be a power of two of PAGE_SIZE.
+ */
+#define MAX_PAGE_ORDER 0
+#define MAX_PAGES (1 << MAX_PAGE_ORDER)
+#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+
+/* Limit how long of an event name plus args within the subsystem. */
+#define MAX_EVENT_DESC 512
+#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define MAX_FIELD_ARRAY_SIZE 1024
+#define MAX_FIELD_ARG_NAME 256
+
+static char *register_page_data;
+
+static DEFINE_MUTEX(reg_mutex);
+static DEFINE_HASHTABLE(register_table, 4);
+static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+
+/*
+ * Stores per-event properties, as users register events
+ * within a file a user_event might be created if it does not
+ * already exist. These are globally used and their lifetime
+ * is tied to the refcnt member. These cannot go away until the
+ * refcnt reaches zero.
+ */
+struct user_event {
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ atomic_t refcnt;
+ int index;
+ int flags;
+ int min_size;
+};
+
+/*
+ * Stores per-file events references, as users register events
+ * within a file this structure is modified and freed via RCU.
+ * The lifetime of this struct is tied to the lifetime of the file.
+ * These are not shared and only accessible by the file that created it.
+ */
+struct user_event_refs {
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
+};
+
+#define VALIDATOR_ENSURE_NULL (1 << 0)
+#define VALIDATOR_REL (1 << 1)
+
+struct user_event_validator {
+ struct list_head link;
+ int offset;
+ int flags;
+};
+
+typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted);
+
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser);
+
+static u32 user_event_key(char *name)
+{
+ return jhash(name, strlen(name), 0);
+}
+
+static __always_inline __must_check
+size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t ret;
+
+ pagefault_disable();
+
+ ret = copy_from_iter_nocache(addr, bytes, i);
+
+ pagefault_enable();
+
+ return ret;
+}
+
+static struct list_head *user_event_get_fields(struct trace_event_call *call)
+{
+ struct user_event *user = (struct user_event *)call->data;
+
+ return &user->fields;
+}
+
+/*
+ * Parses a register command for user_events
+ * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
+ *
+ * Example event named 'test' with a 20 char 'msg' field with an unsigned int
+ * 'id' field after:
+ * test char[20] msg;unsigned int id
+ *
+ * NOTE: Offsets are from the user data perspective, they are not from the
+ * trace_entry/buffer perspective. We automatically add the common properties
+ * sizes to the offset for the user.
+ *
+ * Upon success user_event has its ref count increased by 1.
+ */
+static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+{
+ char *name = raw_command;
+ char *args = strpbrk(name, " ");
+ char *flags;
+
+ if (args)
+ *args++ = '\0';
+
+ flags = strpbrk(name, ":");
+
+ if (flags)
+ *flags++ = '\0';
+
+ return user_event_parse(name, args, flags, newuser);
+}
+
+static int user_field_array_size(const char *type)
+{
+ const char *start = strchr(type, '[');
+ char val[8];
+ char *bracket;
+ int size = 0;
+
+ if (start == NULL)
+ return -EINVAL;
+
+ if (strscpy(val, start + 1, sizeof(val)) <= 0)
+ return -EINVAL;
+
+ bracket = strchr(val, ']');
+
+ if (!bracket)
+ return -EINVAL;
+
+ *bracket = '\0';
+
+ if (kstrtouint(val, 0, &size))
+ return -EINVAL;
+
+ if (size > MAX_FIELD_ARRAY_SIZE)
+ return -EINVAL;
+
+ return size;
+}
+
+static int user_field_size(const char *type)
+{
+ /* long is not allowed from a user, since it's ambigious in size */
+ if (strcmp(type, "s64") == 0)
+ return sizeof(s64);
+ if (strcmp(type, "u64") == 0)
+ return sizeof(u64);
+ if (strcmp(type, "s32") == 0)
+ return sizeof(s32);
+ if (strcmp(type, "u32") == 0)
+ return sizeof(u32);
+ if (strcmp(type, "int") == 0)
+ return sizeof(int);
+ if (strcmp(type, "unsigned int") == 0)
+ return sizeof(unsigned int);
+ if (strcmp(type, "s16") == 0)
+ return sizeof(s16);
+ if (strcmp(type, "u16") == 0)
+ return sizeof(u16);
+ if (strcmp(type, "short") == 0)
+ return sizeof(short);
+ if (strcmp(type, "unsigned short") == 0)
+ return sizeof(unsigned short);
+ if (strcmp(type, "s8") == 0)
+ return sizeof(s8);
+ if (strcmp(type, "u8") == 0)
+ return sizeof(u8);
+ if (strcmp(type, "char") == 0)
+ return sizeof(char);
+ if (strcmp(type, "unsigned char") == 0)
+ return sizeof(unsigned char);
+ if (str_has_prefix(type, "char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "unsigned char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "__data_loc "))
+ return sizeof(u32);
+ if (str_has_prefix(type, "__rel_loc "))
+ return sizeof(u32);
+
+ /* Uknown basic type, error */
+ return -EINVAL;
+}
+
+static void user_event_destroy_validators(struct user_event *user)
+{
+ struct user_event_validator *validator, *next;
+ struct list_head *head = &user->validators;
+
+ list_for_each_entry_safe(validator, next, head, link) {
+ list_del(&validator->link);
+ kfree(validator);
+ }
+}
+
+static void user_event_destroy_fields(struct user_event *user)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+
+ list_for_each_entry_safe(field, next, head, link) {
+ list_del(&field->link);
+ kfree(field);
+ }
+}
+
+static int user_event_add_field(struct user_event *user, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
+{
+ struct user_event_validator *validator;
+ struct ftrace_event_field *field;
+ int validator_flags = 0;
+
+ field = kmalloc(sizeof(*field), GFP_KERNEL);
+
+ if (!field)
+ return -ENOMEM;
+
+ if (str_has_prefix(type, "__data_loc "))
+ goto add_validator;
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ validator_flags |= VALIDATOR_REL;
+ goto add_validator;
+ }
+
+ goto add_field;
+
+add_validator:
+ if (strstr(type, "char") != 0)
+ validator_flags |= VALIDATOR_ENSURE_NULL;
+
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+
+ if (!validator) {
+ kfree(field);
+ return -ENOMEM;
+ }
+
+ validator->flags = validator_flags;
+ validator->offset = offset;
+
+ /* Want sequential access when validating */
+ list_add_tail(&validator->link, &user->validators);
+
+add_field:
+ field->type = type;
+ field->name = name;
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+ field->filter_type = filter_type;
+
+ list_add(&field->link, &user->fields);
+
+ /*
+ * Min size from user writes that are required, this does not include
+ * the size of trace_entry (common fields).
+ */
+ user->min_size = (offset + size) - sizeof(struct trace_entry);
+
+ return 0;
+}
+
+/*
+ * Parses the values of a field within the description
+ * Format: type name [size]
+ */
+static int user_event_parse_field(char *field, struct user_event *user,
+ u32 *offset)
+{
+ char *part, *type, *name;
+ u32 depth = 0, saved_offset = *offset;
+ int len, size = -EINVAL;
+ bool is_struct = false;
+
+ field = skip_spaces(field);
+
+ if (*field == '\0')
+ return 0;
+
+ /* Handle types that have a space within */
+ len = str_has_prefix(field, "unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "struct ");
+ if (len) {
+ is_struct = true;
+ goto skip_next;
+ }
+
+ len = str_has_prefix(field, "__data_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__data_loc ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc ");
+ if (len)
+ goto skip_next;
+
+ goto parse;
+skip_next:
+ type = field;
+ field = strpbrk(field + len, " ");
+
+ if (field == NULL)
+ return -EINVAL;
+
+ *field++ = '\0';
+ depth++;
+parse:
+ name = NULL;
+
+ while ((part = strsep(&field, " ")) != NULL) {
+ switch (depth++) {
+ case FIELD_DEPTH_TYPE:
+ type = part;
+ break;
+ case FIELD_DEPTH_NAME:
+ name = part;
+ break;
+ case FIELD_DEPTH_SIZE:
+ if (!is_struct)
+ return -EINVAL;
+
+ if (kstrtou32(part, 10, &size))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (depth < FIELD_DEPTH_SIZE || !name)
+ return -EINVAL;
+
+ if (depth == FIELD_DEPTH_SIZE)
+ size = user_field_size(type);
+
+ if (size == 0)
+ return -EINVAL;
+
+ if (size < 0)
+ return size;
+
+ *offset = saved_offset + size;
+
+ return user_event_add_field(user, type, name, saved_offset, size,
+ type[0] != 'u', FILTER_OTHER);
+}
+
+static int user_event_parse_fields(struct user_event *user, char *args)
+{
+ char *field;
+ u32 offset = sizeof(struct trace_entry);
+ int ret = -EINVAL;
+
+ if (args == NULL)
+ return 0;
+
+ while ((field = strsep(&args, ";")) != NULL) {
+ ret = user_event_parse_field(field, user, &offset);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static struct trace_event_fields user_event_fields_array[1];
+
+static const char *user_field_format(const char *type)
+{
+ if (strcmp(type, "s64") == 0)
+ return "%lld";
+ if (strcmp(type, "u64") == 0)
+ return "%llu";
+ if (strcmp(type, "s32") == 0)
+ return "%d";
+ if (strcmp(type, "u32") == 0)
+ return "%u";
+ if (strcmp(type, "int") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned int") == 0)
+ return "%u";
+ if (strcmp(type, "s16") == 0)
+ return "%d";
+ if (strcmp(type, "u16") == 0)
+ return "%u";
+ if (strcmp(type, "short") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned short") == 0)
+ return "%u";
+ if (strcmp(type, "s8") == 0)
+ return "%d";
+ if (strcmp(type, "u8") == 0)
+ return "%u";
+ if (strcmp(type, "char") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned char") == 0)
+ return "%u";
+ if (strstr(type, "char[") != 0)
+ return "%s";
+
+ /* Unknown, likely struct, allowed treat as 64-bit */
+ return "%llu";
+}
+
+static bool user_field_is_dyn_string(const char *type, const char **str_func)
+{
+ if (str_has_prefix(type, "__data_loc ")) {
+ *str_func = "__get_str";
+ goto check;
+ }
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ *str_func = "__get_rel_str";
+ goto check;
+ }
+
+ return false;
+check:
+ return strstr(type, "char") != 0;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int pos = 0, depth = 0;
+ const char *str_func;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth != 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
+ field->name, user_field_format(field->type));
+
+ depth++;
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (user_field_is_dyn_string(field->type, &str_func))
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", %s(%s)", str_func, field->name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", field->name);
+ }
+
+ return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+static int user_event_create_print_fmt(struct user_event *user)
+{
+ char *print_fmt;
+ int len;
+
+ len = user_event_set_print_fmt(user, NULL, 0);
+
+ print_fmt = kmalloc(len, GFP_KERNEL);
+
+ if (!print_fmt)
+ return -ENOMEM;
+
+ user_event_set_print_fmt(user, print_fmt, len);
+
+ user->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ /* Unsafe to try to decode user provided print_fmt, use hex */
+ trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
+ 1, iter->ent, iter->ent_size, true);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions user_event_funcs = {
+ .trace = user_event_print_trace,
+};
+
+static int user_event_set_call_visible(struct user_event *user, bool visible)
+{
+ int ret;
+ const struct cred *old_cred;
+ struct cred *cred;
+
+ cred = prepare_creds();
+
+ if (!cred)
+ return -ENOMEM;
+
+ /*
+ * While by default tracefs is locked down, systems can be configured
+ * to allow user_event files to be less locked down. The extreme case
+ * being "other" has read/write access to user_events_data/status.
+ *
+ * When not locked down, processes may not have have permissions to
+ * add/remove calls themselves to tracefs. We need to temporarily
+ * switch to root file permission to allow for this scenario.
+ */
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ old_cred = override_creds(cred);
+
+ if (visible)
+ ret = trace_add_event_call(&user->call);
+ else
+ ret = trace_remove_event_call(&user->call);
+
+ revert_creds(old_cred);
+ put_cred(cred);
+
+ return ret;
+}
+
+static int destroy_user_event(struct user_event *user)
+{
+ int ret = 0;
+
+ /* Must destroy fields before call removal */
+ user_event_destroy_fields(user);
+
+ ret = user_event_set_call_visible(user, false);
+
+ if (ret)
+ return ret;
+
+ dyn_event_remove(&user->devent);
+
+ register_page_data[user->index] = 0;
+ clear_bit(user->index, page_bitmap);
+ hash_del(&user->node);
+
+ user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
+ kfree(EVENT_NAME(user));
+ kfree(user);
+
+ return ret;
+}
+
+static struct user_event *find_user_event(char *name, u32 *outkey)
+{
+ struct user_event *user;
+ u32 key = user_event_key(name);
+
+ *outkey = key;
+
+ hash_for_each_possible(register_table, user, node, key)
+ if (!strcmp(EVENT_NAME(user), name)) {
+ atomic_inc(&user->refcnt);
+ return user;
+ }
+
+ return NULL;
+}
+
+static int user_event_validate(struct user_event *user, void *data, int len)
+{
+ struct list_head *head = &user->validators;
+ struct user_event_validator *validator;
+ void *pos, *end = data + len;
+ u32 loc, offset, size;
+
+ list_for_each_entry(validator, head, link) {
+ pos = data + validator->offset;
+
+ /* Already done min_size check, no bounds check here */
+ loc = *(u32 *)pos;
+ offset = loc & 0xffff;
+ size = loc >> 16;
+
+ if (likely(validator->flags & VALIDATOR_REL))
+ pos += offset + sizeof(loc);
+ else
+ pos = data + offset;
+
+ pos += size;
+
+ if (unlikely(pos > end))
+ return -EFAULT;
+
+ if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
+ if (unlikely(*(char *)(pos - 1) != '\0'))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes the user supplied payload out to a trace file.
+ */
+static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct trace_event_file *file;
+ struct trace_entry *entry;
+ struct trace_event_buffer event_buffer;
+ size_t size = sizeof(*entry) + i->count;
+
+ file = (struct trace_event_file *)tpdata;
+
+ if (!file ||
+ !(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file))
+ return;
+
+ /* Allocates and fills trace_entry, + 1 of this is data payload */
+ entry = trace_event_buffer_reserve(&event_buffer, file, size);
+
+ if (unlikely(!entry))
+ return;
+
+ if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, entry, size)))
+ goto discard;
+
+ trace_event_buffer_commit(&event_buffer);
+
+ return;
+discard:
+ *faulted = true;
+ __trace_event_discard_commit(event_buffer.buffer,
+ event_buffer.event);
+}
+
+#ifdef CONFIG_PERF_EVENTS
+/*
+ * Writes the user supplied payload out to perf ring buffer.
+ */
+static void user_event_perf(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct hlist_head *perf_head;
+
+ perf_head = this_cpu_ptr(user->call.perf_events);
+
+ if (perf_head && !hlist_empty(perf_head)) {
+ struct trace_entry *perf_entry;
+ struct pt_regs *regs;
+ size_t size = sizeof(*perf_entry) + i->count;
+ int context;
+
+ perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
+ &regs, &context);
+
+ if (unlikely(!perf_entry))
+ return;
+
+ perf_fetch_caller_regs(regs);
+
+ if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, perf_entry, size)))
+ goto discard;
+
+ perf_trace_buf_submit(perf_entry, size, context,
+ user->call.event.type, 1, regs,
+ perf_head, NULL);
+
+ return;
+discard:
+ *faulted = true;
+ perf_swevent_put_recursion_context(context);
+ }
+}
+#endif
+
+/*
+ * Update the register page that is shared between user processes.
+ */
+static void update_reg_page_for(struct user_event *user)
+{
+ struct tracepoint *tp = &user->tracepoint;
+ char status = 0;
+
+ if (atomic_read(&tp->key.enabled) > 0) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+
+ if (probe_func == user_event_ftrace)
+ status |= EVENT_STATUS_FTRACE;
+#ifdef CONFIG_PERF_EVENTS
+ else if (probe_func == user_event_perf)
+ status |= EVENT_STATUS_PERF;
+#endif
+ else
+ status |= EVENT_STATUS_OTHER;
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+ }
+
+ register_page_data[user->index] = status;
+}
+
+/*
+ * Register callback for our events from tracing sub-systems.
+ */
+static int user_event_reg(struct trace_event_call *call,
+ enum trace_reg type,
+ void *data)
+{
+ struct user_event *user = (struct user_event *)call->data;
+ int ret = 0;
+
+ if (!user)
+ return -ENOENT;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->probe,
+ data);
+ goto dec;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->perf_probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->perf_probe,
+ data);
+ goto dec;
+
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ break;
+#endif
+ }
+
+ return ret;
+inc:
+ atomic_inc(&user->refcnt);
+ update_reg_page_for(user);
+ return 0;
+dec:
+ update_reg_page_for(user);
+ atomic_dec(&user->refcnt);
+ return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+ struct user_event *user;
+ char *name;
+ int ret;
+
+ if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+ return -ECANCELED;
+
+ raw_command += USER_EVENTS_PREFIX_LEN;
+ raw_command = skip_spaces(raw_command);
+
+ name = kstrdup(raw_command, GFP_KERNEL);
+
+ if (!name)
+ return -ENOMEM;
+
+ mutex_lock(&reg_mutex);
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (!ret)
+ atomic_dec(&user->refcnt);
+
+ mutex_unlock(&reg_mutex);
+
+ if (ret)
+ kfree(name);
+
+ return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ struct ftrace_event_field *field, *next;
+ struct list_head *head;
+ int depth = 0;
+
+ seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+ head = trace_get_fields(&user->call);
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth == 0)
+ seq_puts(m, " ");
+ else
+ seq_puts(m, "; ");
+
+ seq_printf(m, "%s %s", field->type, field->name);
+
+ if (str_has_prefix(field->type, "struct "))
+ seq_printf(m, " %d", field->size);
+
+ depth++;
+ }
+
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ return atomic_read(&user->refcnt) != 0;
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ if (atomic_read(&user->refcnt) != 0)
+ return -EBUSY;
+
+ return destroy_user_event(user);
+}
+
+static bool user_field_match(struct ftrace_event_field *field, int argc,
+ const char **argv, int *iout)
+{
+ char *field_name, *arg_name;
+ int len, pos, i = *iout;
+ bool colon = false, match = false;
+
+ if (i >= argc)
+ return false;
+
+ len = MAX_FIELD_ARG_NAME;
+ field_name = kmalloc(len, GFP_KERNEL);
+ arg_name = kmalloc(len, GFP_KERNEL);
+
+ if (!arg_name || !field_name)
+ goto out;
+
+ pos = 0;
+
+ for (; i < argc; ++i) {
+ if (i != *iout)
+ pos += snprintf(arg_name + pos, len - pos, " ");
+
+ pos += snprintf(arg_name + pos, len - pos, argv[i]);
+
+ if (strchr(argv[i], ';')) {
+ ++i;
+ colon = true;
+ break;
+ }
+ }
+
+ pos = 0;
+
+ pos += snprintf(field_name + pos, len - pos, field->type);
+ pos += snprintf(field_name + pos, len - pos, " ");
+ pos += snprintf(field_name + pos, len - pos, field->name);
+
+ if (colon)
+ pos += snprintf(field_name + pos, len - pos, ";");
+
+ *iout = i;
+
+ match = strcmp(arg_name, field_name) == 0;
+out:
+ kfree(arg_name);
+ kfree(field_name);
+
+ return match;
+}
+
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int i = 0;
+
+ list_for_each_entry_safe_reverse(field, next, head, link)
+ if (!user_field_match(field, argc, argv, &i))
+ return false;
+
+ if (i != argc)
+ return false;
+
+ return true;
+}
+
+static bool user_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ bool match;
+
+ match = strcmp(EVENT_NAME(user), event) == 0 &&
+ (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+
+ if (match && argc > 0)
+ match = user_fields_match(user, argc, argv);
+
+ return match;
+}
+
+static struct dyn_event_operations user_event_dops = {
+ .create = user_event_create,
+ .show = user_event_show,
+ .is_busy = user_event_is_busy,
+ .free = user_event_free,
+ .match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+ int ret;
+
+ ret = register_trace_event(&user->call.event);
+
+ if (!ret)
+ return -ENODEV;
+
+ ret = user_event_set_call_visible(user, true);
+
+ if (ret)
+ unregister_trace_event(&user->call.event);
+
+ return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ * Upon success the returned user_event has its ref count increased by 1.
+ */
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser)
+{
+ int ret;
+ int index;
+ u32 key;
+ struct user_event *user;
+
+ /* Prevent dyn_event from racing */
+ mutex_lock(&event_mutex);
+ user = find_user_event(name, &key);
+ mutex_unlock(&event_mutex);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ return 0;
+ }
+
+ index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+
+ if (index == MAX_EVENTS)
+ return -EMFILE;
+
+ user = kzalloc(sizeof(*user), GFP_KERNEL);
+
+ if (!user)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&user->class.fields);
+ INIT_LIST_HEAD(&user->fields);
+ INIT_LIST_HEAD(&user->validators);
+
+ user->tracepoint.name = name;
+
+ ret = user_event_parse_fields(user, args);
+
+ if (ret)
+ goto put_user;
+
+ ret = user_event_create_print_fmt(user);
+
+ if (ret)
+ goto put_user;
+
+ user->call.data = user;
+ user->call.class = &user->class;
+ user->call.name = name;
+ user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
+ user->call.tp = &user->tracepoint;
+ user->call.event.funcs = &user_event_funcs;
+
+ user->class.system = USER_EVENTS_SYSTEM;
+ user->class.fields_array = user_event_fields_array;
+ user->class.get_fields = user_event_get_fields;
+ user->class.reg = user_event_reg;
+ user->class.probe = user_event_ftrace;
+#ifdef CONFIG_PERF_EVENTS
+ user->class.perf_probe = user_event_perf;
+#endif
+
+ mutex_lock(&event_mutex);
+
+ ret = user_event_trace_register(user);
+
+ if (ret)
+ goto put_user_lock;
+
+ user->index = index;
+
+ /* Ensure we track ref */
+ atomic_inc(&user->refcnt);
+
+ dyn_event_init(&user->devent, &user_event_dops);
+ dyn_event_add(&user->devent, &user->call);
+ set_bit(user->index, page_bitmap);
+ hash_add(register_table, &user->node, key);
+
+ mutex_unlock(&event_mutex);
+
+ *newuser = user;
+ return 0;
+put_user_lock:
+ mutex_unlock(&event_mutex);
+put_user:
+ user_event_destroy_fields(user);
+ user_event_destroy_validators(user);
+ kfree(user);
+ return ret;
+}
+
+/*
+ * Deletes a previously created event if it is no longer being used.
+ */
+static int delete_user_event(char *name)
+{
+ u32 key;
+ int ret;
+ struct user_event *user = find_user_event(name, &key);
+
+ if (!user)
+ return -ENOENT;
+
+ /* Ensure we are the last ref */
+ if (atomic_read(&user->refcnt) != 1) {
+ ret = -EBUSY;
+ goto put_ref;
+ }
+
+ ret = destroy_user_event(user);
+
+ if (ret)
+ goto put_ref;
+
+ return ret;
+put_ref:
+ /* No longer have this ref */
+ atomic_dec(&user->refcnt);
+
+ return ret;
+}
+
+/*
+ * Validates the user payload and writes via iterator.
+ */
+static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
+{
+ struct user_event_refs *refs;
+ struct user_event *user = NULL;
+ struct tracepoint *tp;
+ ssize_t ret = i->count;
+ int idx;
+
+ if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
+ return -EFAULT;
+
+ rcu_read_lock_sched();
+
+ refs = rcu_dereference_sched(file->private_data);
+
+ /*
+ * The refs->events array is protected by RCU, and new items may be
+ * added. But the user retrieved from indexing into the events array
+ * shall be immutable while the file is opened.
+ */
+ if (likely(refs && idx < refs->count))
+ user = refs->events[idx];
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(user == NULL))
+ return -ENOENT;
+
+ if (unlikely(i->count < user->min_size))
+ return -EINVAL;
+
+ tp = &user->tracepoint;
+
+ /*
+ * It's possible key.enabled disables after this check, however
+ * we don't mind if a few events are included in this condition.
+ */
+ if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+ struct iov_iter copy;
+ void *tpdata;
+ bool faulted;
+
+ if (unlikely(fault_in_iov_iter_readable(i, i->count)))
+ return -EFAULT;
+
+ faulted = false;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ copy = *i;
+ probe_func = probe_func_ptr->func;
+ tpdata = probe_func_ptr->data;
+ probe_func(user, &copy, tpdata, &faulted);
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(faulted))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct iovec iov;
+ struct iov_iter i;
+
+ if (unlikely(*ppos != 0))
+ return -EFAULT;
+
+ if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+ return -EFAULT;
+
+ return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+ return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct file *file, struct user_event *user)
+{
+ struct user_event_refs *refs, *new_refs;
+ int i, size, count = 0;
+
+ refs = rcu_dereference_protected(file->private_data,
+ lockdep_is_held(&reg_mutex));
+
+ if (refs) {
+ count = refs->count;
+
+ for (i = 0; i < count; ++i)
+ if (refs->events[i] == user)
+ return i;
+ }
+
+ size = struct_size(refs, events, count + 1);
+
+ new_refs = kzalloc(size, GFP_KERNEL);
+
+ if (!new_refs)
+ return -ENOMEM;
+
+ new_refs->count = count + 1;
+
+ for (i = 0; i < count; ++i)
+ new_refs->events[i] = refs->events[i];
+
+ new_refs->events[i] = user;
+
+ atomic_inc(&user->refcnt);
+
+ rcu_assign_pointer(file->private_data, new_refs);
+
+ if (refs)
+ kfree_rcu(refs, rcu);
+
+ return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+{
+ struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+ struct user_reg reg;
+ struct user_event *user;
+ char *name;
+ long ret;
+
+ ret = user_reg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+ MAX_EVENT_DESC);
+
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ return ret;
+ }
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ ret = user_events_ref_add(file, user);
+
+ /* No longer need parse ref, ref_add either worked or not */
+ atomic_dec(&user->refcnt);
+
+ /* Positive number is index and valid */
+ if (ret < 0)
+ return ret;
+
+ put_user((u32)ret, &ureg->write_index);
+ put_user(user->index, &ureg->status_index);
+
+ return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+{
+ void __user *ubuf = (void __user *)uarg;
+ char *name;
+ long ret;
+
+ name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ /* event_mutex prevents dyn_event from racing */
+ mutex_lock(&event_mutex);
+ ret = delete_user_event(name);
+ mutex_unlock(&event_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+ unsigned long uarg)
+{
+ long ret = -ENOTTY;
+
+ switch (cmd) {
+ case DIAG_IOCSREG:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_reg(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+
+ case DIAG_IOCSDEL:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_del(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+ struct user_event_refs *refs;
+ struct user_event *user;
+ int i;
+
+ /*
+ * Ensure refs cannot change under any situation by taking the
+ * register mutex during the final freeing of the references.
+ */
+ mutex_lock(&reg_mutex);
+
+ refs = file->private_data;
+
+ if (!refs)
+ goto out;
+
+ /*
+ * The lifetime of refs has reached an end, it's tied to this file.
+ * The underlying user_events are ref counted, and cannot be freed.
+ * After this decrement, the user_events may be freed elsewhere.
+ */
+ for (i = 0; i < refs->count; ++i) {
+ user = refs->events[i];
+
+ if (user)
+ atomic_dec(&user->refcnt);
+ }
+out:
+ file->private_data = NULL;
+
+ mutex_unlock(&reg_mutex);
+
+ kfree(refs);
+
+ return 0;
+}
+
+static const struct file_operations user_data_fops = {
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
+ .unlocked_ioctl = user_events_ioctl,
+ .release = user_events_release,
+};
+
+/*
+ * Maps the shared page into the user process for checking if event is enabled.
+ */
+static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (size != MAX_EVENTS)
+ return -EINVAL;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ virt_to_phys(register_page_data) >> PAGE_SHIFT,
+ size, vm_get_page_prot(VM_READ));
+}
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+ struct user_event *user;
+ char status;
+ int i, active = 0, busy = 0, flags;
+
+ mutex_lock(&reg_mutex);
+
+ hash_for_each(register_table, i, user, node) {
+ status = register_page_data[user->index];
+ flags = user->flags;
+
+ seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+
+ if (flags != 0 || status != 0)
+ seq_puts(m, " #");
+
+ if (status != 0) {
+ seq_puts(m, " Used by");
+ if (status & EVENT_STATUS_FTRACE)
+ seq_puts(m, " ftrace");
+ if (status & EVENT_STATUS_PERF)
+ seq_puts(m, " perf");
+ if (status & EVENT_STATUS_OTHER)
+ seq_puts(m, " other");
+ busy++;
+ }
+
+ seq_puts(m, "\n");
+ active++;
+ }
+
+ mutex_unlock(&reg_mutex);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Active: %d\n", active);
+ seq_printf(m, "Busy: %d\n", busy);
+ seq_printf(m, "Max: %ld\n", MAX_EVENTS);
+
+ return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+ return seq_open(file, &user_seq_ops);
+}
+
+static const struct file_operations user_status_fops = {
+ .open = user_status_open,
+ .mmap = user_status_mmap,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+ struct dentry *edata, *emmap;
+
+ edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+ NULL, NULL, &user_data_fops);
+
+ if (!edata) {
+ pr_warn("Could not create tracefs 'user_events_data' entry\n");
+ goto err;
+ }
+
+ /* mmap with MAP_SHARED requires writable fd */
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+ NULL, NULL, &user_status_fops);
+
+ if (!emmap) {
+ tracefs_remove(edata);
+ pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENODEV;
+}
+
+static void set_page_reservations(bool set)
+{
+ int page;
+
+ for (page = 0; page < MAX_PAGES; ++page) {
+ void *addr = register_page_data + (PAGE_SIZE * page);
+
+ if (set)
+ SetPageReserved(virt_to_page(addr));
+ else
+ ClearPageReserved(virt_to_page(addr));
+ }
+}
+
+static int __init trace_events_user_init(void)
+{
+ struct page *pages;
+ int ret;
+
+ /* Zero all bits beside 0 (which is reserved for failures) */
+ bitmap_zero(page_bitmap, MAX_EVENTS);
+ set_bit(0, page_bitmap);
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+ if (!pages)
+ return -ENOMEM;
+ register_page_data = page_address(pages);
+
+ set_page_reservations(true);
+
+ ret = create_user_tracefs();
+
+ if (ret) {
+ pr_warn("user_events could not register with tracefs\n");
+ set_page_reservations(false);
+ __free_pages(pages, MAX_PAGE_ORDER);
+ return ret;
+ }
+
+ if (dyn_event_register(&user_event_dops))
+ pr_warn("user_events could not register with dyn_events\n");
+
+ return 0;
+}
+
+fs_initcall(trace_events_user_init);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 508f14af4f2c..47cebef78532 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -32,7 +32,7 @@ static int __init set_kprobe_boot_events(char *str)
strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE);
disable_tracing_selftest("running kprobe events");
- return 0;
+ return 1;
}
__setup("kprobe_event=", set_kprobe_boot_events);
@@ -1433,7 +1433,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
fbuffer.regs = regs;
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = (unsigned long)tk->rp.kp.addr;
- entry->ret_ip = (unsigned long)ri->ret_addr;
+ entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
@@ -1628,7 +1628,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
return;
entry->func = (unsigned long)tk->rp.kp.addr;
- entry->ret_ip = (unsigned long)ri->ret_addr;
+ entry->ret_ip = get_kretprobe_retaddr(ri);
store_trace_args(&entry[1], &tk->tp, regs, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 870a08da5b48..e9ae1f33a7f0 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
* used to record the beginning and to report the end of a thread noise window.
*/
static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p,
struct task_struct *n)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
@@ -1387,6 +1389,26 @@ static int run_osnoise(void)
}
/*
+ * In some cases, notably when running on a nohz_full CPU with
+ * a stopped tick PREEMPT_RCU has no way to account for QSs.
+ * This will eventually cause unwarranted noise as PREEMPT_RCU
+ * will force preemption as the means of ending the current
+ * grace period. We avoid this problem by calling
+ * rcu_momentary_dyntick_idle(), which performs a zero duration
+ * EQS allowing PREEMPT_RCU to end the current grace period.
+ * This call shouldn't be wrapped inside an RCU critical
+ * section.
+ *
+ * Note that in non PREEMPT_RCU kernels QSs are handled through
+ * cond_resched()
+ */
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ local_irq_disable();
+ rcu_momentary_dyntick_idle();
+ local_irq_enable();
+ }
+
+ /*
* For the non-preemptive kernel config: let threads runs, if
* they so wish.
*/
@@ -1437,6 +1459,37 @@ static struct cpumask osnoise_cpumask;
static struct cpumask save_cpumask;
/*
+ * osnoise_sleep - sleep until the next period
+ */
+static void osnoise_sleep(void)
+{
+ u64 interval;
+ ktime_t wake_time;
+
+ mutex_lock(&interface_lock);
+ interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
+ mutex_unlock(&interface_lock);
+
+ /*
+ * differently from hwlat_detector, the osnoise tracer can run
+ * without a pause because preemption is on.
+ */
+ if (!interval) {
+ /* Let synchronize_rcu_tasks() make progress */
+ cond_resched_tasks_rcu_qs();
+ return;
+ }
+
+ wake_time = ktime_add_us(ktime_get(), interval);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
+ while (schedule_hrtimeout_range(&wake_time, 0, HRTIMER_MODE_ABS)) {
+ if (kthread_should_stop())
+ break;
+ }
+}
+
+/*
* osnoise_main - The osnoise detection kernel thread
*
* Calls run_osnoise() function to measure the osnoise for the configured runtime,
@@ -1444,30 +1497,10 @@ static struct cpumask save_cpumask;
*/
static int osnoise_main(void *data)
{
- u64 interval;
while (!kthread_should_stop()) {
-
run_osnoise();
-
- mutex_lock(&interface_lock);
- interval = osnoise_data.sample_period - osnoise_data.sample_runtime;
- mutex_unlock(&interface_lock);
-
- do_div(interval, USEC_PER_MSEC);
-
- /*
- * differently from hwlat_detector, the osnoise tracer can run
- * without a pause because preemption is on.
- */
- if (interval < 1) {
- /* Let synchronize_rcu_tasks() make progress */
- cond_resched_tasks_rcu_qs();
- continue;
- }
-
- if (msleep_interruptible(interval))
- break;
+ osnoise_sleep();
}
return 0;
@@ -2189,6 +2222,17 @@ static void osnoise_workload_stop(void)
if (osnoise_has_registered_instances())
return;
+ /*
+ * If callbacks were already disabled in a previous stop
+ * call, there is no need to disable then again.
+ *
+ * For instance, this happens when tracing is stopped via:
+ * echo 0 > tracing_on
+ * echo nop > current_tracer.
+ */
+ if (!trace_osnoise_callback_enabled)
+ return;
+
trace_osnoise_callback_enabled = false;
/*
* Make sure that ftrace_nmi_enter/exit() see
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 73d90179b51b..80863c6508e5 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -871,15 +871,15 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
switch (ptype) {
case PROBE_PRINT_NORMAL:
fmt = "(%lx)";
- arg = "REC->" FIELD_STRING_IP;
+ arg = ", REC->" FIELD_STRING_IP;
break;
case PROBE_PRINT_RETURN:
fmt = "(%lx <- %lx)";
- arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ arg = ", REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
break;
case PROBE_PRINT_EVENT:
- fmt = "(%u)";
- arg = "REC->" FIELD_STRING_TYPE;
+ fmt = "";
+ arg = "";
break;
default:
WARN_ON_ONCE(1);
@@ -903,7 +903,7 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
parg->type->fmt);
}
- pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", arg);
for (i = 0; i < tp->nr_args; i++) {
parg = tp->args + i;
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 99e7a5df025e..92cc149af0fd 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -38,7 +38,6 @@
#define FIELD_STRING_IP "__probe_ip"
#define FIELD_STRING_RETIP "__probe_ret_ip"
#define FIELD_STRING_FUNC "__probe_func"
-#define FIELD_STRING_TYPE "__probe_type"
#undef DEFINE_FIELD
#define DEFINE_FIELD(type, item, name, is_signed) \
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..45796d8bd4b2 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
int flags;
@@ -44,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee)
if (!flags)
return;
- tracing_record_taskinfo(current, flags);
+ tracing_record_taskinfo_sched_switch(current, wakee, flags);
}
static int tracing_sched_register(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2402de520eca..46429f9a96fa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index afd937a46496..abcadbe933bb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -784,9 +784,7 @@ static struct fgraph_ops fgraph_ops __initdata = {
.retfunc = &trace_graph_return,
};
-#if defined(CONFIG_DYNAMIC_FTRACE) && \
- defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
-#define TEST_DIRECT_TRAMP
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
noinline __noclone static void trace_direct_tramp(void) { }
#endif
@@ -849,7 +847,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
goto out;
}
-#ifdef TEST_DIRECT_TRAMP
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
tracing_reset_online_cpus(&tr->array_buffer);
set_graph_array(tr);
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 7b32c356ebc5..06ea04d44685 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -190,6 +190,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
kfree(new);
} else {
hlist_add_head(&new->node, hashent);
+ get_user_ns(new->ns);
spin_unlock_irq(&ucounts_lock);
return new;
}
@@ -210,6 +211,7 @@ void put_ucounts(struct ucounts *ucounts)
if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
hlist_del_init(&ucounts->node);
spin_unlock_irqrestore(&ucounts_lock, flags);
+ put_user_ns(ucounts->ns);
kfree(ucounts);
}
}
@@ -348,7 +350,8 @@ bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsign
if (rlimit > LONG_MAX)
max = LONG_MAX;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- if (get_ucounts_value(iter, type) > max)
+ long val = get_ucounts_value(iter, type);
+ if (val < 0 || val > max)
return true;
max = READ_ONCE(iter->ns->ucount_max[type]);
}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 6b2e3ca7ee99..5481ba44a8d6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -58,6 +58,18 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->user_ns = user_ns;
}
+static unsigned long enforced_nproc_rlimit(void)
+{
+ unsigned long limit = RLIM_INFINITY;
+
+ /* Is RLIMIT_NPROC currently enforced? */
+ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) ||
+ (current_user_ns() != &init_user_ns))
+ limit = rlimit(RLIMIT_NPROC);
+
+ return limit;
+}
+
/*
* Create a new user namespace, deriving the creator from the user in the
* passed credentials, and replacing that user with the new root user for the
@@ -122,7 +134,7 @@ int create_user_ns(struct cred *new)
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
ns->ucount_max[i] = INT_MAX;
}
- set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+ set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 9c9eb20dd2c5..230038d4f908 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -54,6 +54,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
bit += page->index;
set_bit(bit, wqueue->notes_bitmap);
+ generic_pipe_buf_release(pipe, buf);
}
// No try_steal function => no stealing
@@ -112,7 +113,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
buf->offset = offset;
buf->len = len;
buf->flags = PIPE_BUF_FLAG_WHOLE;
- pipe->head = head + 1;
+ smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */
if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
spin_unlock_irq(&pipe->rd_wait.lock);
@@ -219,7 +220,6 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
struct page **pages;
unsigned long *bitmap;
unsigned long user_bufs;
- unsigned int bmsize;
int ret, i, nr_pages;
if (!wqueue)
@@ -243,7 +243,8 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
goto error;
}
- ret = pipe_resize_ring(pipe, nr_notes);
+ nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes));
if (ret < 0)
goto error;
@@ -258,21 +259,19 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
}
- bmsize = (nr_notes + BITS_PER_LONG - 1) / BITS_PER_LONG;
- bmsize *= sizeof(unsigned long);
- bitmap = kmalloc(bmsize, GFP_KERNEL);
+ bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);
if (!bitmap)
goto error_p;
- memset(bitmap, 0xff, bmsize);
+ bitmap_fill(bitmap, nr_notes);
wqueue->notes = pages;
wqueue->notes_bitmap = bitmap;
wqueue->nr_pages = nr_pages;
- wqueue->nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
+ wqueue->nr_notes = nr_notes;
return 0;
error_p:
- for (i = 0; i < nr_pages; i++)
+ while (--i >= 0)
__free_page(pages[i]);
kfree(pages);
error:
@@ -320,7 +319,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
tf[i].info_mask & WATCH_INFO_LENGTH)
goto err_filter;
/* Ignore any unknown types */
- if (tf[i].type >= sizeof(wfilter->type_filter) * 8)
+ if (tf[i].type >= WATCH_TYPE__NR)
continue;
nr_filter++;
}
@@ -336,7 +335,7 @@ long watch_queue_set_filter(struct pipe_inode_info *pipe,
q = wfilter->filters;
for (i = 0; i < filter.nr_filters; i++) {
- if (tf[i].type >= sizeof(wfilter->type_filter) * BITS_PER_LONG)
+ if (tf[i].type >= WATCH_TYPE__NR)
continue;
q->type = tf[i].type;
@@ -371,6 +370,8 @@ static void __put_watch_queue(struct kref *kref)
for (i = 0; i < wqueue->nr_pages; i++)
__free_page(wqueue->notes[i]);
+ kfree(wqueue->notes);
+ bitmap_free(wqueue->notes_bitmap);
wfilter = rcu_access_pointer(wqueue->filter);
if (wfilter)
@@ -395,6 +396,7 @@ static void free_watch(struct rcu_head *rcu)
put_watch_queue(rcu_access_pointer(watch->queue));
atomic_dec(&watch->cred->user->nr_watches);
put_cred(watch->cred);
+ kfree(watch);
}
static void __put_watch(struct kref *kref)
@@ -566,7 +568,7 @@ void watch_queue_clear(struct watch_queue *wqueue)
rcu_read_lock();
spin_lock_bh(&wqueue->lock);
- /* Prevent new additions and prevent notifications from happening */
+ /* Prevent new notifications from being stored. */
wqueue->defunct = true;
while (!hlist_empty(&wqueue->watches)) {
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 99afb88d2e85..9166220457bc 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -848,7 +848,7 @@ void __init lockup_detector_init(void)
pr_info("Disabling watchdog on nohz_full cores by default\n");
cpumask_copy(&watchdog_cpumask,
- housekeeping_cpumask(HK_FLAG_TIMER));
+ housekeeping_cpumask(HK_TYPE_TIMER));
if (!watchdog_nmi_probe())
nmi_watchdog_available = true;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33f1106b4f99..0d2514b4ff0d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -154,15 +154,20 @@ struct worker_pool {
unsigned long watchdog_ts; /* L: watchdog timestamp */
- /* The current concurrency level. */
- atomic_t nr_running;
+ /*
+ * The counter is incremented in a process context on the associated CPU
+ * w/ preemption disabled, and decremented or reset in the same context
+ * but w/ pool->lock held. The readers grab pool->lock and are
+ * guaranteed to see if the counter reached zero.
+ */
+ int nr_running;
struct list_head worklist; /* L: list of pending works */
int nr_workers; /* L: total number of workers */
int nr_idle; /* L: currently idle workers */
- struct list_head idle_list; /* X: list of idle workers */
+ struct list_head idle_list; /* L: list of idle workers */
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
@@ -777,7 +782,7 @@ static bool work_is_canceling(struct work_struct *work)
static bool __need_more_worker(struct worker_pool *pool)
{
- return !atomic_read(&pool->nr_running);
+ return !pool->nr_running;
}
/*
@@ -802,8 +807,7 @@ static bool may_start_working(struct worker_pool *pool)
/* Do I need to keep working? Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
- return !list_empty(&pool->worklist) &&
- atomic_read(&pool->nr_running) <= 1;
+ return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}
/* Do we need a new worker? Called from manager. */
@@ -826,7 +830,7 @@ static bool too_many_workers(struct worker_pool *pool)
* Wake up functions.
*/
-/* Return the first idle worker. Safe with preemption disabled */
+/* Return the first idle worker. Called with pool->lock held. */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
if (unlikely(list_empty(&pool->idle_list)))
@@ -873,7 +877,7 @@ void wq_worker_running(struct task_struct *task)
*/
preempt_disable();
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(&worker->pool->nr_running);
+ worker->pool->nr_running++;
preempt_enable();
worker->sleeping = 0;
}
@@ -887,7 +891,7 @@ void wq_worker_running(struct task_struct *task)
*/
void wq_worker_sleeping(struct task_struct *task)
{
- struct worker *next, *worker = kthread_data(task);
+ struct worker *worker = kthread_data(task);
struct worker_pool *pool;
/*
@@ -917,23 +921,9 @@ void wq_worker_sleeping(struct task_struct *task)
return;
}
- /*
- * The counterpart of the following dec_and_test, implied mb,
- * worklist not empty test sequence is in insert_work().
- * Please read comment there.
- *
- * NOT_RUNNING is clear. This means that we're bound to and
- * running on the local cpu w/ rq lock held and preemption
- * disabled, which in turn means that none else could be
- * manipulating idle_list, so dereferencing idle_list without pool
- * lock is safe.
- */
- if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist)) {
- next = first_idle_worker(pool);
- if (next)
- wake_up_process(next->task);
- }
+ pool->nr_running--;
+ if (need_more_worker(pool))
+ wake_up_worker(pool);
raw_spin_unlock_irq(&pool->lock);
}
@@ -987,7 +977,7 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
/* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- atomic_dec(&pool->nr_running);
+ pool->nr_running--;
}
worker->flags |= flags;
@@ -1019,7 +1009,7 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
*/
if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
if (!(worker->flags & WORKER_NOT_RUNNING))
- atomic_inc(&pool->nr_running);
+ pool->nr_running++;
}
/**
@@ -1372,13 +1362,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
list_add_tail(&work->entry, head);
get_pwq(pwq);
- /*
- * Ensure either wq_worker_sleeping() sees the above
- * list_add_tail() or we see zero nr_running to avoid workers lying
- * around lazily while there are works to be processed.
- */
- smp_mb();
-
if (__need_more_worker(pool))
wake_up_worker(pool);
}
@@ -1827,8 +1810,7 @@ static void worker_enter_idle(struct worker *worker)
mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);
/* Sanity check nr_running. */
- WARN_ON_ONCE(pool->nr_workers == pool->nr_idle &&
- atomic_read(&pool->nr_running));
+ WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
}
/**
@@ -5006,7 +4988,7 @@ static void unbind_workers(int cpu)
* an unbound (in terms of concurrency management) pool which
* are served by workers tied to the pool.
*/
- atomic_set(&pool->nr_running, 0);
+ pool->nr_running = 0;
/*
* With concurrency management just turned off, a busy
@@ -6006,13 +5988,13 @@ static void __init wq_numa_init(void)
void __init workqueue_init_early(void)
{
int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
- int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
int i, cpu;
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
- cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
+ cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
+ cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);