Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore9
-rw-r--r--kernel/Kconfig.locks2
-rw-r--r--kernel/Kconfig.preempt57
-rw-r--r--kernel/Makefile44
-rw-r--r--kernel/acct.c26
-rw-r--r--kernel/async.c66
-rw-r--r--kernel/audit.c209
-rw-r--r--kernel/audit.h42
-rw-r--r--kernel/audit_fsnotify.c32
-rw-r--r--kernel/audit_tree.c53
-rw-r--r--kernel/audit_watch.c35
-rw-r--r--kernel/auditfilter.c39
-rw-r--r--kernel/auditsc.c693
-rw-r--r--kernel/backtracetest.c2
-rw-r--r--kernel/bpf/Kconfig96
-rw-r--r--kernel/bpf/Makefile24
-rw-r--r--kernel/bpf/arraymap.c377
-rw-r--r--kernel/bpf/bloom_filter.c210
-rw-r--r--kernel/bpf/bpf_inode_storage.c283
-rw-r--r--kernel/bpf/bpf_iter.c751
-rw-r--r--kernel/bpf/bpf_local_storage.c631
-rw-r--r--kernel/bpf/bpf_lru_list.c7
-rw-r--r--kernel/bpf/bpf_lru_list.h2
-rw-r--r--kernel/bpf/bpf_lsm.c228
-rw-r--r--kernel/bpf/bpf_struct_ops.c107
-rw-r--r--kernel/bpf/bpf_struct_ops_types.h3
-rw-r--r--kernel/bpf/bpf_task_storage.c342
-rw-r--r--kernel/bpf/btf.c3115
-rw-r--r--kernel/bpf/cgroup.c1041
-rw-r--r--kernel/bpf/core.c621
-rw-r--r--kernel/bpf/cpumap.c410
-rw-r--r--kernel/bpf/devmap.c594
-rw-r--r--kernel/bpf/disasm.c74
-rw-r--r--kernel/bpf/disasm.h2
-rw-r--r--kernel/bpf/dispatcher.c5
-rw-r--r--kernel/bpf/hashtab.c874
-rw-r--r--kernel/bpf/helpers.c990
-rw-r--r--kernel/bpf/inode.c176
-rw-r--r--kernel/bpf/local_storage.c281
-rw-r--r--kernel/bpf/lpm_trie.c55
-rw-r--r--kernel/bpf/map_in_map.c34
-rw-r--r--kernel/bpf/map_in_map.h2
-rw-r--r--kernel/bpf/map_iter.c195
-rw-r--r--kernel/bpf/mmap_unlock_work.h65
-rw-r--r--kernel/bpf/net_namespace.c567
-rw-r--r--kernel/bpf/percpu_freelist.c121
-rw-r--r--kernel/bpf/percpu_freelist.h1
-rw-r--r--kernel/bpf/preload/.gitignore2
-rw-r--r--kernel/bpf/preload/Kconfig27
-rw-r--r--kernel/bpf/preload/Makefile42
-rw-r--r--kernel/bpf/preload/bpf_preload.h16
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c102
-rw-r--r--kernel/bpf/preload/bpf_preload_umd_blob.S7
-rw-r--r--kernel/bpf/preload/iterators/.gitignore2
-rw-r--r--kernel/bpf/preload/iterators/Makefile69
-rw-r--r--kernel/bpf/preload/iterators/README4
-rw-r--r--kernel/bpf/preload/iterators/bpf_preload_common.h13
-rw-r--r--kernel/bpf/preload/iterators/iterators.bpf.c113
-rw-r--r--kernel/bpf/preload/iterators/iterators.c94
-rw-r--r--kernel/bpf/preload/iterators/iterators.skel.h412
-rw-r--r--kernel/bpf/prog_iter.c107
-rw-r--r--kernel/bpf/queue_stack_maps.c35
-rw-r--r--kernel/bpf/reuseport_array.c49
-rw-r--r--kernel/bpf/ringbuf.c477
-rw-r--r--kernel/bpf/stackmap.c502
-rw-r--r--kernel/bpf/syscall.c2529
-rw-r--r--kernel/bpf/sysfs_btf.c15
-rw-r--r--kernel/bpf/task_iter.c670
-rw-r--r--kernel/bpf/tnum.c56
-rw-r--r--kernel/bpf/trampoline.c535
-rw-r--r--kernel/bpf/verifier.c7896
-rw-r--r--kernel/bpf/xskmap.c265
-rw-r--r--kernel/capability.c18
-rw-r--r--kernel/cfi.c329
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-internal.h19
-rw-r--r--kernel/cgroup/cgroup-v1.c119
-rw-r--r--kernel/cgroup/cgroup.c1030
-rw-r--r--kernel/cgroup/cpuset.c375
-rw-r--r--kernel/cgroup/misc.c424
-rw-r--r--kernel/cgroup/namespace.c9
-rw-r--r--kernel/cgroup/pids.c15
-rw-r--r--kernel/cgroup/rdma.c2
-rw-r--r--kernel/cgroup/rstat.c164
-rw-r--r--kernel/compat.c39
-rw-r--r--kernel/configs/android-base.config1
-rw-r--r--kernel/configs/android-recommended.config2
-rw-r--r--kernel/configs/tiny-base.config1
-rw-r--r--kernel/configs/tiny.config1
-rw-r--r--kernel/context_tracking.c16
-rw-r--r--kernel/cpu.c553
-rw-r--r--kernel/cpu_pm.c92
-rw-r--r--kernel/crash_core.c22
-rw-r--r--kernel/crash_dump.c6
-rw-r--r--kernel/cred.c60
-rw-r--r--kernel/debug/debug_core.c182
-rw-r--r--kernel/debug/gdbstub.c43
-rw-r--r--kernel/debug/kdb/.gitignore1
-rw-r--r--kernel/debug/kdb/kdb_bp.c84
-rw-r--r--kernel/debug/kdb/kdb_bt.c35
-rw-r--r--kernel/debug/kdb/kdb_debugger.c3
-rw-r--r--kernel/debug/kdb/kdb_io.c82
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c4
-rw-r--r--kernel/debug/kdb/kdb_main.c936
-rw-r--r--kernel/debug/kdb/kdb_private.h41
-rw-r--r--kernel/debug/kdb/kdb_support.c512
-rw-r--r--kernel/delayacct.c71
-rw-r--r--kernel/dma/Kconfig90
-rw-r--r--kernel/dma/Makefile7
-rw-r--r--kernel/dma/coherent.c204
-rw-r--r--kernel/dma/contiguous.c176
-rw-r--r--kernel/dma/debug.c159
-rw-r--r--kernel/dma/debug.h130
-rw-r--r--kernel/dma/direct.c455
-rw-r--r--kernel/dma/direct.h119
-rw-r--r--kernel/dma/dummy.c5
-rw-r--r--kernel/dma/map_benchmark.c382
-rw-r--r--kernel/dma/mapping.c522
-rw-r--r--kernel/dma/ops_helpers.c93
-rw-r--r--kernel/dma/pool.c295
-rw-r--r--kernel/dma/remap.c171
-rw-r--r--kernel/dma/swiotlb.c886
-rw-r--r--kernel/dma/virt.c59
-rw-r--r--kernel/elfcore.c26
-rw-r--r--kernel/entry/Makefile13
-rw-r--r--kernel/entry/common.c477
-rw-r--r--kernel/entry/common.h7
-rw-r--r--kernel/entry/kvm.c52
-rw-r--r--kernel/entry/syscall_user_dispatch.c108
-rw-r--r--kernel/events/Makefile5
-rw-r--r--kernel/events/callchain.c22
-rw-r--r--kernel/events/core.c1787
-rw-r--r--kernel/events/hw_breakpoint.c22
-rw-r--r--kernel/events/internal.h13
-rw-r--r--kernel/events/ring_buffer.c63
-rw-r--r--kernel/events/uprobes.c154
-rw-r--r--kernel/exit.c301
-rw-r--r--kernel/extable.c40
-rw-r--r--kernel/fail_function.c11
-rw-r--r--kernel/fork.c567
-rw-r--r--kernel/freezer.c2
-rw-r--r--kernel/futex.c4200
-rw-r--r--kernel/futex/Makefile3
-rw-r--r--kernel/futex/core.c1141
-rw-r--r--kernel/futex/futex.h293
-rw-r--r--kernel/futex/pi.c1233
-rw-r--r--kernel/futex/requeue.c897
-rw-r--r--kernel/futex/syscalls.c376
-rw-r--r--kernel/futex/waitwake.c708
-rw-r--r--kernel/gcov/Kconfig32
-rw-r--r--kernel/gcov/Makefile3
-rw-r--r--kernel/gcov/base.c49
-rw-r--r--kernel/gcov/clang.c220
-rw-r--r--kernel/gcov/fs.c114
-rw-r--r--kernel/gcov/gcc_3_4.c573
-rw-r--r--kernel/gcov/gcc_4_7.c189
-rw-r--r--kernel/gcov/gcov.h14
-rwxr-xr-xkernel/gen_kheaders.sh8
-rw-r--r--kernel/groups.c9
-rw-r--r--kernel/hung_task.c36
-rw-r--r--kernel/irq/Kconfig35
-rw-r--r--kernel/irq/affinity.c8
-rw-r--r--kernel/irq/chip.c50
-rw-r--r--kernel/irq/cpuhotplug.c2
-rw-r--r--kernel/irq/debugfs.c37
-rw-r--r--kernel/irq/dummychip.c2
-rw-r--r--kernel/irq/generic-chip.c23
-rw-r--r--kernel/irq/handle.c38
-rw-r--r--kernel/irq/internals.h21
-rw-r--r--kernel/irq/ipi.c34
-rw-r--r--kernel/irq/irq_sim.c274
-rw-r--r--kernel/irq/irqdesc.c209
-rw-r--r--kernel/irq/irqdomain.c440
-rw-r--r--kernel/irq/manage.c309
-rw-r--r--kernel/irq/matrix.c21
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/msi.c289
-rw-r--r--kernel/irq/pm.c44
-rw-r--r--kernel/irq/proc.c13
-rw-r--r--kernel/irq/resend.c158
-rw-r--r--kernel/irq/settings.h19
-rw-r--r--kernel/irq/spurious.c12
-rw-r--r--kernel/irq/timings.c17
-rw-r--r--kernel/irq_work.c194
-rw-r--r--kernel/jump_label.c53
-rw-r--r--kernel/kallsyms.c256
-rw-r--r--kernel/kcmp.c59
-rw-r--r--kernel/kcov.c288
-rw-r--r--kernel/kcsan/Makefile20
-rw-r--r--kernel/kcsan/core.c1310
-rw-r--r--kernel/kcsan/debugfs.c275
-rw-r--r--kernel/kcsan/encoding.h102
-rw-r--r--kernel/kcsan/kcsan.h142
-rw-r--r--kernel/kcsan/kcsan_test.c1617
-rw-r--r--kernel/kcsan/permissive.h94
-rw-r--r--kernel/kcsan/report.c716
-rw-r--r--kernel/kcsan/selftest.c272
-rw-r--r--kernel/kexec.c105
-rw-r--r--kernel/kexec_core.c15
-rw-r--r--kernel/kexec_file.c133
-rw-r--r--kernel/kexec_internal.h2
-rw-r--r--kernel/kmod.c13
-rw-r--r--kernel/kprobes.c1082
-rw-r--r--kernel/kthread.c358
-rw-r--r--kernel/latencytop.c4
-rw-r--r--kernel/livepatch/Kconfig2
-rw-r--r--kernel/livepatch/core.c185
-rw-r--r--kernel/livepatch/patch.c23
-rw-r--r--kernel/livepatch/state.c2
-rw-r--r--kernel/livepatch/transition.c104
-rw-r--r--kernel/locking/Makefile9
-rw-r--r--kernel/locking/irqflag-debug.c13
-rw-r--r--kernel/locking/lock_events_list.h6
-rw-r--r--kernel/locking/lockdep.c2185
-rw-r--r--kernel/locking/lockdep_internals.h31
-rw-r--r--kernel/locking/lockdep_proc.c59
-rw-r--r--kernel/locking/locktorture.c247
-rw-r--r--kernel/locking/mcs_spinlock.h2
-rw-r--r--kernel/locking/mutex-debug.c11
-rw-r--r--kernel/locking/mutex-debug.h29
-rw-r--r--kernel/locking/mutex.c666
-rw-r--r--kernel/locking/mutex.h50
-rw-r--r--kernel/locking/osq_lock.c10
-rw-r--r--kernel/locking/percpu-rwsem.c195
-rw-r--r--kernel/locking/qrwlock.c14
-rw-r--r--kernel/locking/qspinlock.c7
-rw-r--r--kernel/locking/rtmutex-debug.c182
-rw-r--r--kernel/locking/rtmutex-debug.h37
-rw-r--r--kernel/locking/rtmutex.c1401
-rw-r--r--kernel/locking/rtmutex.h35
-rw-r--r--kernel/locking/rtmutex_api.c612
-rw-r--r--kernel/locking/rtmutex_common.h207
-rw-r--r--kernel/locking/rwbase_rt.c291
-rw-r--r--kernel/locking/rwsem.c758
-rw-r--r--kernel/locking/rwsem.h10
-rw-r--r--kernel/locking/semaphore.c6
-rw-r--r--kernel/locking/spinlock.c14
-rw-r--r--kernel/locking/spinlock_debug.c11
-rw-r--r--kernel/locking/spinlock_rt.c268
-rw-r--r--kernel/locking/test-ww_mutex.c87
-rw-r--r--kernel/locking/ww_mutex.h569
-rw-r--r--kernel/locking/ww_rt_mutex.c101
-rw-r--r--kernel/module.c1153
-rw-r--r--kernel/module_signature.c2
-rw-r--r--kernel/module_signing.c2
-rw-r--r--kernel/notifier.c141
-rw-r--r--kernel/nsproxy.c337
-rw-r--r--kernel/padata.c497
-rw-r--r--kernel/panic.c68
-rw-r--r--kernel/params.c47
-rw-r--r--kernel/pid.c132
-rw-r--r--kernel/pid_namespace.c65
-rw-r--r--kernel/power/Kconfig61
-rw-r--r--kernel/power/Makefile3
-rw-r--r--kernel/power/autosleep.c2
-rw-r--r--kernel/power/energy_model.c396
-rw-r--r--kernel/power/hibernate.c195
-rw-r--r--kernel/power/main.c19
-rw-r--r--kernel/power/power.h26
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/process.c8
-rw-r--r--kernel/power/qos.c563
-rw-r--r--kernel/power/snapshot.c91
-rw-r--r--kernel/power/suspend.c38
-rw-r--r--kernel/power/suspend_test.c2
-rw-r--r--kernel/power/swap.c81
-rw-r--r--kernel/power/user.c151
-rw-r--r--kernel/printk/Makefile2
-rw-r--r--kernel/printk/console_cmdline.h1
-rw-r--r--kernel/printk/index.c194
-rw-r--r--kernel/printk/internal.h41
-rw-r--r--kernel/printk/printk.c2273
-rw-r--r--kernel/printk/printk_ringbuffer.c2082
-rw-r--r--kernel/printk/printk_ringbuffer.h382
-rw-r--r--kernel/printk/printk_safe.c389
-rw-r--r--kernel/profile.c23
-rw-r--r--kernel/ptrace.c89
-rw-r--r--kernel/range.c3
-rw-r--r--kernel/rcu/Kconfig99
-rw-r--r--kernel/rcu/Kconfig.debug40
-rw-r--r--kernel/rcu/Makefile7
-rw-r--r--kernel/rcu/rcu.h78
-rw-r--r--kernel/rcu/rcu_segcblist.c231
-rw-r--r--kernel/rcu/rcu_segcblist.h59
-rw-r--r--kernel/rcu/rcuscale.c (renamed from kernel/rcu/rcuperf.c)430
-rw-r--r--kernel/rcu/rcutorture.c1373
-rw-r--r--kernel/rcu/refscale.c878
-rw-r--r--kernel/rcu/srcutiny.c81
-rw-r--r--kernel/rcu/srcutree.c219
-rw-r--r--kernel/rcu/sync.c4
-rw-r--r--kernel/rcu/tasks.h1721
-rw-r--r--kernel/rcu/tiny.c48
-rw-r--r--kernel/rcu/tree.c1929
-rw-r--r--kernel/rcu/tree.h73
-rw-r--r--kernel/rcu/tree_exp.h87
-rw-r--r--kernel/rcu/tree_nocb.h1548
-rw-r--r--kernel/rcu/tree_plugin.h1563
-rw-r--r--kernel/rcu/tree_stall.h428
-rw-r--r--kernel/rcu/update.c460
-rw-r--r--kernel/reboot.c333
-rw-r--r--kernel/regset.c76
-rw-r--r--kernel/relay.c145
-rw-r--r--kernel/resource.c484
-rw-r--r--kernel/resource_kunit.c152
-rw-r--r--kernel/rseq.c43
-rw-r--r--kernel/scftorture.c659
-rw-r--r--kernel/sched/Makefile10
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/completion.c36
-rw-r--r--kernel/sched/core.c4663
-rw-r--r--kernel/sched/core_sched.c298
-rw-r--r--kernel/sched/cpuacct.c120
-rw-r--r--kernel/sched/cpudeadline.c28
-rw-r--r--kernel/sched/cpufreq_schedutil.c322
-rw-r--r--kernel/sched/cpupri.c206
-rw-r--r--kernel/sched/cpupri.h14
-rw-r--r--kernel/sched/cputime.c145
-rw-r--r--kernel/sched/deadline.c608
-rw-r--r--kernel/sched/debug.c613
-rw-r--r--kernel/sched/fair.c3264
-rw-r--r--kernel/sched/features.h16
-rw-r--r--kernel/sched/idle.c116
-rw-r--r--kernel/sched/isolation.c28
-rw-r--r--kernel/sched/loadavg.c6
-rw-r--r--kernel/sched/membarrier.c334
-rw-r--r--kernel/sched/pelt.c122
-rw-r--r--kernel/sched/pelt.h49
-rw-r--r--kernel/sched/psi.c424
-rw-r--r--kernel/sched/rt.c398
-rw-r--r--kernel/sched/sched.h1064
-rw-r--r--kernel/sched/smp.h9
-rw-r--r--kernel/sched/stats.c106
-rw-r--r--kernel/sched/stats.h160
-rw-r--r--kernel/sched/stop_task.c31
-rw-r--r--kernel/sched/swait.c15
-rw-r--r--kernel/sched/topology.c657
-rw-r--r--kernel/sched/wait.c35
-rw-r--r--kernel/scs.c154
-rw-r--r--kernel/seccomp.c892
-rw-r--r--kernel/signal.c588
-rw-r--r--kernel/smp.c813
-rw-r--r--kernel/smpboot.c12
-rw-r--r--kernel/softirq.c538
-rw-r--r--kernel/stackleak.c18
-rw-r--r--kernel/stacktrace.c43
-rw-r--r--kernel/static_call.c548
-rw-r--r--kernel/stop_machine.c30
-rw-r--r--kernel/sys.c201
-rw-r--r--kernel/sys_ni.c23
-rw-r--r--kernel/sysctl-test.c24
-rw-r--r--kernel/sysctl.c3371
-rw-r--r--kernel/sysctl_binary.c171
-rw-r--r--kernel/task_work.c95
-rw-r--r--kernel/taskstats.c42
-rw-r--r--kernel/test_kprobes.c313
-rw-r--r--kernel/time/Kconfig49
-rw-r--r--kernel/time/Makefile3
-rw-r--r--kernel/time/alarmtimer.c22
-rw-r--r--kernel/time/clockevents.c23
-rw-r--r--kernel/time/clocksource-wdtest.c201
-rw-r--r--kernel/time/clocksource.c248
-rw-r--r--kernel/time/hrtimer.c453
-rw-r--r--kernel/time/itimer.c4
-rw-r--r--kernel/time/jiffies.c46
-rw-r--r--kernel/time/namespace.c73
-rw-r--r--kernel/time/ntp.c229
-rw-r--r--kernel/time/ntp_internal.h7
-rw-r--r--kernel/time/posix-cpu-timers.c557
-rw-r--r--kernel/time/posix-timers.c17
-rw-r--r--kernel/time/sched_clock.c56
-rw-r--r--kernel/time/test_udelay.c7
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c2
-rw-r--r--kernel/time/tick-broadcast.c190
-rw-r--r--kernel/time/tick-common.c48
-rw-r--r--kernel/time/tick-internal.h38
-rw-r--r--kernel/time/tick-legacy.c37
-rw-r--r--kernel/time/tick-oneshot.c2
-rw-r--r--kernel/time/tick-sched.c292
-rw-r--r--kernel/time/tick-sched.h2
-rw-r--r--kernel/time/time.c2
-rw-r--r--kernel/time/time_test.c99
-rw-r--r--kernel/time/timeconv.c134
-rw-r--r--kernel/time/timecounter.c2
-rw-r--r--kernel/time/timekeeping.c348
-rw-r--r--kernel/time/timekeeping.h4
-rw-r--r--kernel/time/timekeeping_internal.h11
-rw-r--r--kernel/time/timer.c392
-rw-r--r--kernel/time/timer_list.c76
-rw-r--r--kernel/time/vsyscall.c53
-rw-r--r--kernel/torture.c227
-rw-r--r--kernel/trace/Kconfig246
-rw-r--r--kernel/trace/Makefile15
-rw-r--r--kernel/trace/blktrace.c394
-rw-r--r--kernel/trace/bpf_trace.c958
-rw-r--r--kernel/trace/bpf_trace.h34
-rw-r--r--kernel/trace/error_report-traces.c11
-rw-r--r--kernel/trace/fgraph.c23
-rw-r--r--kernel/trace/ftrace.c965
-rw-r--r--kernel/trace/ftrace_internal.h22
-rw-r--r--kernel/trace/pid_list.c495
-rw-r--r--kernel/trace/pid_list.h88
-rw-r--r--kernel/trace/preemptirq_delay_test.c52
-rw-r--r--kernel/trace/ring_buffer.c1436
-rw-r--r--kernel/trace/ring_buffer_benchmark.c48
-rw-r--r--kernel/trace/synth_event_gen_test.c22
-rw-r--r--kernel/trace/trace.c1866
-rw-r--r--kernel/trace/trace.h471
-rw-r--r--kernel/trace/trace_benchmark.c6
-rw-r--r--kernel/trace/trace_boot.c391
-rw-r--r--kernel/trace/trace_branch.c6
-rw-r--r--kernel/trace/trace_clock.c44
-rw-r--r--kernel/trace/trace_dynevent.c91
-rw-r--r--kernel/trace/trace_dynevent.h14
-rw-r--r--kernel/trace/trace_entries.h83
-rw-r--r--kernel/trace/trace_eprobe.c959
-rw-r--r--kernel/trace/trace_event_perf.c33
-rw-r--r--kernel/trace/trace_events.c773
-rw-r--r--kernel/trace/trace_events_filter.c45
-rw-r--r--kernel/trace/trace_events_hist.c2987
-rw-r--r--kernel/trace/trace_events_inject.c6
-rw-r--r--kernel/trace/trace_events_synth.c2245
-rw-r--r--kernel/trace/trace_events_trigger.c99
-rw-r--r--kernel/trace/trace_export.c20
-rw-r--r--kernel/trace/trace_functions.c296
-rw-r--r--kernel/trace/trace_functions_graph.c48
-rw-r--r--kernel/trace/trace_hwlat.c589
-rw-r--r--kernel/trace/trace_irqsoff.c92
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_kprobe.c254
-rw-r--r--kernel/trace/trace_mmiotrace.c16
-rw-r--r--kernel/trace/trace_osnoise.c2368
-rw-r--r--kernel/trace/trace_output.c305
-rw-r--r--kernel/trace/trace_output.h1
-rw-r--r--kernel/trace/trace_preemptirq.c47
-rw-r--r--kernel/trace/trace_printk.c23
-rw-r--r--kernel/trace/trace_probe.c134
-rw-r--r--kernel/trace/trace_probe.h35
-rw-r--r--kernel/trace/trace_probe_tmpl.h8
-rw-r--r--kernel/trace/trace_recursion_record.c236
-rw-r--r--kernel/trace/trace_sched_wakeup.c97
-rw-r--r--kernel/trace/trace_selftest.c134
-rw-r--r--kernel/trace/trace_seq.c12
-rw-r--r--kernel/trace/trace_stack.c20
-rw-r--r--kernel/trace/trace_stat.c14
-rw-r--r--kernel/trace/trace_synth.h40
-rw-r--r--kernel/trace/trace_syscalls.c20
-rw-r--r--kernel/trace/trace_uprobe.c103
-rw-r--r--kernel/trace/tracing_map.c57
-rw-r--r--kernel/trace/tracing_map.h2
-rw-r--r--kernel/tracepoint.c292
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/ucount.c204
-rw-r--r--kernel/umh.c218
-rw-r--r--kernel/up.c42
-rw-r--r--kernel/user.c32
-rw-r--r--kernel/user_namespace.c96
-rw-r--r--kernel/usermode_driver.c191
-rw-r--r--kernel/utsname.c12
-rw-r--r--kernel/utsname_sysctl.c2
-rw-r--r--kernel/watch_queue.c663
-rw-r--r--kernel/watchdog.c182
-rw-r--r--kernel/workqueue.c763
-rw-r--r--kernel/workqueue_internal.h3
464 files changed, 99989 insertions, 39215 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 34d1e77ee9df..c6b299a6b786 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -1,6 +1,3 @@
-#
-# Generated files
-#
-kheaders.md5
-timeconst.h
-hz.bc
+# SPDX-License-Identifier: GPL-2.0-only
+/config_data
+/kheaders.md5
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 3de8fd11873b..4198f0273ecd 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS
config QUEUED_RWLOCKS
def_bool y if ARCH_USE_QUEUED_RWLOCKS
- depends on SMP
+ depends on SMP && !PREEMPT_RT
config ARCH_HAS_MMIOWB
bool
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index bf82259cff96..ce77f0265660 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,11 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
+config PREEMPT_NONE_BUILD
+ bool
+
+config PREEMPT_VOLUNTARY_BUILD
+ bool
+
+config PREEMPT_BUILD
+ bool
+ select PREEMPTION
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
choice
prompt "Preemption Model"
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
+ select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -20,6 +32,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
+ select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -38,8 +51,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPTION
- select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+ select PREEMPT_BUILD
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -80,3 +92,44 @@ config PREEMPT_COUNT
config PREEMPTION
bool
select PREEMPT_COUNT
+
+config PREEMPT_DYNAMIC
+ bool "Preemption behaviour defined on boot"
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select PREEMPT_BUILD
+ default y
+ help
+ This option allows to define the preemption model on the kernel
+ command line parameter and thus override the default preemption
+ model defined during compile time.
+
+ The feature is primarily interesting for Linux distributions which
+ provide a pre-built kernel binary to reduce the number of kernel
+ flavors they offer while still offering different usecases.
+
+ The runtime overhead is negligible with HAVE_STATIC_CALL_INLINE enabled
+ but if runtime patching is not available for the specific architecture
+ then the potential overhead should be considered.
+
+ Interesting if you want the same pre-built kernel should be used for
+ both Server and Desktop workloads.
+
+config SCHED_CORE
+ bool "Core Scheduling for SMT"
+ depends on SCHED_SMT
+ help
+ This option permits Core Scheduling, a means of coordinated task
+ selection across SMT siblings. When enabled -- see
+ prctl(PR_SCHED_CORE) -- task selection ensures that all SMT siblings
+ will execute a task from the same 'core group', forcing idle when no
+ matching task is found.
+
+ Use of this feature includes:
+ - mitigation of some (not all) SMT side channels;
+ - limiting SMT interference to improve determinism and/or performance.
+
+ SCHED_CORE is default disabled. When it is enabled and unused,
+ which is the likely usage by Linux distributions, there should
+ be no measurable impact on performance.
+
+
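Two notes on the Kconfig help texts added above. The PREEMPT_DYNAMIC text describes choosing the preemption model at boot; on kernels built with this option the choice is made with the preempt= command-line parameter (none, voluntary or full). The SCHED_CORE text points to prctl(PR_SCHED_CORE); the following is a minimal userspace sketch, not part of this patch, showing how a task could create and read back a core-scheduling cookie. The PR_SCHED_CORE_* values follow recent uapi <linux/prctl.h>; the fallback defines are assumptions for older headers.

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SCHED_CORE                       /* assumed values for older headers */
    #define PR_SCHED_CORE           62
    #define PR_SCHED_CORE_GET       0
    #define PR_SCHED_CORE_CREATE    1
    #endif

    int main(void)
    {
            unsigned long cookie = 0;

            /* give the calling task (pid 0, thread scope 0) a fresh cookie */
            if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, 0, 0, 0))
                    perror("PR_SCHED_CORE_CREATE");

            /* read the cookie back; the kernel stores it through the pointer */
            if (prctl(PR_SCHED_CORE, PR_SCHED_CORE_GET, 0, 0, (unsigned long)&cookie))
                    perror("PR_SCHED_CORE_GET");
            else
                    printf("core scheduling cookie: %#lx\n", cookie);

            return 0;
    }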
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb4130ced32..186c49582f45 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,13 +5,14 @@
obj-y = fork.o exec_domain.o panic.o \
cpu.o exit.o softirq.o resource.o \
- sysctl.o sysctl_binary.o capability.o ptrace.o user.o \
+ sysctl.o capability.o ptrace.o user.o \
signal.o sys.o umh.o workqueue.o pid.o task_work.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o ucount.o
+ async.o range.o smpboot.o ucount.o regset.o
+obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
@@ -23,6 +24,9 @@ endif
# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
# in coverage traces.
KCOV_INSTRUMENT_softirq.o := n
+# Avoid KCSAN instrumentation in softirq ("No shared variables, all the data
+# are CPU local" => assume no data races), to reduce overhead in interrupts.
+KCSAN_SANITIZE_softirq.o = n
# These are called from save_stack_trace() on slub debug path,
# and produce insane amounts of uninteresting coverage.
KCOV_INSTRUMENT_module.o := n
@@ -30,11 +34,15 @@ KCOV_INSTRUMENT_extable.o := n
KCOV_INSTRUMENT_stacktrace.o := n
# Don't self-instrument.
KCOV_INSTRUMENT_kcov.o := n
+# If sanitizers detect any issues in kcov, it may lead to recursion
+# via printk, etc.
KASAN_SANITIZE_kcov.o := n
-CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+KCSAN_SANITIZE_kcov.o := n
+UBSAN_SANITIZE_kcov.o := n
+CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector
-# cond_syscall is currently not LTO compatible
-CFLAGS_sys_ni.o = $(DISABLE_LTO)
+# Don't instrument error handlers
+CFLAGS_REMOVE_cfi.o := $(CC_FLAGS_CFI)
obj-y += sched/
obj-y += locking/
@@ -44,13 +52,14 @@ obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
obj-y += dma/
+obj-y += entry/
-obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
+obj-$(CONFIG_KCMP) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
-obj-$(CONFIG_FUTEX) += futex.o
+obj-$(CONFIG_FUTEX) += futex/
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
@@ -76,7 +85,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
-obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
@@ -94,7 +102,6 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_ELFCORE) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
@@ -103,6 +110,10 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
obj-$(CONFIG_BPF) += bpf/
+obj-$(CONFIG_KCSAN) += kcsan/
+obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o
+obj-$(CONFIG_HAVE_STATIC_CALL_INLINE) += static_call.o
+obj-$(CONFIG_CFI_CLANG) += cfi.o
obj-$(CONFIG_PERF_EVENTS) += events/
@@ -115,19 +126,30 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_RSEQ) += rseq.o
+obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o
+obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o
obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o
+CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN)
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
KASAN_SANITIZE_stackleak.o := n
+KCSAN_SANITIZE_stackleak.o := n
KCOV_INSTRUMENT_stackleak.o := n
+obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o
+
$(obj)/configs.o: $(obj)/config_data.gz
-targets += config_data.gz
-$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
+targets += config_data config_data.gz
+$(obj)/config_data.gz: $(obj)/config_data FORCE
$(call if_changed,gzip)
+filechk_cat = cat $<
+
+$(obj)/config_data: $(KCONFIG_CONFIG) FORCE
+ $(call filechk,cat)
+
$(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz
quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz
diff --git a/kernel/acct.c b/kernel/acct.c
index 11ff4a596d6b..3df53cf1dcd5 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -25,7 +25,7 @@
* Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
* XTerms and EMACS are manifestations of pure evil. 21/10/98, AV.
*
- * Fixed a nasty interaction with with sys_umount(). If the accointing
+ * Fixed a nasty interaction with sys_umount(). If the accounting
* was suspeneded we failed to stop it on umount(). Messy.
* Another one: remount to readonly didn't stop accounting.
* Question: what should we do if we have CAP_SYS_ADMIN but not
@@ -40,7 +40,7 @@
* is one more bug... 10/11/98, AV.
*
* Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold
- * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks
+ * ->mmap_lock to walk the vma list of current->mm. Nasty, since it leaks
* a struct file opened for write. Fixed. 2/6/2000, AV.
*/
@@ -60,7 +60,6 @@
#include <linux/sched/cputime.h>
#include <asm/div64.h>
-#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>
@@ -263,12 +262,12 @@ static DEFINE_MUTEX(acct_on_mutex);
* sys_acct - enable/disable process accounting
* @name: file name for accounting records or NULL to shutdown accounting
*
- * Returns 0 for success or negative errno values for failure.
- *
* sys_acct() is the only system call needed to implement process
* accounting. It takes the name of the file where accounting records
* should be written. If the filename is NULL, accounting will be
* shutdown.
+ *
+ * Returns: 0 for success or negative errno values for failure.
*/
SYSCALL_DEFINE1(acct, const char __user *, name)
{
@@ -381,9 +380,7 @@ static comp2_t encode_comp2_t(u64 value)
return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1));
}
}
-#endif
-
-#if ACCT_VERSION == 3
+#elif ACCT_VERSION == 3
/*
* encode an u64 into a 32 bit IEEE float
*/
@@ -480,7 +477,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
/*
* Accounting records are not subject to resource limits.
*/
- flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+ flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
@@ -500,8 +497,7 @@ static void do_acct_process(struct bsd_acct_struct *acct)
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
-#endif
-#if ACCT_VERSION == 3
+#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
@@ -541,13 +537,13 @@ void acct_collect(long exitcode, int group_dead)
if (group_dead && current->mm) {
struct vm_area_struct *vma;
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
vma = current->mm->mmap;
while (vma) {
vsize += vma->vm_end - vma->vm_start;
vma = vma->vm_next;
}
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
}
spin_lock_irq(&current->sighand->siglock);
@@ -586,9 +582,7 @@ static void slow_acct_process(struct pid_namespace *ns)
}
/**
- * acct_process
- *
- * handles process accounting for an exiting task
+ * acct_process - handles process accounting for an exiting task
*/
void acct_process(void)
{
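As the sys_acct() kerneldoc touched above describes, the same system call both starts and stops BSD process accounting: a filename enables it, NULL shuts it down. A minimal userspace sketch of that usage follows (requires CAP_SYS_PACCT; the log path is only an example, not something defined by this patch).

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* non-NULL filename: start appending accounting records to it */
            if (acct("/var/log/account/pacct"))         /* example path only */
                    perror("acct(start)");

            /* ... from here on, exiting processes generate acct records ... */

            /* NULL filename: shut accounting down */
            if (acct(NULL))
                    perror("acct(stop)");

            return 0;
    }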
diff --git a/kernel/async.c b/kernel/async.c
index 4f9c1d614016..b8d7a663497f 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,6 +78,12 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
+static long long microseconds_since(ktime_t start)
+{
+ ktime_t now = ktime_get();
+ return ktime_to_ns(ktime_sub(now, start)) >> 10;
+}
+
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
struct async_entry *first = NULL;
@@ -111,24 +117,18 @@ static void async_run_entry_fn(struct work_struct *work)
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
- ktime_t uninitialized_var(calltime), delta, rettime;
+ ktime_t calltime;
/* 1) run (and print duration) */
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- pr_debug("calling %lli_%pS @ %i\n",
- (long long)entry->cookie,
- entry->func, task_pid_nr(current));
- calltime = ktime_get();
- }
+ pr_debug("calling %lli_%pS @ %i\n", (long long)entry->cookie,
+ entry->func, task_pid_nr(current));
+ calltime = ktime_get();
+
entry->func(entry->data, entry->cookie);
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- rettime = ktime_get();
- delta = ktime_sub(rettime, calltime);
- pr_debug("initcall %lli_%pS returned 0 after %lld usecs\n",
- (long long)entry->cookie,
- entry->func,
- (long long)ktime_to_ns(delta) >> 10);
- }
+
+ pr_debug("initcall %lli_%pS returned after %lld usecs\n",
+ (long long)entry->cookie, entry->func,
+ microseconds_since(calltime));
/* 2) remove self from the pending queues */
spin_lock_irqsave(&async_lock, flags);
@@ -246,24 +246,6 @@ void async_synchronize_full(void)
EXPORT_SYMBOL_GPL(async_synchronize_full);
/**
- * async_unregister_domain - ensure no more anonymous waiters on this domain
- * @domain: idle domain to flush out of any async_synchronize_full instances
- *
- * async_synchronize_{cookie|full}_domain() are not flushed since callers
- * of these routines should know the lifetime of @domain
- *
- * Prefer ASYNC_DOMAIN_EXCLUSIVE() declarations over flushing
- */
-void async_unregister_domain(struct async_domain *domain)
-{
- spin_lock_irq(&async_lock);
- WARN_ON(!domain->registered || !list_empty(&domain->pending));
- domain->registered = 0;
- spin_unlock_irq(&async_lock);
-}
-EXPORT_SYMBOL_GPL(async_unregister_domain);
-
-/**
* async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
* @domain: the domain to synchronize
*
@@ -287,23 +269,15 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
*/
void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain)
{
- ktime_t uninitialized_var(starttime), delta, endtime;
+ ktime_t starttime;
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- pr_debug("async_waiting @ %i\n", task_pid_nr(current));
- starttime = ktime_get();
- }
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
+ starttime = ktime_get();
wait_event(async_done, lowest_in_progress(domain) >= cookie);
- if (initcall_debug && system_state < SYSTEM_RUNNING) {
- endtime = ktime_get();
- delta = ktime_sub(endtime, starttime);
-
- pr_debug("async_continuing @ %i after %lli usec\n",
- task_pid_nr(current),
- (long long)ktime_to_ns(delta) >> 10);
- }
+ pr_debug("async_continuing @ %i after %lli usec\n", task_pid_nr(current),
+ microseconds_since(starttime));
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
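A side note on the new microseconds_since() helper above: like the open-coded initcall_debug prints it replaces, it converts a nanosecond delta to "usecs" with a right shift by 10, i.e. division by 1024 rather than 1000, so the reported figure is roughly 2.3% low. Equivalent standalone arithmetic, for reference only:

    /* ns >> 10 is ns / 1024: a cheap stand-in for ns / 1000, ~2.3% low */
    static long long approx_usecs(long long delta_ns)
    {
            return delta_ns >> 10;
    }

    static long long exact_usecs(long long delta_ns)
    {
            return delta_ns / 1000;         /* what an exact conversion would be */
    }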
diff --git a/kernel/audit.c b/kernel/audit.c
index 9ddfe2aa6671..e4bbe2c70c26 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -67,7 +67,7 @@
#define AUDIT_DISABLED -1
#define AUDIT_UNINITIALIZED 0
#define AUDIT_INITIALIZED 1
-static int audit_initialized;
+static int audit_initialized = AUDIT_UNINITIALIZED;
u32 audit_enabled = AUDIT_OFF;
bool audit_ever_enabled = !!AUDIT_OFF;
@@ -123,9 +123,9 @@ static u32 audit_backlog_limit = 64;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
-kuid_t audit_sig_uid = INVALID_UID;
-pid_t audit_sig_pid = -1;
-u32 audit_sig_sid = 0;
+static kuid_t audit_sig_uid = INVALID_UID;
+static pid_t audit_sig_pid = -1;
+static u32 audit_sig_sid;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -136,6 +136,11 @@ u32 audit_sig_sid = 0;
*/
static atomic_t audit_lost = ATOMIC_INIT(0);
+/* Monotonically increasing sum of time the kernel has spent
+ * waiting while the backlog limit is exceeded.
+ */
+static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);
+
/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -518,7 +523,7 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net)
}
/**
- * kauditd_print_skb - Print the audit record to the ring buffer
+ * kauditd_printk_skb - Print the audit record to the ring buffer
* @skb: audit record
*
* Whatever the reason, this packet may not make it to the auditd connection
@@ -713,7 +718,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
{
int rc = 0;
struct sk_buff *skb;
- static unsigned int failed = 0;
+ unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
@@ -730,32 +735,30 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
continue;
}
+retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
- /* fatal failure for our queue flush attempt? */
+ /* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
- /* yes - error processing for the queue */
sk = NULL;
if (err_hook)
(*err_hook)(skb);
- if (!skb_hook)
- goto out;
- /* keep processing with the skb_hook */
+ if (rc == -EAGAIN)
+ rc = 0;
+ /* continue to drain the queue */
continue;
} else
- /* no - requeue to preserve ordering */
- skb_queue_head(queue, skb);
+ goto retry;
} else {
- /* it worked - drop the extra reference and continue */
+ /* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
-out:
return (rc >= 0 ? 0 : rc);
}
@@ -880,7 +883,7 @@ main_queue:
return 0;
}
-int audit_send_list(void *_dest)
+int audit_send_list_thread(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
@@ -924,19 +927,29 @@ out_kfree_skb:
return NULL;
}
+static void audit_free_reply(struct audit_reply *reply)
+{
+ if (!reply)
+ return;
+
+ kfree_skb(reply->skb);
+ if (reply->net)
+ put_net(reply->net);
+ kfree(reply);
+}
+
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct sock *sk = audit_get_sk(reply->net);
audit_ctl_lock();
audit_ctl_unlock();
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
- netlink_unicast(sk, reply->skb, reply->portid, 0);
- put_net(reply->net);
- kfree(reply);
+ netlink_unicast(audit_get_sk(reply->net), reply->skb, reply->portid, 0);
+ reply->skb = NULL;
+ audit_free_reply(reply);
return 0;
}
@@ -950,35 +963,32 @@ static int audit_send_reply_thread(void *arg)
* @payload: payload data
* @size: payload size
*
- * Allocates an skb, builds the netlink message, and sends it to the port id.
- * No failure notifications.
+ * Allocates a skb, builds the netlink message, and sends it to the port id.
*/
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
- struct sk_buff *skb;
struct task_struct *tsk;
- struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
- GFP_KERNEL);
+ struct audit_reply *reply;
+ reply = kzalloc(sizeof(*reply), GFP_KERNEL);
if (!reply)
return;
- skb = audit_make_reply(seq, type, done, multi, payload, size);
- if (!skb)
- goto out;
-
- reply->net = get_net(net);
+ reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
+ if (!reply->skb)
+ goto err;
+ reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
reply->portid = NETLINK_CB(request_skb).portid;
- reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
- if (!IS_ERR(tsk))
- return;
- kfree_skb(skb);
-out:
- kfree(reply);
+ if (IS_ERR(tsk))
+ goto err;
+
+ return;
+
+err:
+ audit_free_reply(reply);
}
/*
@@ -1193,17 +1203,18 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_GET: {
struct audit_status s;
memset(&s, 0, sizeof(s));
- s.enabled = audit_enabled;
- s.failure = audit_failure;
+ s.enabled = audit_enabled;
+ s.failure = audit_failure;
/* NOTE: use pid_vnr() so the PID is relative to the current
* namespace */
- s.pid = auditd_pid_vnr();
- s.rate_limit = audit_rate_limit;
- s.backlog_limit = audit_backlog_limit;
- s.lost = atomic_read(&audit_lost);
- s.backlog = skb_queue_len(&audit_queue);
- s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
- s.backlog_wait_time = audit_backlog_wait_time;
+ s.pid = auditd_pid_vnr();
+ s.rate_limit = audit_rate_limit;
+ s.backlog_limit = audit_backlog_limit;
+ s.lost = atomic_read(&audit_lost);
+ s.backlog = skb_queue_len(&audit_queue);
+ s.feature_bitmap = AUDIT_FEATURE_BITMAP_ALL;
+ s.backlog_wait_time = audit_backlog_wait_time;
+ s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
@@ -1307,6 +1318,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_config_change("lost", 0, lost, 1);
return lost;
}
+ if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
+ u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);
+
+ audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
+ return actual;
+ }
break;
}
case AUDIT_GET_FEATURE:
@@ -1326,6 +1343,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
+ /* exit early if there isn't at least one character to print */
+ if (data_len < 2)
+ return -EINVAL;
err = audit_filter(msg_type, AUDIT_FILTER_USER);
if (err == 1) { /* match or error */
@@ -1424,7 +1444,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
}
- sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
@@ -1437,7 +1457,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, sizeof(*sig_data) + len);
+ sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -1520,22 +1540,76 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
+
+ /* can't block with the ctrl lock, so penalize the sender now */
+ if (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
+
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(audit_backlog_wait_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
+}
+
+/* Log information about who is connecting to the audit multicast socket */
+static void audit_log_multicast(int group, const char *op, int err)
+{
+ const struct cred *cred;
+ struct tty_struct *tty;
+ char comm[sizeof(current->comm)];
+ struct audit_buffer *ab;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER);
+ if (!ab)
+ return;
+
+ cred = current_cred();
+ tty = audit_get_tty();
+ audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u",
+ task_pid_nr(current),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ tty ? tty_name(tty) : "(none)",
+ audit_get_sessionid(current));
+ audit_put_tty(tty);
+ audit_log_task_context(ab); /* subj= */
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_d_path_exe(ab, current->mm); /* exe= */
+ audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err);
+ audit_log_end(ab);
}
/* Run custom bind function on netlink socket group connect or bind requests. */
-static int audit_bind(struct net *net, int group)
+static int audit_multicast_bind(struct net *net, int group)
{
+ int err = 0;
+
if (!capable(CAP_AUDIT_READ))
- return -EPERM;
+ err = -EPERM;
+ audit_log_multicast(group, "connect", err);
+ return err;
+}
- return 0;
+static void audit_multicast_unbind(struct net *net, int group)
+{
+ audit_log_multicast(group, "disconnect", 0);
}
static int __net_init audit_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = audit_receive,
- .bind = audit_bind,
+ .bind = audit_multicast_bind,
+ .unbind = audit_multicast_unbind,
.flags = NL_CFG_F_NONROOT_RECV,
.groups = AUDIT_NLGRP_MAX,
};
@@ -1547,7 +1621,8 @@ static int __net_init audit_net_init(struct net *net)
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ /* limit the timeout in case auditd is blocked/stopped */
+ aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
@@ -1717,7 +1792,7 @@ unsigned int audit_serial(void)
{
static atomic_t serial = ATOMIC_INIT(0);
- return atomic_add_return(1, &serial);
+ return atomic_inc_return(&serial);
}
static inline void audit_get_stamp(struct audit_context *ctx,
@@ -1749,7 +1824,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
{
struct audit_buffer *ab;
struct timespec64 t;
- unsigned int uninitialized_var(serial);
+ unsigned int serial;
if (audit_initialized != AUDIT_INITIALIZED)
return NULL;
@@ -1763,7 +1838,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
- * while holding the mutex */
+ * while holding the mutex, although we do penalize the sender
+ * later in audit_receive() when it is safe to block
+ */
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
@@ -1775,12 +1852,15 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
/* sleep if we are allowed and we haven't exhausted our
* backlog wait limit */
if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
+ long rtime = stime;
+
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&audit_backlog_wait,
&wait);
set_current_state(TASK_UNINTERRUPTIBLE);
- stime = schedule_timeout(stime);
+ stime = schedule_timeout(rtime);
+ atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
remove_wait_queue(&audit_backlog_wait, &wait);
} else {
if (audit_rate_check() && printk_ratelimit())
@@ -1800,6 +1880,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
}
audit_get_stamp(ab->ctx, &t, &serial);
+ /* cancel dummy context to enable supporting records */
+ if (ctx)
+ ctx->dummy = 0;
audit_log_format(ab, "audit(%llu.%03lu:%u): ",
(unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);
@@ -2028,13 +2111,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
- audit_log_string(ab, "<no_memory>");
+ audit_log_format(ab, "\"<no_memory>\"");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
- audit_log_string(ab, "<too_long>");
+ audit_log_format(ab, "\"<too_long>\"");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);
@@ -2064,7 +2147,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
- security_task_getsecid(current, &sid);
+ security_current_getsecid_subj(&sid);
if (!sid)
return 0;
@@ -2217,7 +2300,7 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
uid = from_kuid(&init_user_ns, task_uid(current));
oldloginuid = from_kuid(&init_user_ns, koldloginuid);
- loginuid = from_kuid(&init_user_ns, kloginuid),
+ loginuid = from_kuid(&init_user_ns, kloginuid);
tty = audit_get_tty();
audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
@@ -2285,7 +2368,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
- security_task_getsecid(current, &audit_sig_sid);
+ security_current_getsecid_subj(&audit_sig_sid);
}
return audit_signal_info_syscall(t);
@@ -2297,7 +2380,7 @@ int audit_signal_info(int sig, struct task_struct *t)
*
* We can not do a netlink send inside an irq context because it blocks (last
* arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
- * queue and a tasklet is scheduled to remove them from the queue outside the
+ * queue and a kthread is scheduled to remove them from the queue outside the
* irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
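The AUDIT_SIGNAL_INFO hunk above swaps the open-coded sizeof(*sig_data) + len for struct_size(). For a struct with a trailing flexible array, struct_size(p, member, n) computes sizeof(*p) + n * sizeof(*p->member) with the arithmetic saturating on overflow, so an oversized length cannot wrap the allocation size. A minimal sketch with a hypothetical struct (not the real audit_sig_info definition):

    #include <linux/overflow.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct demo_sig_info {                  /* hypothetical stand-in for audit_sig_info */
            u32 uid;
            char ctx[];                     /* trailing flexible array */
    };

    static struct demo_sig_info *demo_alloc(size_t ctx_len, gfp_t gfp)
    {
            struct demo_sig_info *info;

            /* same size as sizeof(*info) + ctx_len, but the arithmetic
             * saturates to SIZE_MAX on overflow so kmalloc() simply fails */
            info = kmalloc(struct_size(info, ctx, ctx_len), gfp);
            return info;
    }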
diff --git a/kernel/audit.h b/kernel/audit.h
index 6fb7160412d4..c4498090a5bd 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -1,16 +1,20 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* audit -- definition of audit_context structure and supporting types
+/* audit -- definition of audit_context structure and supporting types
*
* Copyright 2003-2004 Red Hat, Inc.
* Copyright 2005 Hewlett-Packard Development Company, L.P.
* Copyright 2005 IBM Corporation
*/
+#ifndef _KERNEL_AUDIT_H_
+#define _KERNEL_AUDIT_H_
+
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
+#include <uapi/linux/openat2.h> // struct open_how
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
@@ -21,16 +25,16 @@
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
enum audit_state {
- AUDIT_DISABLED, /* Do not create per-task audit_context.
+ AUDIT_STATE_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
- AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
+ AUDIT_STATE_BUILD, /* Create the per-task audit_context,
* and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
* should be recorded. */
- AUDIT_RECORD_CONTEXT /* Create the per-task audit_context,
+ AUDIT_STATE_RECORD /* Create the per-task audit_context,
* always fill it in at syscall entry
* time, and always write out the audit
* record at syscall exit time. */
@@ -97,10 +101,15 @@ struct audit_proctitle {
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
+ enum {
+ AUDIT_CTX_UNUSED, /* audit_context is currently unused */
+ AUDIT_CTX_SYSCALL, /* in use by syscall */
+ AUDIT_CTX_URING, /* in use by io_uring */
+ } context;
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
+ int uring_op; /* uring operation */
struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
@@ -185,6 +194,7 @@ struct audit_context {
int fd;
int flags;
} mmap;
+ struct open_how openat2;
struct {
int argc;
} execve;
@@ -229,7 +239,7 @@ struct audit_netlink_list {
struct sk_buff_head q;
};
-int audit_send_list(void *_dest);
+int audit_send_list_thread(void *_dest);
extern int selinux_audit_rule_update(void);
@@ -292,8 +302,8 @@ extern void audit_filter_inodes(struct task_struct *tsk,
extern struct list_head *audit_killed_trees(void);
#else /* CONFIG_AUDITSYSCALL */
#define auditsc_get_stamp(c, t, s) 0
-#define audit_put_watch(w) {}
-#define audit_get_watch(w) {}
+#define audit_put_watch(w) do { } while (0)
+#define audit_get_watch(w) do { } while (0)
#define audit_to_watch(k, p, l, o) (-EINVAL)
#define audit_add_watch(k, l) (-EINVAL)
#define audit_remove_watch_rule(k) BUG()
@@ -302,8 +312,8 @@ extern struct list_head *audit_killed_trees(void);
#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
#define audit_mark_path(m) ""
-#define audit_remove_mark(m)
-#define audit_remove_mark_rule(k)
+#define audit_remove_mark(m) do { } while (0)
+#define audit_remove_mark_rule(k) do { } while (0)
#define audit_mark_compare(m, i, d) 0
#define audit_exe_compare(t, m) (-EINVAL)
#define audit_dupe_exe(n, o) (-EINVAL)
@@ -311,8 +321,8 @@ extern struct list_head *audit_killed_trees(void);
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
#define audit_make_tree(rule, str, op) -EINVAL
-#define audit_trim_trees() (void)0
-#define audit_put_tree(tree) (void)0
+#define audit_trim_trees() do { } while (0)
+#define audit_put_tree(tree) do { } while (0)
#define audit_tag_tree(old, new) -EINVAL
#define audit_tree_path(rule) "" /* never called */
#define audit_kill_trees(context) BUG()
@@ -322,16 +332,14 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
return 0;
}
-#define audit_filter_inodes(t, c) AUDIT_DISABLED
+#define audit_filter_inodes(t, c) AUDIT_STATE_DISABLED
#endif /* CONFIG_AUDITSYSCALL */
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
-extern pid_t audit_sig_pid;
-extern kuid_t audit_sig_uid;
-extern u32 audit_sig_sid;
-
extern int audit_filter(int msgtype, unsigned int listtype);
extern void audit_ctl_lock(void);
extern void audit_ctl_unlock(void);
+
+#endif
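The audit.h hunks above convert the empty stub macros from "{}" to "do { } while (0)". The reason is the usual one: the do/while form swallows the trailing semicolon and so behaves like a single statement inside an unbraced if/else, whereas a bare "{};" splits the if from its else. Illustrative example only (names made up):

    #include <stdio.h>

    #define stub_brace()    {}                  /* old style */
    #define stub_dowhile()  do { } while (0)    /* new style */

    void demo(int cond)
    {
            if (cond)
                    stub_dowhile();             /* expands to one statement, so the else binds */
            else
                    puts("cond was false");

    #if 0   /* would not compile: "{};" terminates the if before the else */
            if (cond)
                    stub_brace();
            else
                    puts("cond was false");
    #endif
    }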
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index f0d243318452..02348b48447c 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -36,7 +36,7 @@ static struct fsnotify_group *audit_fsnotify_group;
/* fsnotify events we care about. */
#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF)
static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
{
@@ -84,7 +84,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
dentry = kern_path_locked(pathname, &path);
if (IS_ERR(dentry))
- return (void *)dentry; /* returning an error */
+ return ERR_CAST(dentry); /* returning an error */
inode = path.dentry->d_inode;
inode_unlock(inode);
@@ -152,44 +152,30 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
}
/* Update mark data in audit rules based on fsnotify events. */
-static int audit_mark_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *dname, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname, u32 cookie)
{
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
struct audit_fsnotify_mark *audit_mark;
- const struct inode *inode = NULL;
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
- BUG_ON(group != audit_fsnotify_group);
-
- switch (data_type) {
- case (FSNOTIFY_EVENT_PATH):
- inode = ((const struct path *)data)->dentry->d_inode;
- break;
- case (FSNOTIFY_EVENT_INODE):
- inode = (const struct inode *)data;
- break;
- default:
- BUG();
+ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
return 0;
- }
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
return 0;
audit_update_mark(audit_mark, inode);
- } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
+ } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) {
audit_autoremove_mark_rule(audit_mark);
+ }
return 0;
}
static const struct fsnotify_ops audit_mark_fsnotify_ops = {
- .handle_event = audit_mark_handle_event,
+ .handle_inode_event = audit_mark_handle_event,
.free_mark = audit_fsnotify_free_mark,
};
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e49c912f862d..e7315d487163 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -30,7 +30,7 @@ struct audit_chunk {
int count;
atomic_long_t refs;
struct rcu_head head;
- struct node {
+ struct audit_node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
@@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
- tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
+ tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -188,11 +188,9 @@ static struct fsnotify_mark *alloc_mark(void)
static struct audit_chunk *alloc_chunk(int count)
{
struct audit_chunk *chunk;
- size_t size;
int i;
- size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node);
- chunk = kzalloc(size, GFP_KERNEL);
+ chunk = kzalloc(struct_size(chunk, owners, count), GFP_KERNEL);
if (!chunk)
return NULL;
@@ -271,7 +269,7 @@ bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
/* tagging and untagging inodes with trees */
-static struct audit_chunk *find_chunk(struct node *p)
+static struct audit_chunk *find_chunk(struct audit_node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
@@ -324,7 +322,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
list_replace_rcu(&old->hash, &new->hash);
}
-static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
+static void remove_chunk_node(struct audit_chunk *chunk, struct audit_node *p)
{
struct audit_tree *owner = p->owner;
@@ -461,7 +459,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
- struct node *p;
+ struct audit_node *p;
int n;
mutex_lock(&audit_tree_group->mark_mutex);
@@ -572,11 +570,11 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
- struct node *p;
+ struct audit_node *p;
struct audit_chunk *chunk;
struct fsnotify_mark *mark;
- p = list_first_entry(&victim->chunks, struct node, list);
+ p = list_first_entry(&victim->chunks, struct audit_node, list);
/* have we run out of marked? */
if (tagged && !(p->index & (1U<<31)))
break;
@@ -595,7 +593,6 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
spin_lock(&hash_lock);
}
spin_unlock(&hash_lock);
- put_tree(victim);
}
/*
@@ -604,6 +601,7 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
static void prune_one(struct audit_tree *victim)
{
prune_tree_chunks(victim, false);
+ put_tree(victim);
}
/* trim the uncommitted chunks from tree */
@@ -618,7 +616,7 @@ static void trim_marked(struct audit_tree *tree)
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
- struct node *node = list_entry(p, struct node, list);
+ struct audit_node *node = list_entry(p, struct audit_node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
@@ -686,13 +684,12 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
- struct node *node;
+ struct audit_node *node;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
- list_del(&cursor);
- list_add(&cursor, &tree->list);
+ list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path);
@@ -729,7 +726,8 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
- rule->listnr != AUDIT_FILTER_EXIT ||
+ (rule->listnr != AUDIT_FILTER_EXIT &&
+ rule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
@@ -842,7 +840,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
drop_collected_mounts(mnt);
if (!err) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -901,8 +899,7 @@ int audit_tag_tree(char *old, char *new)
tree = container_of(cursor.next, struct audit_tree, list);
get_tree(tree);
- list_del(&cursor);
- list_add(&cursor, &tree->list);
+ list_move(&cursor, &tree->list);
mutex_unlock(&audit_filter_mutex);
err = kern_path(tree->pathname, 0, &path2);
@@ -927,8 +924,7 @@ int audit_tag_tree(char *old, char *new)
mutex_lock(&audit_filter_mutex);
spin_lock(&hash_lock);
if (!tree->goner) {
- list_del(&tree->list);
- list_add(&tree->list, &tree_list);
+ list_move(&tree->list, &tree_list);
}
spin_unlock(&hash_lock);
put_tree(tree);
@@ -939,12 +935,11 @@ int audit_tag_tree(char *old, char *new)
tree = container_of(barrier.prev, struct audit_tree, list);
get_tree(tree);
- list_del(&tree->list);
- list_add(&tree->list, &barrier);
+ list_move(&tree->list, &barrier);
mutex_unlock(&audit_filter_mutex);
if (!failed) {
- struct node *node;
+ struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -1037,11 +1032,9 @@ static void evict_chunk(struct audit_chunk *chunk)
audit_schedule_prune();
}
-static int audit_tree_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *file_name, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *file_name, u32 cookie)
{
return 0;
}
@@ -1070,7 +1063,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *mark,
}
static const struct fsnotify_ops audit_tree_ops = {
- .handle_event = audit_tree_handle_event,
+ .handle_inode_event = audit_tree_handle_event,
.freeing_mark = audit_tree_freeing_mark,
.free_mark = audit_tree_destroy_watch,
};
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 4508d5e0cf69..713b256be944 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -53,7 +53,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
+ FS_MOVE_SELF | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -183,7 +183,8 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
- krule->listnr != AUDIT_FILTER_EXIT ||
+ (krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
@@ -302,8 +303,6 @@ static void audit_update_watch(struct audit_parent *parent,
if (oentry->rule.exe)
audit_remove_mark(oentry->rule.exe);
- audit_watch_log_rule_change(r, owatch, "updated_rules");
-
call_rcu(&oentry->rcu, audit_free_rule_rcu);
}
@@ -466,32 +465,16 @@ void audit_remove_watch_rule(struct audit_krule *krule)
}
/* Update watch data in audit rules based on fsnotify events. */
-static int audit_watch_handle_event(struct fsnotify_group *group,
- struct inode *to_tell,
- u32 mask, const void *data, int data_type,
- const struct qstr *dname, u32 cookie,
- struct fsnotify_iter_info *iter_info)
+static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *dname, u32 cookie)
{
- struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
- const struct inode *inode;
struct audit_parent *parent;
parent = container_of(inode_mark, struct audit_parent, mark);
- BUG_ON(group != audit_watch_group);
-
- switch (data_type) {
- case (FSNOTIFY_EVENT_PATH):
- inode = d_backing_inode(((const struct path *)data)->dentry);
- break;
- case (FSNOTIFY_EVENT_INODE):
- inode = (const struct inode *)data;
- break;
- default:
- BUG();
- inode = NULL;
- break;
- }
+ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
+ return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
@@ -504,7 +487,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
}
static const struct fsnotify_ops audit_watch_fsnotify_ops = {
- .handle_event = audit_watch_handle_event,
+ .handle_inode_event = audit_watch_handle_event,
.free_mark = audit_watch_free_mark,
};
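Both audit backends switch from the multi-mark handle_event() callback to the narrower handle_inode_event(): the mark is passed in directly, so the handler can recover its private state with container_of() and the fsnotify iterator helpers and data_type switch are no longer needed. A stripped-down sketch of the shape, with a hypothetical my_watcher type and the event handling elided:

#include <linux/fsnotify_backend.h>

struct my_watcher {
    struct fsnotify_mark mark;
    /* backend-private state ... */
};

static int my_handle_inode_event(struct fsnotify_mark *mark, u32 mask,
                                 struct inode *inode, struct inode *dir,
                                 const struct qstr *file_name, u32 cookie)
{
    struct my_watcher *w = container_of(mark, struct my_watcher, mark);

    /* react to mask bits (FS_CREATE, FS_DELETE, ...) using w and inode */
    return 0;
}

static const struct fsnotify_ops my_fsnotify_ops = {
    .handle_inode_event = my_handle_inode_event,
};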
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 026e34da4ace..42d99896e7a6 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -44,7 +44,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
LIST_HEAD_INIT(audit_filter_list[6]),
-#if AUDIT_NR_FILTERS != 7
+ LIST_HEAD_INIT(audit_filter_list[7]),
+#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
@@ -56,6 +57,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
LIST_HEAD_INIT(audit_rules_list[6]),
+ LIST_HEAD_INIT(audit_rules_list[7]),
};
DEFINE_MUTEX(audit_filter_mutex);
@@ -151,7 +153,8 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
- if (krule->listnr != AUDIT_FILTER_EXIT ||
+ if ((krule->listnr != AUDIT_FILTER_EXIT &&
+ krule->listnr != AUDIT_FILTER_URING_EXIT) ||
krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
@@ -248,6 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
goto exit_err;
case AUDIT_FILTER_EXIT:
+ case AUDIT_FILTER_URING_EXIT:
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
@@ -332,6 +336,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (entry->rule.listnr != AUDIT_FILTER_FS)
return -EINVAL;
break;
+ case AUDIT_PERM:
+ if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
+ return -EINVAL;
+ break;
}
switch (entry->rule.listnr) {
@@ -629,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
- data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
+ data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
@@ -681,7 +689,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->values[i] = AUDIT_UID_UNSET;
break;
}
- /* fall through - if set */
+ fallthrough; /* if set */
default:
data->values[i] = f->val;
}
@@ -980,7 +988,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
}
entry->rule.prio = ~0ULL;
- if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
+ if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
+ entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
if (entry->rule.flags & AUDIT_FILTER_PREPEND)
entry->rule.prio = ++prio_high;
else
@@ -1083,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
- sizeof(*data) + data->buflen);
+ struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
@@ -1161,11 +1170,8 @@ int audit_rule_change(int type, int seq, void *data, size_t datasz)
*/
int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
- u32 portid = NETLINK_CB(request_skb).portid;
- struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
- int err = 0;
/* We can't just spew out the rules here because we might fill
* the available socket buffer space and deadlock waiting for
@@ -1173,25 +1179,26 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq)
* happen if we're actually running in the context of auditctl
* trying to _send_ the stuff */
- dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
+ dest = kmalloc(sizeof(*dest), GFP_KERNEL);
if (!dest)
return -ENOMEM;
- dest->net = get_net(net);
- dest->portid = portid;
+ dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
+ dest->portid = NETLINK_CB(request_skb).portid;
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
audit_list_rules(seq, &dest->q);
mutex_unlock(&audit_filter_mutex);
- tsk = kthread_run(audit_send_list, dest, "audit_send_list");
+ tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list");
if (IS_ERR(tsk)) {
skb_queue_purge(&dest->q);
+ put_net(dest->net);
kfree(dest);
- err = PTR_ERR(tsk);
+ return PTR_ERR(tsk);
}
- return err;
+ return 0;
}
int audit_comparator(u32 left, u32 op, u32 right)
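The reworked tail of audit_list_rules_send() follows the usual kthread_run() idiom: the macro returns either a valid task_struct or an ERR_PTR(), so on failure the caller must release everything it had already handed to the would-be thread (the queued skbs, the net reference, the allocation) before returning PTR_ERR(). A generic sketch of that pattern with a hypothetical worker and payload:

#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/slab.h>

struct payload {
    int data;    /* resources owned by the worker once it starts */
};

static int my_worker(void *arg)
{
    struct payload *p = arg;

    /* ... consume p ... */
    kfree(p);
    return 0;
}

static int start_worker(void)
{
    struct task_struct *tsk;
    struct payload *p;

    p = kmalloc(sizeof(*p), GFP_KERNEL);
    if (!p)
        return -ENOMEM;

    tsk = kthread_run(my_worker, p, "my_worker");
    if (IS_ERR(tsk)) {
        kfree(p);    /* the worker never ran, so clean up here */
        return PTR_ERR(tsk);
    }
    return 0;
}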
@@ -1361,7 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
- security_task_getsecid(current, &sid);
+ security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4effe01ebbe2..fce5d43a933f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* auditsc.c -- System-call auditing support
* Handles all system-call specific auditing features.
*
@@ -6,20 +7,6 @@
* Copyright (C) 2005, 2006 IBM Corporation
* All Rights Reserved.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Many of the ideas implemented here are from Stephen C. Tweedie,
@@ -75,6 +62,8 @@
#include <linux/uaccess.h>
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
+#include <uapi/linux/netfilter/nf_tables.h>
+#include <uapi/linux/openat2.h> // struct open_how
#include "audit.h"
@@ -101,8 +90,6 @@ struct audit_aux_data {
int type;
};
-#define AUDIT_AUX_IPCPERM 0
-
/* Number of target pids per aux struct. */
#define AUDIT_AUX_PIDS 16
@@ -130,15 +117,44 @@ struct audit_tree_refs {
struct audit_chunk *c[31];
};
+struct audit_nfcfgop_tab {
+ enum audit_nfcfgop op;
+ const char *s;
+};
+
+static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
+ { AUDIT_XT_OP_REGISTER, "xt_register" },
+ { AUDIT_XT_OP_REPLACE, "xt_replace" },
+ { AUDIT_XT_OP_UNREGISTER, "xt_unregister" },
+ { AUDIT_NFT_OP_TABLE_REGISTER, "nft_register_table" },
+ { AUDIT_NFT_OP_TABLE_UNREGISTER, "nft_unregister_table" },
+ { AUDIT_NFT_OP_CHAIN_REGISTER, "nft_register_chain" },
+ { AUDIT_NFT_OP_CHAIN_UNREGISTER, "nft_unregister_chain" },
+ { AUDIT_NFT_OP_RULE_REGISTER, "nft_register_rule" },
+ { AUDIT_NFT_OP_RULE_UNREGISTER, "nft_unregister_rule" },
+ { AUDIT_NFT_OP_SET_REGISTER, "nft_register_set" },
+ { AUDIT_NFT_OP_SET_UNREGISTER, "nft_unregister_set" },
+ { AUDIT_NFT_OP_SETELEM_REGISTER, "nft_register_setelem" },
+ { AUDIT_NFT_OP_SETELEM_UNREGISTER, "nft_unregister_setelem" },
+ { AUDIT_NFT_OP_GEN_REGISTER, "nft_register_gen" },
+ { AUDIT_NFT_OP_OBJ_REGISTER, "nft_register_obj" },
+ { AUDIT_NFT_OP_OBJ_UNREGISTER, "nft_unregister_obj" },
+ { AUDIT_NFT_OP_OBJ_RESET, "nft_reset_obj" },
+ { AUDIT_NFT_OP_FLOWTABLE_REGISTER, "nft_register_flowtable" },
+ { AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, "nft_unregister_flowtable" },
+ { AUDIT_NFT_OP_INVALID, "nft_invalid" },
+};
+
static int audit_match_perm(struct audit_context *ctx, int mask)
{
unsigned n;
+
if (unlikely(!ctx))
return 0;
n = ctx->major;
switch (audit_classify_syscall(ctx->arch, n)) {
- case 0: /* native */
+ case AUDITSC_NATIVE:
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE, n))
return 1;
@@ -149,7 +165,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR, n))
return 1;
return 0;
- case 1: /* 32bit on biarch */
+ case AUDITSC_COMPAT: /* 32bit on biarch */
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE_32, n))
return 1;
@@ -160,14 +176,16 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR_32, n))
return 1;
return 0;
- case 2: /* open */
+ case AUDITSC_OPEN:
return mask & ACC_MODE(ctx->argv[1]);
- case 3: /* openat */
+ case AUDITSC_OPENAT:
return mask & ACC_MODE(ctx->argv[2]);
- case 4: /* socketcall */
+ case AUDITSC_SOCKETCALL:
return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
- case 5: /* execve */
+ case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
+ case AUDITSC_OPENAT2:
+ return mask & ACC_MODE((u32)((struct open_how *)ctx->argv[2])->flags);
default:
return 0;
}
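The open/openat/openat2 cases rely on ACC_MODE() from <linux/fs.h>, which maps the access-mode bits of the open flags onto MAY_READ/MAY_WRITE; those values happen to line up with AUDIT_PERM_READ/AUDIT_PERM_WRITE from uapi/linux/audit.h, which is why a plain bitwise AND against the rule mask works. A comment-annotated sketch (the numeric values are taken from the mainline headers and should be treated as illustrative):

#include <linux/fs.h>       /* ACC_MODE(), O_ACCMODE */
#include <linux/types.h>

/*
 * ACC_MODE(flags) looks only at flags & O_ACCMODE:
 *   O_RDONLY -> MAY_READ             (4) == AUDIT_PERM_READ
 *   O_WRONLY -> MAY_WRITE            (2) == AUDIT_PERM_WRITE
 *   O_RDWR   -> MAY_READ | MAY_WRITE (6)
 * so the AND below is non-zero whenever the rule's read/write bits
 * overlap the requested access mode.
 */
static bool open_matches_perm_mask(int mask, unsigned int open_flags)
{
    return mask & ACC_MODE(open_flags);
}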
@@ -204,7 +222,7 @@ static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
+ ctx->current_state = AUDIT_STATE_RECORD;
}
}
@@ -212,6 +230,7 @@ static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
int left = ctx->tree_count;
+
if (likely(left)) {
p->c[--left] = chunk;
ctx->tree_count = left;
@@ -232,6 +251,7 @@ static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
static int grow_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p = ctx->trees;
+
ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL);
if (!ctx->trees) {
ctx->trees = p;
@@ -250,6 +270,7 @@ static void unroll_tree_refs(struct audit_context *ctx,
{
struct audit_tree_refs *q;
int n;
+
if (!p) {
/* we started with empty chain */
p = ctx->first_trees;
@@ -276,6 +297,7 @@ static void unroll_tree_refs(struct audit_context *ctx,
static void free_tree_refs(struct audit_context *ctx)
{
struct audit_tree_refs *p, *q;
+
for (p = ctx->first_trees; p; p = q) {
q = p->next;
kfree(p);
@@ -286,6 +308,7 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
{
struct audit_tree_refs *p;
int n;
+
if (!tree)
return 0;
/* full ones */
@@ -310,13 +333,13 @@ static int audit_compare_uid(kuid_t uid,
{
struct audit_names *n;
int rc;
-
+
if (name) {
rc = audit_uid_comparator(uid, f->op, name->uid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_uid_comparator(uid, f->op, n->uid);
@@ -334,13 +357,13 @@ static int audit_compare_gid(kgid_t gid,
{
struct audit_names *n;
int rc;
-
+
if (name) {
rc = audit_gid_comparator(gid, f->op, name->gid);
if (rc)
return rc;
}
-
+
if (ctx) {
list_for_each_entry(n, &ctx->names_list, list) {
rc = audit_gid_comparator(gid, f->op, n->gid);
@@ -447,6 +470,9 @@ static int audit_filter_rules(struct task_struct *tsk,
u32 sid;
unsigned int sessionid;
+ if (ctx && rule->prio <= ctx->prio)
+ return 0;
+
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
for (i = 0; i < rule->field_count; i++) {
@@ -523,11 +549,11 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_EXIT:
- if (ctx && ctx->return_valid)
+ if (ctx && ctx->return_valid != AUDITSC_INVALID)
result = audit_comparator(ctx->return_code, f->op, f->val);
break;
case AUDIT_SUCCESS:
- if (ctx && ctx->return_valid) {
+ if (ctx && ctx->return_valid != AUDITSC_INVALID) {
if (f->val)
result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS);
else
@@ -624,7 +650,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SADDR_FAM:
- if (ctx->sockaddr)
+ if (ctx && ctx->sockaddr)
result = audit_comparator(ctx->sockaddr->ss_family,
f->op, f->val);
break;
@@ -640,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
- security_task_getsecid(tsk, &sid);
+ /* @tsk should always be equal to
+ * @current with the exception of
+ * fork()/copy_process() in which case
+ * the new @tsk creds are still a dup
+ * of @current's creds so we can still
+ * use security_current_getsecid_subj()
+ * here even though it always refs
+ * @current's creds
+ */
+ security_current_getsecid_subj(&sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
@@ -714,8 +749,6 @@ static int audit_filter_rules(struct task_struct *tsk,
}
if (ctx) {
- if (rule->prio <= ctx->prio)
- return 0;
if (rule->filterkey) {
kfree(ctx->filterkey);
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@@ -724,10 +757,10 @@ static int audit_filter_rules(struct task_struct *tsk,
}
switch (rule->action) {
case AUDIT_NEVER:
- *state = AUDIT_DISABLED;
+ *state = AUDIT_STATE_DISABLED;
break;
case AUDIT_ALWAYS:
- *state = AUDIT_RECORD_CONTEXT;
+ *state = AUDIT_STATE_RECORD;
break;
}
return 1;
@@ -746,14 +779,14 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
&state, true)) {
- if (state == AUDIT_RECORD_CONTEXT)
+ if (state == AUDIT_STATE_RECORD)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
return state;
}
}
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+ return AUDIT_STATE_BUILD;
}
static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
@@ -772,33 +805,60 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
return rule->mask[word] & bit;
}
-/* At syscall entry and exit time, this filter is called if the
- * audit_state is not low enough that auditing cannot take place, but is
- * also not high enough that we already know we have to write an audit
- * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+/**
+ * audit_filter_uring - apply filters to an io_uring operation
+ * @tsk: associated task
+ * @ctx: audit context
*/
-static enum audit_state audit_filter_syscall(struct task_struct *tsk,
- struct audit_context *ctx,
- struct list_head *list)
+static void audit_filter_uring(struct task_struct *tsk,
+ struct audit_context *ctx)
{
struct audit_entry *e;
enum audit_state state;
if (auditd_test_task(tsk))
- return AUDIT_DISABLED;
+ return;
rcu_read_lock();
- list_for_each_entry_rcu(e, list, list) {
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
+ list) {
+ if (audit_in_mask(&e->rule, ctx->uring_op) &&
+ audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
+ false)) {
+ rcu_read_unlock();
+ ctx->current_state = state;
+ return;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/* At syscall exit time, this filter is called if the audit_state is
+ * not low enough that auditing cannot take place, but is also not
+ * high enough that we already know we have to write an audit record
+ * (i.e., the state is AUDIT_STATE_BUILD).
+ */
+static void audit_filter_syscall(struct task_struct *tsk,
+ struct audit_context *ctx)
+{
+ struct audit_entry *e;
+ enum audit_state state;
+
+ if (auditd_test_task(tsk))
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_EXIT], list) {
if (audit_in_mask(&e->rule, ctx->major) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
&state, false)) {
rcu_read_unlock();
ctx->current_state = state;
- return state;
+ return;
}
}
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+ return;
}
/*
@@ -883,10 +943,81 @@ static inline void audit_free_aux(struct audit_context *context)
context->aux = aux->next;
kfree(aux);
}
+ context->aux = NULL;
while ((aux = context->aux_pids)) {
context->aux_pids = aux->next;
kfree(aux);
}
+ context->aux_pids = NULL;
+}
+
+/**
+ * audit_reset_context - reset an audit_context structure
+ * @ctx: the audit_context to reset
+ *
+ * All fields in the audit_context will be reset to an initial state, all
+ * references held by fields will be dropped, and private memory will be
+ * released. When this function returns the audit_context will be suitable
+ * for reuse, so long as the passed context is not NULL or a dummy context.
+ */
+static void audit_reset_context(struct audit_context *ctx)
+{
+ if (!ctx)
+ return;
+
+ /* if ctx is non-null, reset the "ctx->state" regardless */
+ ctx->context = AUDIT_CTX_UNUSED;
+ if (ctx->dummy)
+ return;
+
+ /*
+ * NOTE: It shouldn't matter in what order we release the fields, so
+ * release them in the order in which they appear in the struct;
+ * this gives us some hope of quickly making sure we are
+ * resetting the audit_context properly.
+ *
+ * Other things worth mentioning:
+ * - we don't reset "dummy"
+ * - we don't reset "state", we do reset "current_state"
+ * - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD
+ * - much of this is likely overkill, but play it safe for now
+ * - we really need to work on improving the audit_context struct
+ */
+
+ ctx->current_state = ctx->state;
+ ctx->serial = 0;
+ ctx->major = 0;
+ ctx->uring_op = 0;
+ ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
+ memset(ctx->argv, 0, sizeof(ctx->argv));
+ ctx->return_code = 0;
+ ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
+ ctx->return_valid = AUDITSC_INVALID;
+ audit_free_names(ctx);
+ if (ctx->state != AUDIT_STATE_RECORD) {
+ kfree(ctx->filterkey);
+ ctx->filterkey = NULL;
+ }
+ audit_free_aux(ctx);
+ kfree(ctx->sockaddr);
+ ctx->sockaddr = NULL;
+ ctx->sockaddr_len = 0;
+ ctx->pid = ctx->ppid = 0;
+ ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
+ ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
+ ctx->personality = 0;
+ ctx->arch = 0;
+ ctx->target_pid = 0;
+ ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
+ ctx->target_sessionid = 0;
+ ctx->target_sid = 0;
+ ctx->target_comm[0] = '\0';
+ unroll_tree_refs(ctx, NULL, 0);
+ WARN_ON(!list_empty(&ctx->killed_trees));
+ ctx->type = 0;
+ audit_free_module(ctx);
+ ctx->fds[0] = -1;
+ audit_proctitle_free(ctx);
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -896,10 +1027,13 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return NULL;
+ context->context = AUDIT_CTX_UNUSED;
context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+ context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
+ context->fds[0] = -1;
+ context->return_valid = AUDITSC_INVALID;
return context;
}
@@ -919,11 +1053,11 @@ int audit_alloc(struct task_struct *tsk)
char *key = NULL;
if (likely(!audit_ever_enabled))
- return 0; /* Return if not auditing. */
+ return 0;
state = audit_filter_task(tsk, &key);
- if (state == AUDIT_DISABLED) {
- clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ if (state == AUDIT_STATE_DISABLED) {
+ clear_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
@@ -935,20 +1069,41 @@ int audit_alloc(struct task_struct *tsk)
context->filterkey = key;
audit_set_context(tsk, context);
- set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+ set_task_syscall_work(tsk, SYSCALL_AUDIT);
return 0;
}
+/**
+ * audit_alloc_kernel - allocate an audit_context for a kernel task
+ * @tsk: the kernel task
+ *
+ * Similar to the audit_alloc() function, but intended for kernel private
+ * threads. Returns zero on success, negative values on failure.
+ */
+int audit_alloc_kernel(struct task_struct *tsk)
+{
+ /*
+ * At the moment we are just going to call into audit_alloc() to
+ * simplify the code, but there are two things to keep in mind with this
+ * approach:
+ *
+ * 1. Filtering internal kernel tasks is a bit laughable in almost all
+ * cases, but there is at least one case where there is a benefit:
+ * the '-a task,never' case allows the admin to effectively disable
+ * task auditing at runtime.
+ *
+ * 2. The {set,clear}_task_syscall_work() ops likely have zero effect
+ * on these internal kernel tasks, but they probably don't hurt either.
+ */
+ return audit_alloc(tsk);
+}
+
static inline void audit_free_context(struct audit_context *context)
{
- audit_free_module(context);
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
+ /* resetting is extra work, but it is likely just noise */
+ audit_reset_context(context);
free_tree_refs(context);
- audit_free_aux(context);
kfree(context->filterkey);
- kfree(context->sockaddr);
- audit_proctitle_free(context);
kfree(context);
}
@@ -1197,6 +1352,7 @@ static void show_special(struct audit_context *context, int *call_panic)
switch (context->type) {
case AUDIT_SOCKETCALL: {
int nargs = context->socketcall.nargs;
+
audit_log_format(ab, "nargs=%d", nargs);
for (i = 0; i < nargs; i++)
audit_log_format(ab, " a%d=%lx", i,
@@ -1212,6 +1368,7 @@ static void show_special(struct audit_context *context, int *call_panic)
if (osid) {
char *ctx = NULL;
u32 len;
+
if (security_secid_to_secctx(osid, &ctx, &len)) {
audit_log_format(ab, " osid=%u", osid);
*call_panic = 1;
@@ -1261,6 +1418,7 @@ static void show_special(struct audit_context *context, int *call_panic)
break;
case AUDIT_MQ_GETSETATTR: {
struct mq_attr *attr = &context->mq_getsetattr.mqstat;
+
audit_log_format(ab,
"mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
"mq_curmsgs=%ld ",
@@ -1279,6 +1437,12 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break;
+ case AUDIT_OPENAT2:
+ audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",
+ context->openat2.flags,
+ context->openat2.mode,
+ context->openat2.resolve);
+ break;
case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
break;
@@ -1297,6 +1461,7 @@ static void show_special(struct audit_context *context, int *call_panic)
static inline int audit_proctitle_rtrim(char *proctitle, int len)
{
char *end = proctitle + len - 1;
+
while (end > proctitle && !isprint(*end))
end--;
@@ -1338,7 +1503,10 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
/* name was specified as a relative path and the
* directory component is the cwd
*/
- audit_log_d_path(ab, " name=", &context->pwd);
+ if (context->pwd.dentry && context->pwd.mnt)
+ audit_log_d_path(ab, " name=", &context->pwd);
+ else
+ audit_log_format(ab, " name=(null)");
break;
default:
/* log the name's directory component */
@@ -1406,9 +1574,6 @@ static void audit_log_proctitle(void)
struct audit_context *context = audit_context();
struct audit_buffer *ab;
- if (!context || context->dummy)
- return;
-
ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE);
if (!ab)
return; /* audit_panic or being filtered */
@@ -1441,6 +1606,44 @@ out:
audit_log_end(ab);
}
+/**
+ * audit_log_uring - generate an AUDIT_URINGOP record
+ * @ctx: the audit context
+ */
+static void audit_log_uring(struct audit_context *ctx)
+{
+ struct audit_buffer *ab;
+ const struct cred *cred;
+
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
+ if (!ab)
+ return;
+ cred = current_cred();
+ audit_log_format(ab, "uring_op=%d", ctx->uring_op);
+ if (ctx->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (ctx->return_valid == AUDITSC_SUCCESS ?
+ "yes" : "no"),
+ ctx->return_code);
+ audit_log_format(ab,
+ " items=%d"
+ " ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
+ " fsuid=%u egid=%u sgid=%u fsgid=%u",
+ ctx->name_count,
+ task_ppid_nr(current), task_tgid_nr(current),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid));
+ audit_log_task_context(ab);
+ audit_log_key(ab, ctx->filterkey);
+ audit_log_end(ab);
+}
+
static void audit_log_exit(void)
{
int i, call_panic = 0;
@@ -1451,29 +1654,38 @@ static void audit_log_exit(void)
context->personality = current->personality;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
- if (!ab)
- return; /* audit_panic has been called */
- audit_log_format(ab, "arch=%x syscall=%d",
- context->arch, context->major);
- if (context->personality != PER_LINUX)
- audit_log_format(ab, " per=%lx", context->personality);
- if (context->return_valid)
- audit_log_format(ab, " success=%s exit=%ld",
- (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
- context->return_code);
-
- audit_log_format(ab,
- " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
- context->argv[0],
- context->argv[1],
- context->argv[2],
- context->argv[3],
- context->name_count);
-
- audit_log_task_info(ab);
- audit_log_key(ab, context->filterkey);
- audit_log_end(ab);
+ switch (context->context) {
+ case AUDIT_CTX_SYSCALL:
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
+ if (!ab)
+ return;
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
+ if (context->personality != PER_LINUX)
+ audit_log_format(ab, " per=%lx", context->personality);
+ if (context->return_valid != AUDITSC_INVALID)
+ audit_log_format(ab, " success=%s exit=%ld",
+ (context->return_valid == AUDITSC_SUCCESS ?
+ "yes" : "no"),
+ context->return_code);
+ audit_log_format(ab,
+ " a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
+ context->argv[0],
+ context->argv[1],
+ context->argv[2],
+ context->argv[3],
+ context->name_count);
+ audit_log_task_info(ab);
+ audit_log_key(ab, context->filterkey);
+ audit_log_end(ab);
+ break;
+ case AUDIT_CTX_URING:
+ audit_log_uring(context);
+ break;
+ default:
+ BUG();
+ break;
+ }
for (aux = context->aux; aux; aux = aux->next) {
@@ -1485,6 +1697,7 @@ static void audit_log_exit(void)
case AUDIT_BPRM_FCAPS: {
struct audit_aux_data_bprm_fcaps *axs = (void *)aux;
+
audit_log_format(ab, "fver=%x", axs->fcap_ver);
audit_log_cap(ab, "fp", &axs->fcap.permitted);
audit_log_cap(ab, "fi", &axs->fcap.inheritable);
@@ -1563,21 +1776,22 @@ static void audit_log_exit(void)
audit_log_name(context, n, NULL, i++, &call_panic);
}
- audit_log_proctitle();
+ if (context->context == AUDIT_CTX_SYSCALL)
+ audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
audit_log_end(ab);
if (call_panic)
- audit_panic("error converting sid to string");
+ audit_panic("error in audit_log_exit()");
}
/**
* __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
- * Called from copy_process and do_exit
+ * Called from copy_process, do_exit, and the io_uring code
*/
void __audit_free(struct task_struct *tsk)
{
@@ -1586,6 +1800,7 @@ void __audit_free(struct task_struct *tsk)
if (!context)
return;
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
@@ -1594,15 +1809,21 @@ void __audit_free(struct task_struct *tsk)
* random task_struct that doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
- if (tsk == current && !context->dummy && context->in_syscall) {
- context->return_valid = 0;
+ if (tsk == current && !context->dummy) {
+ context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
-
- audit_filter_syscall(tsk, context,
- &audit_filter_list[AUDIT_FILTER_EXIT]);
- audit_filter_inodes(tsk, context);
- if (context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit();
+ if (context->context == AUDIT_CTX_SYSCALL) {
+ audit_filter_syscall(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_exit();
+ } else if (context->context == AUDIT_CTX_URING) {
+ /* TODO: verify this case is real and valid */
+ audit_filter_uring(tsk, context);
+ audit_filter_inodes(tsk, context);
+ if (context->current_state == AUDIT_STATE_RECORD)
+ audit_log_uring(context);
+ }
}
audit_set_context(tsk, NULL);
@@ -1610,6 +1831,131 @@ void __audit_free(struct task_struct *tsk)
}
/**
+ * audit_return_fixup - fixup the return codes in the audit_context
+ * @ctx: the audit_context
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * We need to fix up the return code in the audit logs if the actual return
+ * codes are later going to be fixed by the arch specific signal handlers.
+ */
+static void audit_return_fixup(struct audit_context *ctx,
+ int success, long code)
+{
+ /*
+ * This is actually a test for:
+ * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+ * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+ *
+ * but is faster than a bunch of ||
+ */
+ if (unlikely(code <= -ERESTARTSYS) &&
+ (code >= -ERESTART_RESTARTBLOCK) &&
+ (code != -ENOIOCTLCMD))
+ ctx->return_code = -EINTR;
+ else
+ ctx->return_code = code;
+ ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
+}
+
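The single range test in audit_return_fixup() is possible because the restart codes live in a contiguous block of the kernel-internal errno space, with ENOIOCTLCMD in the middle of that block, hence the explicit exclusion. A sketch of the equivalence, annotated with the values from include/linux/errno.h:

#include <linux/errno.h>
#include <linux/types.h>

/*
 * ERESTARTSYS            512   -> code <= -512
 * ERESTARTNOINTR         513
 * ERESTARTNOHAND         514
 * ENOIOCTLCMD            515   (not a restart code, excluded explicitly)
 * ERESTART_RESTARTBLOCK  516   -> code >= -516
 */
static bool is_restart_code(long code)
{
    return code <= -ERESTARTSYS &&
           code >= -ERESTART_RESTARTBLOCK &&
           code != -ENOIOCTLCMD;
}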
+/**
+ * __audit_uring_entry - prepare the kernel task's audit context for io_uring
+ * @op: the io_uring opcode
+ *
+ * This is similar to audit_syscall_entry() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_entry() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_entry(u8 op)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->state == AUDIT_STATE_DISABLED)
+ return;
+
+ /*
+ * NOTE: It's possible that we can be called from the process' context
+ * before it returns to userspace, and before audit_syscall_exit()
+ * is called. In this case there is not much to do, just record
+ * the io_uring details and return.
+ */
+ ctx->uring_op = op;
+ if (ctx->context == AUDIT_CTX_SYSCALL)
+ return;
+
+ ctx->dummy = !audit_n_rules;
+ if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
+ ctx->prio = 0;
+
+ ctx->context = AUDIT_CTX_URING;
+ ctx->current_state = ctx->state;
+ ktime_get_coarse_real_ts64(&ctx->ctime);
+}
+
+/**
+ * __audit_uring_exit - wrap up the kernel task's audit context after io_uring
+ * @success: true/false value to indicate if the operation succeeded or not
+ * @code: operation return code
+ *
+ * This is similar to audit_syscall_exit() but is intended for use by io_uring
+ * operations. This function should only ever be called from
+ * audit_uring_exit() as we rely on the audit context checking present in that
+ * function.
+ */
+void __audit_uring_exit(int success, long code)
+{
+ struct audit_context *ctx = audit_context();
+
+ if (ctx->context == AUDIT_CTX_SYSCALL) {
+ /*
+ * NOTE: See the note in __audit_uring_entry() about the case
+ * where we may be called from process context before we
+ * return to userspace via audit_syscall_exit(). In this
+ * case we simply emit a URINGOP record and bail, the
+ * normal syscall exit handling will take care of
+ * everything else.
+ * It is also worth mentioning that when we are called,
+ * the current process creds may differ from the creds
+ * used during the normal syscall processing; keep that
+ * in mind if/when we move the record generation code.
+ */
+
+ /*
+ * We need to filter on the syscall info here to decide if we
+ * should emit a URINGOP record. I know it seems odd but this
+ * solves the problem where users have a filter to block *all*
+ * syscall records in the "exit" filter; we want to preserve
+ * the behavior here.
+ */
+ audit_filter_syscall(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ return;
+
+ audit_log_uring(ctx);
+ return;
+ }
+
+ /* this may generate CONFIG_CHANGE records */
+ if (!list_empty(&ctx->killed_trees))
+ audit_kill_trees(ctx);
+
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_uring(current, ctx);
+ audit_filter_inodes(current, ctx);
+ if (ctx->current_state != AUDIT_STATE_RECORD)
+ goto out;
+ audit_return_fixup(ctx, success, code);
+ audit_log_exit();
+
+out:
+ audit_reset_context(ctx);
+}
+
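Per the kerneldoc above, these two entry points are meant to be reached only through the audit_uring_entry()/audit_uring_exit() wrappers, which perform the audit-context checks before calling in. A rough approximation of that guard, only to illustrate the contract (the real wrappers live in include/linux/audit.h and may check more than this):

#include <linux/audit.h>

static inline void audit_uring_exit_guarded(int success, long code)
{
    /* skip the work entirely when the task has no audit context */
    if (!audit_context())
        return;
    __audit_uring_exit(success, code);
}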
+/**
* __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
* @a1: additional syscall register 1
@@ -1620,7 +1966,7 @@ void __audit_free(struct task_struct *tsk)
* Fill in audit context at syscall entry. This only happens if the
* audit context was created when the task was created and the state or
* filters demand the audit context be built. If the state from the
- * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * per-task filter or from the per-syscall filter is AUDIT_STATE_RECORD,
* then the record will be written at syscall exit time (otherwise, it
* will only be written if another part of the kernel requests that it
* be written).
@@ -1634,14 +1980,19 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
if (!audit_enabled || !context)
return;
- BUG_ON(context->in_syscall || context->name_count);
+ WARN_ON(context->context != AUDIT_CTX_UNUSED);
+ WARN_ON(context->name_count);
+ if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
+ audit_panic("unrecoverable error in audit_syscall_entry()");
+ return;
+ }
state = context->state;
- if (state == AUDIT_DISABLED)
+ if (state == AUDIT_STATE_DISABLED)
return;
context->dummy = !audit_n_rules;
- if (!context->dummy && state == AUDIT_BUILD_CONTEXT) {
+ if (!context->dummy && state == AUDIT_STATE_BUILD) {
context->prio = 0;
if (auditd_test_task(current))
return;
@@ -1653,10 +2004,8 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
- context->serial = 0;
- context->in_syscall = 1;
+ context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
- context->ppid = 0;
ktime_get_coarse_real_ts64(&context->ctime);
}
@@ -1666,71 +2015,34 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
* @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
- * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * auditable (either because of the AUDIT_STATE_RECORD state from
* filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In all cases,
* free the names stored from getname().
*/
void __audit_syscall_exit(int success, long return_code)
{
- struct audit_context *context;
+ struct audit_context *context = audit_context();
- context = audit_context();
- if (!context)
- return;
+ if (!context || context->dummy ||
+ context->context != AUDIT_CTX_SYSCALL)
+ goto out;
+ /* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
- if (!context->dummy && context->in_syscall) {
- if (success)
- context->return_valid = AUDITSC_SUCCESS;
- else
- context->return_valid = AUDITSC_FAILURE;
+ /* run through both filters to ensure we set the filterkey properly */
+ audit_filter_syscall(current, context);
+ audit_filter_inodes(current, context);
+ if (context->current_state < AUDIT_STATE_RECORD)
+ goto out;
- /*
- * we need to fix up the return code in the audit logs if the
- * actual return codes are later going to be fixed up by the
- * arch specific signal handlers
- *
- * This is actually a test for:
- * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
- * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
- *
- * but is faster than a bunch of ||
- */
- if (unlikely(return_code <= -ERESTARTSYS) &&
- (return_code >= -ERESTART_RESTARTBLOCK) &&
- (return_code != -ENOIOCTLCMD))
- context->return_code = -EINTR;
- else
- context->return_code = return_code;
-
- audit_filter_syscall(current, context,
- &audit_filter_list[AUDIT_FILTER_EXIT]);
- audit_filter_inodes(current, context);
- if (context->current_state == AUDIT_RECORD_CONTEXT)
- audit_log_exit();
- }
-
- context->in_syscall = 0;
- context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-
- audit_free_module(context);
- audit_free_names(context);
- unroll_tree_refs(context, NULL, 0);
- audit_free_aux(context);
- context->aux = NULL;
- context->aux_pids = NULL;
- context->target_pid = 0;
- context->target_sid = 0;
- context->sockaddr_len = 0;
- context->type = 0;
- context->fds[0] = -1;
- if (context->state != AUDIT_RECORD_CONTEXT) {
- kfree(context->filterkey);
- context->filterkey = NULL;
- }
+ audit_return_fixup(context, success, return_code);
+ audit_log_exit();
+
+out:
+ audit_reset_context(context);
}
static inline void handle_one(const struct inode *inode)
@@ -1739,6 +2051,7 @@ static inline void handle_one(const struct inode *inode)
struct audit_tree_refs *p;
struct audit_chunk *chunk;
int count;
+
if (likely(!inode->i_fsnotify_marks))
return;
context = audit_context();
@@ -1780,8 +2093,10 @@ retry:
seq = read_seqbegin(&rename_lock);
for(;;) {
struct inode *inode = d_backing_inode(d);
+
if (inode && unlikely(inode->i_fsnotify_marks)) {
struct audit_chunk *chunk;
+
chunk = audit_tree_lookup(inode);
if (chunk) {
if (unlikely(!put_tree_ref(context, chunk))) {
@@ -1837,6 +2152,8 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
+ if (!context->pwd.dentry)
+ get_fs_pwd(current->fs, &context->pwd);
return aname;
}
@@ -1877,7 +2194,7 @@ void __audit_getname(struct filename *name)
struct audit_context *context = audit_context();
struct audit_names *n;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
@@ -1888,9 +2205,6 @@ void __audit_getname(struct filename *name)
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
name->refcnt++;
-
- if (!context->pwd.dentry)
- get_fs_pwd(current->fs, &context->pwd);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -1902,7 +2216,7 @@ static inline int audit_copy_fcaps(struct audit_names *name,
if (!dentry)
return 0;
- rc = get_vfs_caps_from_disk(dentry, &caps);
+ rc = get_vfs_caps_from_disk(&init_user_ns, dentry, &caps);
if (rc)
return rc;
@@ -1952,7 +2266,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@@ -2070,7 +2384,7 @@ void __audit_inode_child(struct inode *parent,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
- if (!context->in_syscall)
+ if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@@ -2169,7 +2483,7 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
- if (!ctx->in_syscall)
+ if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
if (!ctx->serial)
ctx->serial = audit_serial();
@@ -2178,7 +2492,7 @@ int auditsc_get_stamp(struct audit_context *ctx,
*serial = ctx->serial;
if (!ctx->prio) {
ctx->prio = 1;
- ctx->current_state = AUDIT_RECORD_CONTEXT;
+ ctx->current_state = AUDIT_STATE_RECORD;
}
return 1;
}
@@ -2260,6 +2574,7 @@ void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
struct audit_context *context = audit_context();
+
context->mq_getsetattr.mqdes = mqdes;
context->mq_getsetattr.mqstat = *mqstat;
context->type = AUDIT_MQ_GETSETATTR;
@@ -2273,6 +2588,7 @@ void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
struct audit_context *context = audit_context();
+
context->ipc.uid = ipcp->uid;
context->ipc.gid = ipcp->gid;
context->ipc.mode = ipcp->mode;
@@ -2337,6 +2653,7 @@ int __audit_socketcall(int nargs, unsigned long *args)
void __audit_fd_pair(int fd1, int fd2)
{
struct audit_context *context = audit_context();
+
context->fds[0] = fd1;
context->fds[1] = fd2;
}
@@ -2354,6 +2671,7 @@ int __audit_sockaddr(int len, void *a)
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
+
if (!p)
return -ENOMEM;
context->sockaddr = p;
@@ -2372,7 +2690,7 @@ void __audit_ptrace(struct task_struct *t)
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &context->target_sid);
+ security_task_getsecid_obj(t, &context->target_sid);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
@@ -2399,7 +2717,7 @@ int audit_signal_info_syscall(struct task_struct *t)
ctx->target_auid = audit_get_loginuid(t);
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
- security_task_getsecid(t, &ctx->target_sid);
+ security_task_getsecid_obj(t, &ctx->target_sid);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
@@ -2420,7 +2738,7 @@ int audit_signal_info_syscall(struct task_struct *t)
axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
- security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+ security_task_getsecid_obj(t, &axp->target_sid[axp->pid_count]);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
@@ -2453,7 +2771,8 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->d.next = context->aux;
context->aux = (void *)ax;
- get_vfs_caps_from_disk(bprm->file->f_path.dentry, &vcaps);
+ get_vfs_caps_from_disk(&init_user_ns,
+ bprm->file->f_path.dentry, &vcaps);
ax->fcap.permitted = vcaps.permitted;
ax->fcap.inheritable = vcaps.inheritable;
@@ -2484,6 +2803,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = audit_context();
+
context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
@@ -2495,11 +2815,22 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
void __audit_mmap_fd(int fd, int flags)
{
struct audit_context *context = audit_context();
+
context->mmap.fd = fd;
context->mmap.flags = flags;
context->type = AUDIT_MMAP;
}
+void __audit_openat2_how(struct open_how *how)
+{
+ struct audit_context *context = audit_context();
+
+ context->openat2.flags = how->flags;
+ context->openat2.mode = how->mode;
+ context->openat2.resolve = how->resolve;
+ context->type = AUDIT_OPENAT2;
+}
+
void __audit_log_kern_module(char *name)
{
struct audit_context *context = audit_context();
@@ -2545,6 +2876,26 @@ void __audit_ntp_log(const struct audit_ntp_data *ad)
audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
}
+void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
+ enum audit_nfcfgop op, gfp_t gfp)
+{
+ struct audit_buffer *ab;
+ char comm[sizeof(current->comm)];
+
+ ab = audit_log_start(audit_context(), gfp, AUDIT_NETFILTER_CFG);
+ if (!ab)
+ return;
+ audit_log_format(ab, "table=%s family=%u entries=%u op=%s",
+ name, af, nentries, audit_nfcfgs[op].s);
+
+ audit_log_format(ab, " pid=%u", task_pid_nr(current));
+ audit_log_task_context(ab); /* subj= */
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, get_task_comm(comm, current));
+ audit_log_end(ab);
+}
+EXPORT_SYMBOL_GPL(__audit_log_nfcfg);
+
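Callers in the netfilter configuration paths pass one of the audit_nfcfgop values from the table near the top of this file, along with the table name, address family and entry count. An illustrative call site (the function name, the "filter" table and the nentries variable are stand-ins, not copied from a real caller):

#include <linux/audit.h>
#include <linux/socket.h>   /* AF_INET */
#include <linux/gfp.h>

static void example_log_table_replace(unsigned int nentries)
{
    __audit_log_nfcfg("filter", AF_INET, nentries,
                      AUDIT_XT_OP_REPLACE, GFP_KERNEL);
}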
static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
@@ -2640,7 +2991,7 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
struct list_head *audit_killed_trees(void)
{
struct audit_context *ctx = audit_context();
- if (likely(!ctx || !ctx->in_syscall))
+ if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
return NULL;
return &ctx->killed_trees;
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a2a97fa3071b..370217dd7e39 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -29,7 +29,7 @@ static void backtrace_test_irq_callback(unsigned long data)
complete(&backtrace_work);
}
-static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+static DECLARE_TASKLET_OLD(backtrace_tasklet, &backtrace_test_irq_callback);
static void backtrace_test_irq(void)
{
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
new file mode 100644
index 000000000000..d24d518ddd63
--- /dev/null
+++ b/kernel/bpf/Kconfig
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+# BPF interpreter that, for example, classic socket filters depend on.
+config BPF
+ bool
+
+# Used by archs to tell that they support BPF JIT compiler plus which
+# flavour. Only one of the two can be selected for a specific arch since
+# eBPF JIT supersedes the cBPF JIT.
+
+# Classic BPF JIT (cBPF)
+config HAVE_CBPF_JIT
+ bool
+
+# Extended BPF JIT (eBPF)
+config HAVE_EBPF_JIT
+ bool
+
+# Used by archs to tell that they want the BPF JIT compiler enabled by
+# default for kernels that were compiled with BPF JIT support.
+config ARCH_WANT_DEFAULT_BPF_JIT
+ bool
+
+menu "BPF subsystem"
+
+config BPF_SYSCALL
+ bool "Enable bpf() system call"
+ select BPF
+ select IRQ_WORK
+ select TASKS_TRACE_RCU
+ select BINARY_PRINTF
+ select NET_SOCK_MSG if NET
+ default n
+ help
+ Enable the bpf() system call that allows manipulating BPF programs
+ and maps via file descriptors.
+
+config BPF_JIT
+ bool "Enable BPF Just In Time compiler"
+ depends on BPF
+ depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
+ depends on MODULES
+ help
+ BPF programs are normally handled by a BPF interpreter. This option
+ allows the kernel to generate native code when a program is loaded
+ into the kernel. This will significantly speed up processing of BPF
+ programs.
+
+ Note, an admin should enable this feature by changing:
+ /proc/sys/net/core/bpf_jit_enable
+ /proc/sys/net/core/bpf_jit_harden (optional)
+ /proc/sys/net/core/bpf_jit_kallsyms (optional)
+
+config BPF_JIT_ALWAYS_ON
+ bool "Permanently enable BPF JIT and remove BPF interpreter"
+ depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT
+ help
+ Enables BPF JIT and removes BPF interpreter to avoid speculative
+ execution of BPF instructions by the interpreter.
+
+config BPF_JIT_DEFAULT_ON
+ def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON
+ depends on HAVE_EBPF_JIT && BPF_JIT
+
+config BPF_UNPRIV_DEFAULT_OFF
+ bool "Disable unprivileged BPF by default"
+ default y
+ depends on BPF_SYSCALL
+ help
+ Disables unprivileged BPF by default by setting the corresponding
+ /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can
+ still reenable it by setting it to 0 later on, or permanently
+ disable it by setting it to 1 (from which no other transition to
+ 0 is possible anymore).
+
+ Unprivileged BPF could be used to exploit certain potential
+ speculative execution side-channel vulnerabilities on unmitigated
+ affected hardware.
+
+ If you are unsure how to answer this question, answer Y.
+
+source "kernel/bpf/preload/Kconfig"
+
+config BPF_LSM
+ bool "Enable BPF LSM Instrumentation"
+ depends on BPF_EVENTS
+ depends on BPF_SYSCALL
+ depends on SECURITY
+ depends on BPF_JIT
+ help
+ Enables instrumentation of the security hooks with BPF programs for
+ implementing dynamic MAC and Audit Policies.
+
+ If you are unsure how to answer this question, answer N.
+
+endmenu # "BPF subsystem"
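Taken together, a configuration that wants the JIT always on and unprivileged BPF locked down ends up with a .config fragment along these lines (symbol names are exactly the ones declared above; whether BPF_JIT_ALWAYS_ON is selectable still depends on the architecture providing HAVE_EBPF_JIT):

CONFIG_BPF=y
CONFIG_BPF_SYSCALL=y
CONFIG_BPF_JIT=y
CONFIG_BPF_JIT_ALWAYS_ON=y
CONFIG_BPF_JIT_DEFAULT_ON=y
CONFIG_BPF_UNPRIV_DEFAULT_OFF=y
# CONFIG_BPF_LSM is not set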
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 046ce5d98033..c1a9be6a4b9f 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,10 +1,16 @@
# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
-CFLAGS_core.o += $(call cc-disable-warning, override-init)
+ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y)
+# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details
+cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
+endif
+CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
+obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o
@@ -12,10 +18,8 @@ obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
-ifeq ($(CONFIG_XDP_SOCKETS),y)
-obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
-endif
obj-$(CONFIG_BPF_SYSCALL) += offload.o
+obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o
endif
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
@@ -29,4 +33,10 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o
endif
ifeq ($(CONFIG_BPF_JIT),y)
obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
+obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
+obj-$(CONFIG_BPF_PRELOAD) += preload/
+
+obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
+$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
+ $(call if_changed_rule,cc_o_c)
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 95d77770353c..c7a5be3bf8be 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -10,11 +10,13 @@
#include <linux/filter.h>
#include <linux/perf_event.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
#include "map_in_map.h"
#define ARRAY_CREATE_FLAG_MASK \
- (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK)
+ (BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
+ BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
static void bpf_array_free_percpu(struct bpf_array *array)
{
@@ -32,8 +34,8 @@ static int bpf_array_alloc_percpu(struct bpf_array *array)
int i;
for (i = 0; i < array->map.max_entries; i++) {
- ptr = __alloc_percpu_gfp(array->elem_size, 8,
- GFP_USER | __GFP_NOWARN);
+ ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
+ GFP_USER | __GFP_NOWARN);
if (!ptr) {
bpf_array_free_percpu(array);
return -ENOMEM;
@@ -60,7 +62,11 @@ int array_map_alloc_check(union bpf_attr *attr)
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
- attr->map_flags & BPF_F_MMAPABLE)
+ attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
+ return -EINVAL;
+
+ if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
+ attr->map_flags & BPF_F_PRESERVE_ELEMS)
return -EINVAL;
if (attr->value_size > KMALLOC_MAX_SIZE)
@@ -75,11 +81,10 @@ int array_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
- int ret, numa_node = bpf_map_attr_numa_node(attr);
+ int numa_node = bpf_map_attr_numa_node(attr);
u32 elem_size, index_mask, max_entries;
- bool unpriv = !capable(CAP_SYS_ADMIN);
- u64 cost, array_size, mask64;
- struct bpf_map_memory mem;
+ bool bypass_spec_v1 = bpf_bypass_spec_v1();
+ u64 array_size, mask64;
struct bpf_array *array;
elem_size = round_up(attr->value_size, 8);
@@ -95,7 +100,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
mask64 -= 1;
index_mask = mask64;
- if (unpriv) {
+ if (!bypass_spec_v1) {
/* round up array size to nearest power of 2,
* since cpu will speculate within index_mask limits
*/
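The index mask exists to bound speculative loads (Spectre v1): unless spec-v1 bypass is allowed, the array is sized up to a power of two and every lookup index is ANDed with size-1, so even a mispredicted bounds check cannot read outside the allocation. A simplified sketch of the same idea (the in-kernel version builds the mask slightly differently; this sketch just uses roundup_pow_of_two()):

#include <linux/log2.h>     /* roundup_pow_of_two() */
#include <linux/types.h>

/*
 * max_entries is rounded up to a power of two, so (rounded - 1) is an
 * all-ones mask; "index & mask" keeps even speculatively executed loads
 * inside the allocated region.
 */
static u32 bounded_index(u32 index, u32 max_entries)
{
    u32 mask = roundup_pow_of_two(max_entries) - 1;

    return index & mask;
}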
@@ -120,44 +125,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
}
}
- /* make sure there is no u32 overflow later in round_up() */
- cost = array_size;
- if (percpu)
- cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
-
- ret = bpf_map_charge_init(&mem, cost);
- if (ret < 0)
- return ERR_PTR(ret);
-
/* allocate all map elements and zero-initialize them */
if (attr->map_flags & BPF_F_MMAPABLE) {
void *data;
/* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */
data = bpf_map_area_mmapable_alloc(array_size, numa_node);
- if (!data) {
- bpf_map_charge_finish(&mem);
+ if (!data)
return ERR_PTR(-ENOMEM);
- }
array = data + PAGE_ALIGN(sizeof(struct bpf_array))
- offsetof(struct bpf_array, value);
} else {
array = bpf_map_area_alloc(array_size, numa_node);
}
- if (!array) {
- bpf_map_charge_finish(&mem);
+ if (!array)
return ERR_PTR(-ENOMEM);
- }
array->index_mask = index_mask;
- array->map.unpriv_array = unpriv;
+ array->map.bypass_spec_v1 = bypass_spec_v1;
/* copy mandatory map attributes */
bpf_map_init_from_attr(&array->map, attr);
- bpf_map_charge_move(&array->map.memory, &mem);
array->elem_size = elem_size;
if (percpu && bpf_array_alloc_percpu(array)) {
- bpf_map_charge_finish(&array->map.memory);
bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
@@ -208,7 +198,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
-static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
@@ -217,9 +207,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
+ if (map->map_flags & BPF_F_INNER_MAP)
+ return -EOPNOTSUPP;
+
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- if (map->unpriv_array) {
+ if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
@@ -294,6 +287,12 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key
return 0;
}
+static void check_and_free_timer_in_array(struct bpf_array *arr, void *val)
+{
+ if (unlikely(map_value_has_timer(&arr->map)))
+ bpf_timer_cancel_and_free(val + arr->map.timer_off);
+}
+
/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
@@ -328,6 +327,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
copy_map_value_locked(map, val, value, false);
else
copy_map_value(map, val, value);
+ check_and_free_timer_in_array(array, val);
}
return 0;
}
@@ -381,18 +381,24 @@ static void *array_map_vmalloc_addr(struct bpf_array *array)
return (void *)round_down((unsigned long)array, PAGE_SIZE);
}
+static void array_map_free_timers(struct bpf_map *map)
+{
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ int i;
+
+ if (likely(!map_value_has_timer(map)))
+ return;
+
+ for (i = 0; i < array->map.max_entries; i++)
+ bpf_timer_cancel_and_free(array->value + array->elem_size * i +
+ map->timer_off);
+}
+
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
- * so the programs (can be more than one that used this map) were
- * disconnected from events. Wait for outstanding programs to complete
- * and free the array
- */
- synchronize_rcu();
-
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
@@ -486,14 +492,202 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
if (!(map->map_flags & BPF_F_MMAPABLE))
return -EINVAL;
- return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff);
+ if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) >
+ PAGE_ALIGN((u64)array->map.max_entries * array->elem_size))
+ return -EINVAL;
+
+ return remap_vmalloc_range(vma, array_map_vmalloc_addr(array),
+ vma->vm_pgoff + pgoff);
+}
+
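+/* Inner arrays created with BPF_F_INNER_MAP skip the inlined lookup
+ * (see array_map_gen_lookup()), so their max_entries does not have to
+ * match the meta map; without the flag, max_entries must be equal.
+ */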
+static bool array_map_meta_equal(const struct bpf_map *meta0,
+ const struct bpf_map *meta1)
+{
+ if (!bpf_map_meta_equal(meta0, meta1))
+ return false;
+ return meta0->map_flags & BPF_F_INNER_MAP ? true :
+ meta0->max_entries == meta1->max_entries;
}
+struct bpf_iter_seq_array_map_info {
+ struct bpf_map *map;
+ void *percpu_value_buf;
+ u32 index;
+};
+
+static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_map *map = info->map;
+ struct bpf_array *array;
+ u32 index;
+
+ if (info->index >= map->max_entries)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ array = container_of(map, struct bpf_array, map);
+ index = info->index & array->index_mask;
+ if (info->percpu_value_buf)
+ return array->pptrs[index];
+ return array->value + array->elem_size * index;
+}
+
+static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_map *map = info->map;
+ struct bpf_array *array;
+ u32 index;
+
+ ++*pos;
+ ++info->index;
+ if (info->index >= map->max_entries)
+ return NULL;
+
+ array = container_of(map, struct bpf_array, map);
+ index = info->index & array->index_mask;
+ if (info->percpu_value_buf)
+ return array->pptrs[index];
+ return array->value + array->elem_size * index;
+}
+
+static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_seq_array_map_info *info = seq->private;
+ struct bpf_iter__bpf_map_elem ctx = {};
+ struct bpf_map *map = info->map;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int off = 0, cpu = 0;
+ void __percpu **pptr;
+ u32 size;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, v == NULL);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+
+ if (!info->percpu_value_buf) {
+ ctx.value = v;
+ } else {
+ pptr = v;
+ size = round_up(map->value_size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(info->percpu_value_buf + off,
+ per_cpu_ptr(pptr, cpu),
+ size);
+ off += size;
+ }
+ ctx.value = info->percpu_value_buf;
+ }
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_array_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_array_map_seq_show(seq, v);
+}
+
+static void bpf_array_map_seq_stop(struct seq_file *seq, void *v)
+{
+ if (!v)
+ (void)__bpf_array_map_seq_show(seq, NULL);
+}
+
+static int bpf_iter_init_array_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+ struct bpf_map *map = aux->map;
+ void *value_buf;
+ u32 buf_size;
+
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
+ buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
+ if (!value_buf)
+ return -ENOMEM;
+
+ seq_info->percpu_value_buf = value_buf;
+ }
+
+ seq_info->map = map;
+ return 0;
+}
+
+static void bpf_iter_fini_array_map(void *priv_data)
+{
+ struct bpf_iter_seq_array_map_info *seq_info = priv_data;
+
+ kfree(seq_info->percpu_value_buf);
+}
+
+static const struct seq_operations bpf_array_map_seq_ops = {
+ .start = bpf_array_map_seq_start,
+ .next = bpf_array_map_seq_next,
+ .stop = bpf_array_map_seq_stop,
+ .show = bpf_array_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_array_map_seq_ops,
+ .init_seq_private = bpf_iter_init_array_map,
+ .fini_seq_private = bpf_iter_fini_array_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
+};
+
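+/* Walk every array element and invoke callback_fn(map, &key, value,
+ * callback_ctx); a non-zero callback return value stops the walk early.
+ * Returns the number of elements visited.
+ */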
+static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+ void *callback_ctx, u64 flags)
+{
+ u32 i, key, num_elems = 0;
+ struct bpf_array *array;
+ bool is_percpu;
+ u64 ret = 0;
+ void *val;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ array = container_of(map, struct bpf_array, map);
+ if (is_percpu)
+ migrate_disable();
+ for (i = 0; i < map->max_entries; i++) {
+ if (is_percpu)
+ val = this_cpu_ptr(array->pptrs[i]);
+ else
+ val = array->value + array->elem_size * i;
+ num_elems++;
+ key = i;
+ ret = callback_fn((u64)(long)map, (u64)(long)&key,
+ (u64)(long)val, (u64)(long)callback_ctx, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ break;
+ }
+
+ if (is_percpu)
+ migrate_enable();
+ return num_elems;
+}
+
+static int array_map_btf_id;
const struct bpf_map_ops array_map_ops = {
+ .map_meta_equal = array_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
.map_get_next_key = array_map_get_next_key,
+ .map_release_uref = array_map_free_timers,
.map_lookup_elem = array_map_lookup_elem,
.map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem,
@@ -505,9 +699,16 @@ const struct bpf_map_ops array_map_ops = {
.map_check_btf = array_map_check_btf,
.map_lookup_batch = generic_map_lookup_batch,
.map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &array_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
+static int percpu_array_map_btf_id;
const struct bpf_map_ops percpu_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = array_map_free,
@@ -517,6 +718,13 @@ const struct bpf_map_ops percpu_array_map_ops = {
.map_delete_elem = array_map_delete_elem,
.map_seq_show_elem = percpu_array_map_seq_show_elem,
.map_check_btf = array_map_check_btf,
+ .map_lookup_batch = generic_map_lookup_batch,
+ .map_update_batch = generic_map_update_batch,
+ .map_set_for_each_callback_args = map_set_for_each_callback_args,
+ .map_for_each_callback = bpf_for_each_array_elem,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &percpu_array_map_btf_id,
+ .iter_seq_info = &iter_seq_info,
};
static int fd_array_map_alloc_check(union bpf_attr *attr)
@@ -535,8 +743,6 @@ static void fd_array_map_free(struct bpf_map *map)
struct bpf_array *array = container_of(map, struct bpf_array, map);
int i;
- synchronize_rcu();
-
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
@@ -748,6 +954,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
struct bpf_prog *old,
struct bpf_prog *new)
{
+ u8 *old_addr, *new_addr, *old_bypass_addr;
struct prog_poke_elem *elem;
struct bpf_array_aux *aux;
@@ -768,12 +975,13 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
* there could be danger of use after free otherwise.
* 2) Initially when we start tracking aux, the program
* is not JITed yet and also does not have a kallsyms
- * entry. We skip these as poke->ip_stable is not
- * active yet. The JIT will do the final fixup before
- * setting it stable. The various poke->ip_stable are
- * successively activated, so tail call updates can
- * arrive from here while JIT is still finishing its
- * final fixup for non-activated poke entries.
+ * entry. We skip these as poke->tailcall_target_stable
+ * is not active yet. The JIT will do the final fixup
+ * before setting it stable. The various
+ * poke->tailcall_target_stable are successively
+ * activated, so tail call updates can arrive from here
+ * while JIT is still finishing its final fixup for
+ * non-activated poke entries.
* 3) On program teardown, the program's kallsym entry gets
* removed out of RCU callback, but we can only untrack
* from sleepable context, therefore bpf_arch_text_poke()
@@ -790,7 +998,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
* 5) Any other error happening below from bpf_arch_text_poke()
 * is an unexpected bug.
*/
- if (!READ_ONCE(poke->ip_stable))
+ if (!READ_ONCE(poke->tailcall_target_stable))
continue;
if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
continue;
@@ -798,12 +1006,39 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
poke->tail_call.key != key)
continue;
- ret = bpf_arch_text_poke(poke->ip, BPF_MOD_JUMP,
- old ? (u8 *)old->bpf_func +
- poke->adj_off : NULL,
- new ? (u8 *)new->bpf_func +
- poke->adj_off : NULL);
- BUG_ON(ret < 0 && ret != -EINVAL);
+ old_bypass_addr = old ? NULL : poke->bypass_addr;
+ old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
+ new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
+
+ if (new) {
+ ret = bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, new_addr);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ if (!old) {
+ ret = bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ poke->bypass_addr,
+ NULL);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ }
+ } else {
+ ret = bpf_arch_text_poke(poke->tailcall_bypass,
+ BPF_MOD_JUMP,
+ old_bypass_addr,
+ poke->bypass_addr);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+			/* let other CPUs finish the execution of the program
+			 * so that it will not be possible to expose them
+			 * to an invalid nop, stack unwind, or nop state
+			 */
+ if (!ret)
+ synchronize_rcu();
+ ret = bpf_arch_text_poke(poke->tailcall_target,
+ BPF_MOD_JUMP,
+ old_addr, NULL);
+ BUG_ON(ret < 0 && ret != -EINVAL);
+ }
}
}
}
@@ -829,13 +1064,14 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
struct bpf_array_aux *aux;
struct bpf_map *map;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL);
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);
if (!aux)
return ERR_PTR(-ENOMEM);
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
+ spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
@@ -863,6 +1099,12 @@ static void prog_array_map_free(struct bpf_map *map)
fd_array_map_free(map);
}
+/* prog_array->aux->{type,jited} is a runtime binding.
+ * Doing a static check alone in the verifier is not enough.
+ * Thus, prog_array_map cannot be used as an inner_map
+ * and map_meta_equal is not implemented.
+ */
+static int prog_array_map_btf_id;
const struct bpf_map_ops prog_array_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = prog_array_map_alloc,
@@ -878,6 +1120,8 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
.map_release_uref = prog_array_map_clear,
.map_seq_show_elem = prog_array_map_seq_show_elem,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &prog_array_map_btf_id,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -947,6 +1191,9 @@ static void perf_event_fd_array_release(struct bpf_map *map,
struct bpf_event_entry *ee;
int i;
+ if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+ return;
+
rcu_read_lock();
for (i = 0; i < array->map.max_entries; i++) {
ee = READ_ONCE(array->ptrs[i]);
@@ -956,10 +1203,19 @@ static void perf_event_fd_array_release(struct bpf_map *map,
rcu_read_unlock();
}
+static void perf_event_fd_array_map_free(struct bpf_map *map)
+{
+ if (map->map_flags & BPF_F_PRESERVE_ELEMS)
+ bpf_fd_array_map_clear(map);
+ fd_array_map_free(map);
+}
+
+static int perf_event_array_map_btf_id;
const struct bpf_map_ops perf_event_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
- .map_free = fd_array_map_free,
+ .map_free = perf_event_fd_array_map_free,
.map_get_next_key = array_map_get_next_key,
.map_lookup_elem = fd_array_map_lookup_elem,
.map_delete_elem = fd_array_map_delete_elem,
@@ -967,6 +1223,8 @@ const struct bpf_map_ops perf_event_array_map_ops = {
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
.map_release = perf_event_fd_array_release,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &perf_event_array_map_btf_id,
};
#ifdef CONFIG_CGROUPS
@@ -989,7 +1247,9 @@ static void cgroup_fd_array_free(struct bpf_map *map)
fd_array_map_free(map);
}
+static int cgroup_array_map_btf_id;
const struct bpf_map_ops cgroup_array_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_map_alloc,
.map_free = cgroup_fd_array_free,
@@ -999,6 +1259,8 @@ const struct bpf_map_ops cgroup_array_map_ops = {
.map_fd_get_ptr = cgroup_fd_array_get_ptr,
.map_fd_put_ptr = cgroup_fd_array_put_ptr,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &cgroup_array_map_btf_id,
};
#endif
@@ -1041,7 +1303,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
-static u32 array_of_map_gen_lookup(struct bpf_map *map,
+static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
@@ -1053,7 +1315,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
- if (map->unpriv_array) {
+ if (!map->bypass_spec_v1) {
*insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
*insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
} else {
@@ -1072,6 +1334,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
+static int array_of_maps_map_btf_id;
const struct bpf_map_ops array_of_maps_map_ops = {
.map_alloc_check = fd_array_map_alloc_check,
.map_alloc = array_of_map_alloc,
@@ -1084,4 +1347,6 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
.map_gen_lookup = array_of_map_gen_lookup,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_array",
+ .map_btf_id = &array_of_maps_map_btf_id,
};
diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c
new file mode 100644
index 000000000000..b141a1346f72
--- /dev/null
+++ b/kernel/bpf/bloom_filter.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Facebook */
+
+#include <linux/bitmap.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#define BLOOM_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_ZERO_SEED | BPF_F_ACCESS_MASK)
+
+struct bpf_bloom_filter {
+ struct bpf_map map;
+ u32 bitset_mask;
+ u32 hash_seed;
+ /* If the size of the values in the bloom filter is u32 aligned,
+ * then it is more performant to use jhash2 as the underlying hash
+ * function, else we use jhash. This tracks the number of u32s
+	 * in a u32-aligned value size. If the value size is not u32 aligned,
+ * this will be 0.
+ */
+ u32 aligned_u32_count;
+ u32 nr_hash_funcs;
+ unsigned long bitset[];
+};
+
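+/* Derive the index-th hash by adding the hash function index to the
+ * seed, so a single jhash/jhash2 call site yields nr_hash_funcs
+ * different bit positions for the same value.
+ */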
+static u32 hash(struct bpf_bloom_filter *bloom, void *value,
+ u32 value_size, u32 index)
+{
+ u32 h;
+
+ if (bloom->aligned_u32_count)
+ h = jhash2(value, bloom->aligned_u32_count,
+ bloom->hash_seed + index);
+ else
+ h = jhash(value, value_size, bloom->hash_seed + index);
+
+ return h & bloom->bitset_mask;
+}
+
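+/* A peek hit on every probed bit only means "possibly present" (bloom
+ * filters can report false positives); a miss on any bit means the
+ * value was definitely never pushed.
+ */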
+static int bloom_map_peek_elem(struct bpf_map *map, void *value)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ if (!test_bit(h, bloom->bitset))
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int bloom_map_push_elem(struct bpf_map *map, void *value, u64 flags)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+ u32 i, h;
+
+ if (flags != BPF_ANY)
+ return -EINVAL;
+
+ for (i = 0; i < bloom->nr_hash_funcs; i++) {
+ h = hash(bloom, value, map->value_size, i);
+ set_bit(h, bloom->bitset);
+ }
+
+ return 0;
+}
+
+static int bloom_map_pop_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bloom_map_delete_elem(struct bpf_map *map, void *value)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bloom_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -EOPNOTSUPP;
+}
+
+static struct bpf_map *bloom_map_alloc(union bpf_attr *attr)
+{
+ u32 bitset_bytes, bitset_mask, nr_hash_funcs, nr_bits;
+ int numa_node = bpf_map_attr_numa_node(attr);
+ struct bpf_bloom_filter *bloom;
+
+ if (!bpf_capable())
+ return ERR_PTR(-EPERM);
+
+ if (attr->key_size != 0 || attr->value_size == 0 ||
+ attr->max_entries == 0 ||
+ attr->map_flags & ~BLOOM_CREATE_FLAG_MASK ||
+ !bpf_map_flags_access_ok(attr->map_flags) ||
+ /* The lower 4 bits of map_extra (0xF) specify the number
+ * of hash functions
+ */
+ (attr->map_extra & ~0xF))
+ return ERR_PTR(-EINVAL);
+
+ nr_hash_funcs = attr->map_extra;
+ if (nr_hash_funcs == 0)
+ /* Default to using 5 hash functions if unspecified */
+ nr_hash_funcs = 5;
+
+ /* For the bloom filter, the optimal bit array size that minimizes the
+ * false positive probability is n * k / ln(2) where n is the number of
+ * expected entries in the bloom filter and k is the number of hash
+ * functions. We use 7 / 5 to approximate 1 / ln(2).
+ *
+ * We round this up to the nearest power of two to enable more efficient
+ * hashing using bitmasks. The bitmask will be the bit array size - 1.
+ *
+	 * If this overflows a u32, the bit array will have 2^32 (4 GB)
+	 * bits.
+ */
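+	/* Illustrative example: max_entries = 10000 and nr_hash_funcs = 5
+	 * give nr_bits = 10000 * 5 / 5 * 7 = 70000, which rounds up to
+	 * 2^17 = 131072 bits (16 KiB), so bitset_mask = 0x1ffff.
+	 */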
+ if (check_mul_overflow(attr->max_entries, nr_hash_funcs, &nr_bits) ||
+ check_mul_overflow(nr_bits / 5, (u32)7, &nr_bits) ||
+ nr_bits > (1UL << 31)) {
+ /* The bit array size is 2^32 bits but to avoid overflowing the
+ * u32, we use U32_MAX, which will round up to the equivalent
+ * number of bytes
+ */
+ bitset_bytes = BITS_TO_BYTES(U32_MAX);
+ bitset_mask = U32_MAX;
+ } else {
+ if (nr_bits <= BITS_PER_LONG)
+ nr_bits = BITS_PER_LONG;
+ else
+ nr_bits = roundup_pow_of_two(nr_bits);
+ bitset_bytes = BITS_TO_BYTES(nr_bits);
+ bitset_mask = nr_bits - 1;
+ }
+
+ bitset_bytes = roundup(bitset_bytes, sizeof(unsigned long));
+ bloom = bpf_map_area_alloc(sizeof(*bloom) + bitset_bytes, numa_node);
+
+ if (!bloom)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&bloom->map, attr);
+
+ bloom->nr_hash_funcs = nr_hash_funcs;
+ bloom->bitset_mask = bitset_mask;
+
+ /* Check whether the value size is u32-aligned */
+ if ((attr->value_size & (sizeof(u32) - 1)) == 0)
+ bloom->aligned_u32_count =
+ attr->value_size / sizeof(u32);
+
+ if (!(attr->map_flags & BPF_F_ZERO_SEED))
+ bloom->hash_seed = get_random_int();
+
+ return &bloom->map;
+}
+
+static void bloom_map_free(struct bpf_map *map)
+{
+ struct bpf_bloom_filter *bloom =
+ container_of(map, struct bpf_bloom_filter, map);
+
+ bpf_map_area_free(bloom);
+}
+
+static void *bloom_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ /* The eBPF program should use map_peek_elem instead */
+ return ERR_PTR(-EINVAL);
+}
+
+static int bloom_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ /* The eBPF program should use map_push_elem instead */
+ return -EINVAL;
+}
+
+static int bloom_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ /* Bloom filter maps are keyless */
+ return btf_type_is_void(key_type) ? 0 : -EINVAL;
+}
+
+static int bpf_bloom_map_btf_id;
+const struct bpf_map_ops bloom_filter_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = bloom_map_alloc,
+ .map_free = bloom_map_free,
+ .map_get_next_key = bloom_map_get_next_key,
+ .map_push_elem = bloom_map_push_elem,
+ .map_peek_elem = bloom_map_peek_elem,
+ .map_pop_elem = bloom_map_pop_elem,
+ .map_lookup_elem = bloom_map_lookup_elem,
+ .map_update_elem = bloom_map_update_elem,
+ .map_delete_elem = bloom_map_delete_elem,
+ .map_check_btf = bloom_map_check_btf,
+ .map_btf_name = "bpf_bloom_filter",
+ .map_btf_id = &bpf_bloom_map_btf_id,
+};
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
new file mode 100644
index 000000000000..e29d9e3d853e
--- /dev/null
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/bpf_lsm.h>
+#include <linux/btf_ids.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
+
+DEFINE_BPF_STORAGE_CACHE(inode_cache);
+
+static struct bpf_local_storage __rcu **
+inode_storage_ptr(void *owner)
+{
+ struct inode *inode = owner;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return NULL;
+ return &bsb->storage;
+}
+
+static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
+ struct bpf_map *map,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage *inode_storage;
+ struct bpf_local_storage_map *smap;
+ struct bpf_storage_blob *bsb;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return NULL;
+
+ inode_storage =
+ rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
+ if (!inode_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(inode_storage, smap, cacheit_lockit);
+}
+
+void bpf_inode_storage_free(struct inode *inode)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *local_storage;
+ bool free_inode_storage = false;
+ struct bpf_storage_blob *bsb;
+ struct hlist_node *n;
+
+ bsb = bpf_inode(inode);
+ if (!bsb)
+ return;
+
+ rcu_read_lock();
+
+ local_storage = rcu_dereference(bsb->storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /* Neither the bpf_prog nor the bpf-map's syscall
+ * could be modifying the local_storage->list now.
+	 * Thus, no elem can be added to or deleted from the
+ * local_storage->list by the bpf_prog or by the bpf-map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ raw_spin_lock_bh(&local_storage->lock);
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ free_inode_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, false);
+ }
+ raw_spin_unlock_bh(&local_storage->lock);
+ rcu_read_unlock();
+
+	/* free_inode_storage should always be true as long as
+ * local_storage->list was non-empty.
+ */
+ if (free_inode_storage)
+ kfree_rcu(local_storage, rcu);
+}
+
+static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct file *f;
+ int fd;
+
+ fd = *(int *)key;
+ f = fget_raw(fd);
+ if (!f)
+ return ERR_PTR(-EBADF);
+
+ sdata = inode_storage_lookup(f->f_inode, map, true);
+ fput(f);
+ return sdata ? sdata->data : NULL;
+}
+
+static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct file *f;
+ int fd;
+
+ fd = *(int *)key;
+ f = fget_raw(fd);
+ if (!f)
+ return -EBADF;
+ if (!inode_storage_ptr(f->f_inode)) {
+ fput(f);
+ return -EBADF;
+ }
+
+ sdata = bpf_local_storage_update(f->f_inode,
+ (struct bpf_local_storage_map *)map,
+ value, map_flags);
+ fput(f);
+ return PTR_ERR_OR_ZERO(sdata);
+}
+
+static int inode_storage_delete(struct inode *inode, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = inode_storage_lookup(inode, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata));
+
+ return 0;
+}
+
+static int bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct file *f;
+ int fd, err;
+
+ fd = *(int *)key;
+ f = fget_raw(fd);
+ if (!f)
+ return -EBADF;
+
+ err = inode_storage_delete(f->f_inode, map);
+ fput(f);
+ return err;
+}
+
+BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ /* explicitly check that the inode_storage_ptr is not
+ * NULL as inode_storage_lookup returns NULL in this case and
+ * bpf_local_storage_update expects the owner to have a
+ * valid storage pointer.
+ */
+ if (!inode || !inode_storage_ptr(inode))
+ return (unsigned long)NULL;
+
+ sdata = inode_storage_lookup(inode, map, true);
+ if (sdata)
+ return (unsigned long)sdata->data;
+
+	/* This helper must only be called from where the inode is guaranteed
+ * to have a refcount and cannot be freed.
+ */
+ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
+ sdata = bpf_local_storage_update(
+ inode, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST);
+ return IS_ERR(sdata) ? (unsigned long)NULL :
+ (unsigned long)sdata->data;
+ }
+
+ return (unsigned long)NULL;
+}
+
+BPF_CALL_2(bpf_inode_storage_delete,
+ struct bpf_map *, map, struct inode *, inode)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!inode)
+ return -EINVAL;
+
+	/* This helper must only be called from where the inode is guaranteed
+ * to have a refcount and cannot be freed.
+ */
+ return inode_storage_delete(inode, map);
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache);
+ return &smap->map;
+}
+
+static void inode_storage_map_free(struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
+ bpf_local_storage_map_free(smap, NULL);
+}
+
+static int inode_storage_map_btf_id;
+const struct bpf_map_ops inode_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = inode_storage_map_alloc,
+ .map_free = inode_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_fd_inode_storage_lookup_elem,
+ .map_update_elem = bpf_fd_inode_storage_update_elem,
+ .map_delete_elem = bpf_fd_inode_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_name = "bpf_local_storage_map",
+ .map_btf_id = &inode_storage_map_btf_id,
+ .map_owner_storage_ptr = inode_storage_ptr,
+};
+
+BTF_ID_LIST_SINGLE(bpf_inode_storage_btf_ids, struct, inode)
+
+const struct bpf_func_proto bpf_inode_storage_get_proto = {
+ .func = bpf_inode_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_inode_storage_btf_ids[0],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_inode_storage_delete_proto = {
+ .func = bpf_inode_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &bpf_inode_storage_btf_ids[0],
+};
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
new file mode 100644
index 000000000000..b7aef5b3416d
--- /dev/null
+++ b/kernel/bpf/bpf_iter.c
@@ -0,0 +1,751 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+struct bpf_iter_target_info {
+ struct list_head list;
+ const struct bpf_iter_reg *reg_info;
+ u32 btf_id; /* cached value */
+};
+
+struct bpf_iter_link {
+ struct bpf_link link;
+ struct bpf_iter_aux_info aux;
+ struct bpf_iter_target_info *tinfo;
+};
+
+struct bpf_iter_priv_data {
+ struct bpf_iter_target_info *tinfo;
+ const struct bpf_iter_seq_info *seq_info;
+ struct bpf_prog *prog;
+ u64 session_id;
+ u64 seq_num;
+ bool done_stop;
+ u8 target_private[] __aligned(8);
+};
+
+static struct list_head targets = LIST_HEAD_INIT(targets);
+static DEFINE_MUTEX(targets_mutex);
+
+/* protect bpf_iter_link changes */
+static DEFINE_MUTEX(link_mutex);
+
+/* incremented on every opened seq_file */
+static atomic64_t session_id;
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
+ const struct bpf_iter_seq_info *seq_info);
+
+static void bpf_iter_inc_seq_num(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->seq_num++;
+}
+
+static void bpf_iter_dec_seq_num(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->seq_num--;
+}
+
+static void bpf_iter_done_stop(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ iter_priv->done_stop = true;
+}
+
+static bool bpf_iter_support_resched(struct seq_file *seq)
+{
+ struct bpf_iter_priv_data *iter_priv;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+ return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED;
+}
+
+/* maximum visited objects before bailing out */
+#define MAX_ITER_OBJECTS 1000000
+
+/* bpf_seq_read, a customized and simpler version for the bpf iterator.
+ * no_llseek is assumed for this file.
+ * The following are differences from seq_read():
+ * . fixed buffer size (PAGE_SIZE << 3)
+ * . assuming no_llseek
+ * . stop() may call bpf program, handling potential overflow there
+ */
+static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ struct seq_file *seq = file->private_data;
+ size_t n, offs, copied = 0;
+ int err = 0, num_objs = 0;
+ bool can_resched;
+ void *p;
+
+ mutex_lock(&seq->lock);
+
+ if (!seq->buf) {
+ seq->size = PAGE_SIZE << 3;
+ seq->buf = kvmalloc(seq->size, GFP_KERNEL);
+ if (!seq->buf) {
+ err = -ENOMEM;
+ goto done;
+ }
+ }
+
+ if (seq->count) {
+ n = min(seq->count, size);
+ err = copy_to_user(buf, seq->buf + seq->from, n);
+ if (err) {
+ err = -EFAULT;
+ goto done;
+ }
+ seq->count -= n;
+ seq->from += n;
+ copied = n;
+ goto done;
+ }
+
+ seq->from = 0;
+ p = seq->op->start(seq, &seq->index);
+ if (!p)
+ goto stop;
+ if (IS_ERR(p)) {
+ err = PTR_ERR(p);
+ seq->op->stop(seq, p);
+ seq->count = 0;
+ goto done;
+ }
+
+ err = seq->op->show(seq, p);
+ if (err > 0) {
+ /* object is skipped, decrease seq_num, so next
+ * valid object can reuse the same seq_num.
+ */
+ bpf_iter_dec_seq_num(seq);
+ seq->count = 0;
+ } else if (err < 0 || seq_has_overflowed(seq)) {
+ if (!err)
+ err = -E2BIG;
+ seq->op->stop(seq, p);
+ seq->count = 0;
+ goto done;
+ }
+
+ can_resched = bpf_iter_support_resched(seq);
+ while (1) {
+ loff_t pos = seq->index;
+
+ num_objs++;
+ offs = seq->count;
+ p = seq->op->next(seq, p, &seq->index);
+ if (pos == seq->index) {
+ pr_info_ratelimited("buggy seq_file .next function %ps "
+					    "did not update the position index\n",
+ seq->op->next);
+ seq->index++;
+ }
+
+ if (IS_ERR_OR_NULL(p))
+ break;
+
+ /* got a valid next object, increase seq_num */
+ bpf_iter_inc_seq_num(seq);
+
+ if (seq->count >= size)
+ break;
+
+ if (num_objs >= MAX_ITER_OBJECTS) {
+ if (offs == 0) {
+ err = -EAGAIN;
+ seq->op->stop(seq, p);
+ goto done;
+ }
+ break;
+ }
+
+ err = seq->op->show(seq, p);
+ if (err > 0) {
+ bpf_iter_dec_seq_num(seq);
+ seq->count = offs;
+ } else if (err < 0 || seq_has_overflowed(seq)) {
+ seq->count = offs;
+ if (offs == 0) {
+ if (!err)
+ err = -E2BIG;
+ seq->op->stop(seq, p);
+ goto done;
+ }
+ break;
+ }
+
+ if (can_resched)
+ cond_resched();
+ }
+stop:
+ offs = seq->count;
+ /* bpf program called if !p */
+ seq->op->stop(seq, p);
+ if (!p) {
+ if (!seq_has_overflowed(seq)) {
+ bpf_iter_done_stop(seq);
+ } else {
+ seq->count = offs;
+ if (offs == 0) {
+ err = -E2BIG;
+ goto done;
+ }
+ }
+ }
+
+ n = min(seq->count, size);
+ err = copy_to_user(buf, seq->buf, n);
+ if (err) {
+ err = -EFAULT;
+ goto done;
+ }
+ copied = n;
+ seq->count -= n;
+ seq->from = n;
+done:
+ if (!copied)
+ copied = err;
+ else
+ *ppos += copied;
+ mutex_unlock(&seq->lock);
+ return copied;
+}
+
+static const struct bpf_iter_seq_info *
+__get_seq_info(struct bpf_iter_link *link)
+{
+ const struct bpf_iter_seq_info *seq_info;
+
+ if (link->aux.map) {
+ seq_info = link->aux.map->ops->iter_seq_info;
+ if (seq_info)
+ return seq_info;
+ }
+
+ return link->tinfo->reg_info->seq_info;
+}
+
+static int iter_open(struct inode *inode, struct file *file)
+{
+ struct bpf_iter_link *link = inode->i_private;
+
+ return prepare_seq_file(file, link, __get_seq_info(link));
+}
+
+static int iter_release(struct inode *inode, struct file *file)
+{
+ struct bpf_iter_priv_data *iter_priv;
+ struct seq_file *seq;
+
+ seq = file->private_data;
+ if (!seq)
+ return 0;
+
+ iter_priv = container_of(seq->private, struct bpf_iter_priv_data,
+ target_private);
+
+ if (iter_priv->seq_info->fini_seq_private)
+ iter_priv->seq_info->fini_seq_private(seq->private);
+
+ bpf_prog_put(iter_priv->prog);
+ seq->private = iter_priv;
+
+ return seq_release_private(inode, file);
+}
+
+const struct file_operations bpf_iter_fops = {
+ .open = iter_open,
+ .llseek = no_llseek,
+ .read = bpf_seq_read,
+ .release = iter_release,
+};
+
+/* The argument reg_info will be cached in bpf_iter_target_info.
+ * The common practice is to declare the target reg_info as
+ * a const static variable and pass it as an argument to
+ * bpf_iter_reg_target().
+ */
+int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info)
+{
+ struct bpf_iter_target_info *tinfo;
+
+ tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ return -ENOMEM;
+
+ tinfo->reg_info = reg_info;
+ INIT_LIST_HEAD(&tinfo->list);
+
+ mutex_lock(&targets_mutex);
+ list_add(&tinfo->list, &targets);
+ mutex_unlock(&targets_mutex);
+
+ return 0;
+}
+
+void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info)
+{
+ struct bpf_iter_target_info *tinfo;
+ bool found = false;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (reg_info == tinfo->reg_info) {
+ list_del(&tinfo->list);
+ kfree(tinfo);
+ found = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ WARN_ON(found == false);
+}
+
+static void cache_btf_id(struct bpf_iter_target_info *tinfo,
+ struct bpf_prog *prog)
+{
+ tinfo->btf_id = prog->aux->attach_btf_id;
+}
+
+bool bpf_iter_prog_supported(struct bpf_prog *prog)
+{
+ const char *attach_fname = prog->aux->attach_func_name;
+ u32 prog_btf_id = prog->aux->attach_btf_id;
+ const char *prefix = BPF_ITER_FUNC_PREFIX;
+ struct bpf_iter_target_info *tinfo;
+ int prefix_len = strlen(prefix);
+ bool supported = false;
+
+ if (strncmp(attach_fname, prefix, prefix_len))
+ return false;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) {
+ supported = true;
+ break;
+ }
+ if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) {
+ cache_btf_id(tinfo, prog);
+ supported = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ if (supported) {
+ prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size;
+ prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info;
+ }
+
+ return supported;
+}
+
+const struct bpf_func_proto *
+bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_iter_target_info *tinfo;
+ const struct bpf_func_proto *fn = NULL;
+
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog->aux->attach_btf_id) {
+ const struct bpf_iter_reg *reg_info;
+
+ reg_info = tinfo->reg_info;
+ if (reg_info->get_func_proto)
+ fn = reg_info->get_func_proto(func_id, prog);
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+
+ return fn;
+}
+
+static void bpf_iter_link_release(struct bpf_link *link)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+
+ if (iter_link->tinfo->reg_info->detach_target)
+ iter_link->tinfo->reg_info->detach_target(&iter_link->aux);
+}
+
+static void bpf_iter_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+
+ kfree(iter_link);
+}
+
+static int bpf_iter_link_replace(struct bpf_link *link,
+ struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ int ret = 0;
+
+ mutex_lock(&link_mutex);
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+
+ if (link->prog->type != new_prog->type ||
+ link->prog->expected_attach_type != new_prog->expected_attach_type ||
+ link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ mutex_unlock(&link_mutex);
+ return ret;
+}
+
+static void bpf_iter_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+ bpf_iter_show_fdinfo_t show_fdinfo;
+
+ seq_printf(seq,
+ "target_name:\t%s\n",
+ iter_link->tinfo->reg_info->target);
+
+ show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo;
+ if (show_fdinfo)
+ show_fdinfo(&iter_link->aux, seq);
+}
+
+static int bpf_iter_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_iter_link *iter_link =
+ container_of(link, struct bpf_iter_link, link);
+ char __user *ubuf = u64_to_user_ptr(info->iter.target_name);
+ bpf_iter_fill_link_info_t fill_link_info;
+ u32 ulen = info->iter.target_name_len;
+ const char *target_name;
+ u32 target_len;
+
+ if (!ulen ^ !ubuf)
+ return -EINVAL;
+
+ target_name = iter_link->tinfo->reg_info->target;
+ target_len = strlen(target_name);
+ info->iter.target_name_len = target_len + 1;
+
+ if (ubuf) {
+ if (ulen >= target_len + 1) {
+ if (copy_to_user(ubuf, target_name, target_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(ubuf, target_name, ulen - 1))
+ return -EFAULT;
+ if (put_user(zero, ubuf + ulen - 1))
+ return -EFAULT;
+ return -ENOSPC;
+ }
+ }
+
+ fill_link_info = iter_link->tinfo->reg_info->fill_link_info;
+ if (fill_link_info)
+ return fill_link_info(&iter_link->aux, info);
+
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_iter_link_lops = {
+ .release = bpf_iter_link_release,
+ .dealloc = bpf_iter_link_dealloc,
+ .update_prog = bpf_iter_link_replace,
+ .show_fdinfo = bpf_iter_link_show_fdinfo,
+ .fill_link_info = bpf_iter_link_fill_link_info,
+};
+
+bool bpf_link_is_iter(struct bpf_link *link)
+{
+ return link->ops == &bpf_iter_link_lops;
+}
+
+int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr,
+ struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_iter_target_info *tinfo;
+ union bpf_iter_link_info linfo;
+ struct bpf_iter_link *link;
+ u32 prog_btf_id, linfo_len;
+ bool existed = false;
+ bpfptr_t ulinfo;
+ int err;
+
+ if (attr->link_create.target_fd || attr->link_create.flags)
+ return -EINVAL;
+
+ memset(&linfo, 0, sizeof(union bpf_iter_link_info));
+
+ ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel);
+ linfo_len = attr->link_create.iter_info_len;
+ if (bpfptr_is_null(ulinfo) ^ !linfo_len)
+ return -EINVAL;
+
+ if (!bpfptr_is_null(ulinfo)) {
+ err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo),
+ linfo_len);
+ if (err)
+ return err;
+ linfo_len = min_t(u32, linfo_len, sizeof(linfo));
+ if (copy_from_bpfptr(&linfo, ulinfo, linfo_len))
+ return -EFAULT;
+ }
+
+ prog_btf_id = prog->aux->attach_btf_id;
+ mutex_lock(&targets_mutex);
+ list_for_each_entry(tinfo, &targets, list) {
+ if (tinfo->btf_id == prog_btf_id) {
+ existed = true;
+ break;
+ }
+ }
+ mutex_unlock(&targets_mutex);
+ if (!existed)
+ return -ENOENT;
+
+ link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN);
+ if (!link)
+ return -ENOMEM;
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog);
+ link->tinfo = tinfo;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ return err;
+ }
+
+ if (tinfo->reg_info->attach_target) {
+ err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+ }
+
+ return bpf_link_settle(&link_primer);
+}
+
+static void init_seq_meta(struct bpf_iter_priv_data *priv_data,
+ struct bpf_iter_target_info *tinfo,
+ const struct bpf_iter_seq_info *seq_info,
+ struct bpf_prog *prog)
+{
+ priv_data->tinfo = tinfo;
+ priv_data->seq_info = seq_info;
+ priv_data->prog = prog;
+ priv_data->session_id = atomic64_inc_return(&session_id);
+ priv_data->seq_num = 0;
+ priv_data->done_stop = false;
+}
+
+static int prepare_seq_file(struct file *file, struct bpf_iter_link *link,
+ const struct bpf_iter_seq_info *seq_info)
+{
+ struct bpf_iter_priv_data *priv_data;
+ struct bpf_iter_target_info *tinfo;
+ struct bpf_prog *prog;
+ u32 total_priv_dsize;
+ struct seq_file *seq;
+ int err = 0;
+
+ mutex_lock(&link_mutex);
+ prog = link->link.prog;
+ bpf_prog_inc(prog);
+ mutex_unlock(&link_mutex);
+
+ tinfo = link->tinfo;
+ total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) +
+ seq_info->seq_priv_size;
+ priv_data = __seq_open_private(file, seq_info->seq_ops,
+ total_priv_dsize);
+ if (!priv_data) {
+ err = -ENOMEM;
+ goto release_prog;
+ }
+
+ if (seq_info->init_seq_private) {
+ err = seq_info->init_seq_private(priv_data->target_private, &link->aux);
+ if (err)
+ goto release_seq_file;
+ }
+
+ init_seq_meta(priv_data, tinfo, seq_info, prog);
+ seq = file->private_data;
+ seq->private = priv_data->target_private;
+
+ return 0;
+
+release_seq_file:
+ seq_release_private(file->f_inode, file);
+ file->private_data = NULL;
+release_prog:
+ bpf_prog_put(prog);
+ return err;
+}
+
+int bpf_iter_new_fd(struct bpf_link *link)
+{
+ struct bpf_iter_link *iter_link;
+ struct file *file;
+ unsigned int flags;
+ int err, fd;
+
+ if (link->ops != &bpf_iter_link_lops)
+ return -EINVAL;
+
+ flags = O_RDONLY | O_CLOEXEC;
+ fd = get_unused_fd_flags(flags);
+ if (fd < 0)
+ return fd;
+
+ file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto free_fd;
+ }
+
+ iter_link = container_of(link, struct bpf_iter_link, link);
+ err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link));
+ if (err)
+ goto free_file;
+
+ fd_install(fd, file);
+ return fd;
+
+free_file:
+ fput(file);
+free_fd:
+ put_unused_fd(fd);
+ return err;
+}
+
+struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop)
+{
+ struct bpf_iter_priv_data *iter_priv;
+ struct seq_file *seq;
+ void *seq_priv;
+
+ seq = meta->seq;
+ if (seq->file->f_op != &bpf_iter_fops)
+ return NULL;
+
+ seq_priv = seq->private;
+ iter_priv = container_of(seq_priv, struct bpf_iter_priv_data,
+ target_private);
+
+ if (in_stop && iter_priv->done_stop)
+ return NULL;
+
+ meta->session_id = iter_priv->session_id;
+ meta->seq_num = iter_priv->seq_num;
+
+ return iter_priv->prog;
+}
+
+int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx)
+{
+ int ret;
+
+ rcu_read_lock();
+ migrate_disable();
+ ret = bpf_prog_run(prog, ctx);
+ migrate_enable();
+ rcu_read_unlock();
+
+ /* bpf program can only return 0 or 1:
+ * 0 : okay
+ * 1 : retry the same object
+ * The bpf_iter_run_prog() return value
+	 * will be the seq_ops->show() return value.
+ */
+ return ret == 0 ? 0 : -EAGAIN;
+}
+
+BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn,
+ void *, callback_ctx, u64, flags)
+{
+ return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags);
+}
+
+const struct bpf_func_proto bpf_for_each_map_elem_proto = {
+ .func = bpf_for_each_map_elem,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+/* maximum number of loops */
+#define MAX_LOOPS BIT(23)
+
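+/* Call callback_fn(i, callback_ctx) for i = 0..nr_loops - 1; a non-zero
+ * callback return value stops the loop. Returns the number of loops
+ * performed, i.e. i + 1 when the callback stopped the loop early.
+ */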
+BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
+ u64, flags)
+{
+ bpf_callback_t callback = (bpf_callback_t)callback_fn;
+ u64 ret;
+ u32 i;
+
+ if (flags)
+ return -EINVAL;
+ if (nr_loops > MAX_LOOPS)
+ return -E2BIG;
+
+ for (i = 0; i < nr_loops; i++) {
+ ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
+ /* return value: 0 - continue, 1 - stop and return */
+ if (ret)
+ return i + 1;
+ }
+
+ return i;
+}
+
+const struct bpf_func_proto bpf_loop_proto = {
+ .func = bpf_loop,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_FUNC,
+ .arg3_type = ARG_PTR_TO_STACK_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
new file mode 100644
index 000000000000..71de2a89869c
--- /dev/null
+++ b/kernel/bpf/bpf_local_storage.c
@@ -0,0 +1,631 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
+#include <linux/rcupdate_wait.h>
+
+#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
+
+static struct bpf_local_storage_map_bucket *
+select_bucket(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ return &smap->buckets[hash_ptr(selem, smap->bucket_log)];
+}
+
+static int mem_charge(struct bpf_local_storage_map *smap, void *owner, u32 size)
+{
+ struct bpf_map *map = &smap->map;
+
+ if (!map->ops->map_local_storage_charge)
+ return 0;
+
+ return map->ops->map_local_storage_charge(smap, owner, size);
+}
+
+static void mem_uncharge(struct bpf_local_storage_map *smap, void *owner,
+ u32 size)
+{
+ struct bpf_map *map = &smap->map;
+
+ if (map->ops->map_local_storage_uncharge)
+ map->ops->map_local_storage_uncharge(smap, owner, size);
+}
+
+static struct bpf_local_storage __rcu **
+owner_storage(struct bpf_local_storage_map *smap, void *owner)
+{
+ struct bpf_map *map = &smap->map;
+
+ return map->ops->map_owner_storage_ptr(owner);
+}
+
+static bool selem_linked_to_storage(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed(&selem->snode);
+}
+
+static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
+{
+ return !hlist_unhashed(&selem->map_node);
+}
+
+struct bpf_local_storage_elem *
+bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
+ void *value, bool charge_mem)
+{
+ struct bpf_local_storage_elem *selem;
+
+ if (charge_mem && mem_charge(smap, owner, smap->elem_size))
+ return NULL;
+
+ selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (selem) {
+ if (value)
+ memcpy(SDATA(selem)->data, value, smap->map.value_size);
+ return selem;
+ }
+
+ if (charge_mem)
+ mem_uncharge(smap, owner, smap->elem_size);
+
+ return NULL;
+}
+
+void bpf_local_storage_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage *local_storage;
+
+ local_storage = container_of(rcu, struct bpf_local_storage, rcu);
+ kfree_rcu(local_storage, rcu);
+}
+
+static void bpf_selem_free_rcu(struct rcu_head *rcu)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
+ kfree_rcu(selem, rcu);
+}
+
+/* local_storage->lock must be held and selem->local_storage == local_storage.
+ * The caller must ensure selem->smap is still valid to be
+ * dereferenced for its smap->elem_size and smap->cache_idx.
+ */
+bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem,
+ bool uncharge_mem)
+{
+ struct bpf_local_storage_map *smap;
+ bool free_local_storage;
+ void *owner;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ owner = local_storage->owner;
+
+ /* All uncharging on the owner must be done first.
+ * The owner may be freed once the last selem is unlinked
+ * from local_storage.
+ */
+ if (uncharge_mem)
+ mem_uncharge(smap, owner, smap->elem_size);
+
+ free_local_storage = hlist_is_singular_node(&selem->snode,
+ &local_storage->list);
+ if (free_local_storage) {
+ mem_uncharge(smap, owner, sizeof(struct bpf_local_storage));
+ local_storage->owner = NULL;
+
+ /* After this RCU_INIT, owner may be freed and cannot be used */
+ RCU_INIT_POINTER(*owner_storage(smap, owner), NULL);
+
+ /* local_storage is not freed now. local_storage->lock is
+ * still held and raw_spin_unlock_bh(&local_storage->lock)
+ * will be done by the caller.
+ *
+ * Although the unlock will be done under
+		 * rcu_read_lock(), it is more intuitive to
+ * read if the freeing of the storage is done
+ * after the raw_spin_unlock_bh(&local_storage->lock).
+ *
+ * Hence, a "bool free_local_storage" is returned
+		 * to the caller, which then frees the storage after
+ * all the RCU grace periods have expired.
+ */
+ }
+ hlist_del_init_rcu(&selem->snode);
+ if (rcu_access_pointer(local_storage->cache[smap->cache_idx]) ==
+ SDATA(selem))
+ RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
+
+ call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
+ return free_local_storage;
+}
+
+static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage *local_storage;
+ bool free_local_storage = false;
+ unsigned long flags;
+
+ if (unlikely(!selem_linked_to_storage(selem)))
+ /* selem has already been unlinked from sk */
+ return;
+
+ local_storage = rcu_dereference_check(selem->local_storage,
+ bpf_rcu_lock_held());
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (likely(selem_linked_to_storage(selem)))
+ free_local_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, true);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+
+ if (free_local_storage)
+ call_rcu_tasks_trace(&local_storage->rcu,
+ bpf_local_storage_free_rcu);
+}
+
+void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_elem *selem)
+{
+ RCU_INIT_POINTER(selem->local_storage, local_storage);
+ hlist_add_head_rcu(&selem->snode, &local_storage->list);
+}
+
+void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map *smap;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned long flags;
+
+ if (unlikely(!selem_linked_to_map(selem)))
+		/* selem has already been unlinked from smap */
+ return;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
+ b = select_bucket(smap, selem);
+ raw_spin_lock_irqsave(&b->lock, flags);
+ if (likely(selem_linked_to_map(selem)))
+ hlist_del_init_rcu(&selem->map_node);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+}
+
+void bpf_selem_link_map(struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_map_bucket *b = select_bucket(smap, selem);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&b->lock, flags);
+ RCU_INIT_POINTER(SDATA(selem)->smap, smap);
+ hlist_add_head_rcu(&selem->map_node, &b->list);
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+}
+
+void bpf_selem_unlink(struct bpf_local_storage_elem *selem)
+{
+ /* Always unlink from map before unlinking from local_storage
+	 * because selem will be freed after being successfully unlinked from
+ * the local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ __bpf_selem_unlink_storage(selem);
+}
+
+struct bpf_local_storage_data *
+bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
+ struct bpf_local_storage_map *smap,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage_data *sdata;
+ struct bpf_local_storage_elem *selem;
+
+ /* Fast path (cache hit) */
+ sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
+ bpf_rcu_lock_held());
+ if (sdata && rcu_access_pointer(sdata->smap) == smap)
+ return sdata;
+
+ /* Slow path (cache miss) */
+ hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
+ rcu_read_lock_trace_held())
+ if (rcu_access_pointer(SDATA(selem)->smap) == smap)
+ break;
+
+ if (!selem)
+ return NULL;
+
+ sdata = SDATA(selem);
+ if (cacheit_lockit) {
+ unsigned long flags;
+
+ /* spinlock is needed to avoid racing with the
+ * parallel delete. Otherwise, publishing an already
+ * deleted sdata to the cache will become a use-after-free
+ * problem in the next bpf_local_storage_lookup().
+ */
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ if (selem_linked_to_storage(selem))
+ rcu_assign_pointer(local_storage->cache[smap->cache_idx],
+ sdata);
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ }
+
+ return sdata;
+}
+
+static int check_flags(const struct bpf_local_storage_data *old_sdata,
+ u64 map_flags)
+{
+ if (old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST)
+ /* elem already exists */
+ return -EEXIST;
+
+ if (!old_sdata && (map_flags & ~BPF_F_LOCK) == BPF_EXIST)
+ /* elem doesn't exist, cannot update it */
+ return -ENOENT;
+
+ return 0;
+}
+
+int bpf_local_storage_alloc(void *owner,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *first_selem)
+{
+ struct bpf_local_storage *prev_storage, *storage;
+ struct bpf_local_storage **owner_storage_ptr;
+ int err;
+
+ err = mem_charge(smap, owner, sizeof(*storage));
+ if (err)
+ return err;
+
+ storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!storage) {
+ err = -ENOMEM;
+ goto uncharge;
+ }
+
+ INIT_HLIST_HEAD(&storage->list);
+ raw_spin_lock_init(&storage->lock);
+ storage->owner = owner;
+
+ bpf_selem_link_storage_nolock(storage, first_selem);
+ bpf_selem_link_map(smap, first_selem);
+
+ owner_storage_ptr =
+ (struct bpf_local_storage **)owner_storage(smap, owner);
+ /* Publish storage to the owner.
+ * Instead of relying on any lock of the kernel object (i.e. owner),
+ * cmpxchg works with any kernel object regardless of the
+ * running context (bh, irq, etc.).
+ *
+ * From now on, the owner->storage pointer (e.g. sk->sk_bpf_storage)
+ * is protected by the storage->lock. Hence, when freeing
+ * the owner->storage, the storage->lock must be held before
+ * setting owner->storage ptr to NULL.
+ */
+ prev_storage = cmpxchg(owner_storage_ptr, NULL, storage);
+ if (unlikely(prev_storage)) {
+ bpf_selem_unlink_map(first_selem);
+ err = -EAGAIN;
+ goto uncharge;
+
+ /* Note that even though first_selem was linked to smap's
+ * bucket->list, first_selem can be freed immediately
+ * (instead of kfree_rcu) because
+ * bpf_local_storage_map_free() does a
+ * synchronize_rcu_mult (waiting for both sleepable and
+ * normal programs) before walking the bucket->list.
+ * Hence, no one is accessing selem from the
+ * bucket->list under rcu_read_lock().
+ */
+ }
+
+ return 0;
+
+uncharge:
+ kfree(storage);
+ mem_uncharge(smap, owner, sizeof(*storage));
+ return err;
+}
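+
+/* A rough sketch of the first-allocation race that the cmpxchg above
+ * resolves (the interleaving is illustrative only):
+ *
+ *	CPU0					CPU1
+ *	storage0 = bpf_map_kzalloc()		storage1 = bpf_map_kzalloc()
+ *	cmpxchg(ptr, NULL, storage0)		cmpxchg(ptr, NULL, storage1)
+ *	  returns NULL: storage0 published	  returns storage0: lost the race,
+ *						  unlink first_selem, kfree(storage1),
+ *						  uncharge and return -EAGAIN
+ */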
+
+/* The owner (e.g. sk) cannot be going away while a new elem is being
+ * linked to owner->storage (e.g. sk->sk_bpf_storage, i.e. sk->sk_refcnt
+ * cannot be 0). Otherwise, the new elem will leak (and cause other
+ * memory issues during map destruction).
+ */
+struct bpf_local_storage_data *
+bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *old_sdata = NULL;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *local_storage;
+ unsigned long flags;
+ int err;
+
+ /* BPF_EXIST and BPF_NOEXIST cannot be both set */
+ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) ||
+ /* BPF_F_LOCK can only be used in a value with spin_lock */
+ unlikely((map_flags & BPF_F_LOCK) &&
+ !map_value_has_spin_lock(&smap->map)))
+ return ERR_PTR(-EINVAL);
+
+ local_storage = rcu_dereference_check(*owner_storage(smap, owner),
+ bpf_rcu_lock_held());
+ if (!local_storage || hlist_empty(&local_storage->list)) {
+ /* Very first elem for the owner */
+ err = check_flags(NULL, map_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ selem = bpf_selem_alloc(smap, owner, value, true);
+ if (!selem)
+ return ERR_PTR(-ENOMEM);
+
+ err = bpf_local_storage_alloc(owner, smap, selem);
+ if (err) {
+ kfree(selem);
+ mem_uncharge(smap, owner, smap->elem_size);
+ return ERR_PTR(err);
+ }
+
+ return SDATA(selem);
+ }
+
+ if ((map_flags & BPF_F_LOCK) && !(map_flags & BPF_NOEXIST)) {
+ /* Try to find an old_sdata and do an inline update,
+ * so that taking the local_storage->lock and changing
+ * the lists can be avoided.
+ */
+ old_sdata =
+ bpf_local_storage_lookup(local_storage, smap, false);
+ err = check_flags(old_sdata, map_flags);
+ if (err)
+ return ERR_PTR(err);
+ if (old_sdata && selem_linked_to_storage(SELEM(old_sdata))) {
+ copy_map_value_locked(&smap->map, old_sdata->data,
+ value, false);
+ return old_sdata;
+ }
+ }
+
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+
+ /* Recheck local_storage->list under local_storage->lock */
+ if (unlikely(hlist_empty(&local_storage->list))) {
+ /* A parallel del is happening and local_storage is going
+ * away. It was checked just before taking the lock, so this
+ * is very unlikely. Return instead of retrying to keep
+ * things simple.
+ */
+ err = -EAGAIN;
+ goto unlock_err;
+ }
+
+ old_sdata = bpf_local_storage_lookup(local_storage, smap, false);
+ err = check_flags(old_sdata, map_flags);
+ if (err)
+ goto unlock_err;
+
+ if (old_sdata && (map_flags & BPF_F_LOCK)) {
+ copy_map_value_locked(&smap->map, old_sdata->data, value,
+ false);
+ selem = SELEM(old_sdata);
+ goto unlock;
+ }
+
+ /* local_storage->lock is held. Hence, we are sure
+ * we can unlink and uncharge the old_sdata successfully
+ * later. So, instead of charging the new selem now
+ * and then uncharging the old selem later (which may cause
+ * an unnecessary charge failure), take no charge at all
+ * here when old_sdata exists (the "!old_sdata" check), and
+ * correspondingly do not uncharge old_sdata later in
+ * bpf_selem_unlink_storage_nolock().
+ */
+ selem = bpf_selem_alloc(smap, owner, value, !old_sdata);
+ if (!selem) {
+ err = -ENOMEM;
+ goto unlock_err;
+ }
+
+ /* First, link the new selem to the map */
+ bpf_selem_link_map(smap, selem);
+
+ /* Second, link (and publish) the new selem to local_storage */
+ bpf_selem_link_storage_nolock(local_storage, selem);
+
+ /* Third, remove old selem, SELEM(old_sdata) */
+ if (old_sdata) {
+ bpf_selem_unlink_map(SELEM(old_sdata));
+ bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
+ false);
+ }
+
+unlock:
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ return SDATA(selem);
+
+unlock_err:
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ return ERR_PTR(err);
+}
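+
+/* Typical caller sketch (illustrative; compare bpf_pid_task_storage_update_elem()
+ * in bpf_task_storage.c): the owner-specific code resolves its object and
+ * defers everything else to bpf_local_storage_update():
+ *
+ *	sdata = bpf_local_storage_update(task,
+ *			(struct bpf_local_storage_map *)map,
+ *			value, map_flags);
+ *	return PTR_ERR_OR_ZERO(sdata);
+ */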
+
+u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache)
+{
+ u64 min_usage = U64_MAX;
+ u16 i, res = 0;
+
+ spin_lock(&cache->idx_lock);
+
+ for (i = 0; i < BPF_LOCAL_STORAGE_CACHE_SIZE; i++) {
+ if (cache->idx_usage_counts[i] < min_usage) {
+ min_usage = cache->idx_usage_counts[i];
+ res = i;
+
+ /* Found a free cache_idx */
+ if (!min_usage)
+ break;
+ }
+ }
+ cache->idx_usage_counts[res]++;
+
+ spin_unlock(&cache->idx_lock);
+
+ return res;
+}
+
+void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache,
+ u16 idx)
+{
+ spin_lock(&cache->idx_lock);
+ cache->idx_usage_counts[idx]--;
+ spin_unlock(&cache->idx_lock);
+}
+
+void bpf_local_storage_map_free(struct bpf_local_storage_map *smap,
+ int __percpu *busy_counter)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map_bucket *b;
+ unsigned int i;
+
+ /* Note that this map might be concurrently cloned by
+ * bpf_sk_storage_clone(). Wait for any existing bpf_sk_storage_clone()
+ * RCU read section to finish before proceeding. New RCU
+ * read sections should be prevented via bpf_map_inc_not_zero.
+ */
+ synchronize_rcu();
+
+ /* bpf prog and the userspace can no longer access this map
+ * now. No new selem (of this map) can be added
+ * to the owner->storage or to the map bucket's list.
+ *
+ * The elem of this map can be cleaned up here
+ * or when the storage is freed e.g.
+ * by bpf_sk_storage_free() during __sk_destruct().
+ */
+ for (i = 0; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+
+ rcu_read_lock();
+ /* No one is adding to b->list now */
+ while ((selem = hlist_entry_safe(
+ rcu_dereference_raw(hlist_first_rcu(&b->list)),
+ struct bpf_local_storage_elem, map_node))) {
+ if (busy_counter) {
+ migrate_disable();
+ __this_cpu_inc(*busy_counter);
+ }
+ bpf_selem_unlink(selem);
+ if (busy_counter) {
+ __this_cpu_dec(*busy_counter);
+ migrate_enable();
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+ }
+
+ /* While freeing the storage we may still need to access the map.
+ *
+ * e.g. when bpf_sk_storage_free() has unlinked selem from the map
+ * which then made the above while((selem = ...)) loop
+ * exit immediately.
+ *
+ * However, while freeing the storage one still needs to access the
+ * smap->elem_size to do the uncharging in
+ * bpf_selem_unlink_storage_nolock().
+ *
+ * Hence, wait another rcu grace period for the storage to be freed.
+ */
+ synchronize_rcu();
+
+ kvfree(smap->buckets);
+ kfree(smap);
+}
+
+int bpf_local_storage_map_alloc_check(union bpf_attr *attr)
+{
+ if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK ||
+ !(attr->map_flags & BPF_F_NO_PREALLOC) ||
+ attr->max_entries ||
+ attr->key_size != sizeof(int) || !attr->value_size ||
+ /* Enforce BTF for userspace sk dumping */
+ !attr->btf_key_type_id || !attr->btf_value_type_id)
+ return -EINVAL;
+
+ if (!bpf_capable())
+ return -EPERM;
+
+ if (attr->value_size > BPF_LOCAL_STORAGE_MAX_VALUE_SIZE)
+ return -E2BIG;
+
+ return 0;
+}
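+
+/* For reference, a map definition that passes the checks above looks roughly
+ * like this on the BPF program side (libbpf syntax; the map name and value
+ * type below are made up for illustration):
+ *
+ *	struct {
+ *		__uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ *		__uint(map_flags, BPF_F_NO_PREALLOC);
+ *		__type(key, int);
+ *		__type(value, struct my_storage_value);
+ *	} my_storage SEC(".maps");
+ *
+ * i.e. BPF_F_NO_PREALLOC is mandatory, max_entries stays 0, the key is a
+ * 4-byte int and both key and value must carry BTF type ids.
+ */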
+
+struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_local_storage_map *smap;
+ unsigned int i;
+ u32 nbuckets;
+
+ smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
+ bpf_map_init_from_attr(&smap->map, attr);
+
+ nbuckets = roundup_pow_of_two(num_possible_cpus());
+ /* Use at least 2 buckets; select_bucket() is undefined behavior with 1 bucket */
+ nbuckets = max_t(u32, 2, nbuckets);
+ smap->bucket_log = ilog2(nbuckets);
+
+ smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
+ GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
+ if (!smap->buckets) {
+ kfree(smap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < nbuckets; i++) {
+ INIT_HLIST_HEAD(&smap->buckets[i].list);
+ raw_spin_lock_init(&smap->buckets[i].lock);
+ }
+
+ smap->elem_size =
+ sizeof(struct bpf_local_storage_elem) + attr->value_size;
+
+ return smap;
+}
+
+int bpf_local_storage_map_check_btf(const struct bpf_map *map,
+ const struct btf *btf,
+ const struct btf_type *key_type,
+ const struct btf_type *value_type)
+{
+ u32 int_data;
+
+ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+ return -EINVAL;
+
+ int_data = *(u32 *)(key_type + 1);
+ if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data))
+ return -EINVAL;
+
+ return 0;
+}
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 1b6b9349cb85..d99e89f113c4 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -502,13 +502,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
static void bpf_common_lru_push_free(struct bpf_lru *lru,
struct bpf_lru_node *node)
{
+ u8 node_type = READ_ONCE(node->type);
unsigned long flags;
- if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) ||
- WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE))
+ if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
+ WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
return;
- if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) {
+ if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
struct bpf_lru_locallist *loc_l;
loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index f02504640e18..6b12f06ee18c 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -30,7 +30,7 @@ struct bpf_lru_node {
struct bpf_lru_list {
struct list_head lists[NR_BPF_LRU_LIST_T];
unsigned int counts[NR_BPF_LRU_LIST_COUNT];
- /* The next inacitve list rotation starts from here */
+ /* The next inactive list rotation starts from here */
struct list_head *next_inactive_rotation;
raw_spinlock_t lock ____cacheline_aligned_in_smp;
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
new file mode 100644
index 000000000000..06062370c3b8
--- /dev/null
+++ b/kernel/bpf/bpf_lsm.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2020 Google LLC.
+ */
+
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/binfmts.h>
+#include <linux/lsm_hooks.h>
+#include <linux/bpf_lsm.h>
+#include <linux/kallsyms.h>
+#include <linux/bpf_verifier.h>
+#include <net/bpf_sk_storage.h>
+#include <linux/bpf_local_storage.h>
+#include <linux/btf_ids.h>
+#include <linux/ima.h>
+
+/* For every LSM hook that allows attachment of BPF programs, declare a nop
+ * function where a BPF program can be attached.
+ */
+#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
+noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
+{ \
+ return DEFAULT; \
+}
+
+#include <linux/lsm_hook_defs.h>
+#undef LSM_HOOK
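+
+/* For example, assuming the usual lsm_hook_defs.h entry
+ *	LSM_HOOK(int, 0, file_open, struct file *file)
+ * the macro above expands it to:
+ *
+ *	noinline int bpf_lsm_file_open(struct file *file)
+ *	{
+ *		return 0;
+ *	}
+ */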
+
+#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
+BTF_SET_START(bpf_lsm_hooks)
+#include <linux/lsm_hook_defs.h>
+#undef LSM_HOOK
+BTF_SET_END(bpf_lsm_hooks)
+
+int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
+ const struct bpf_prog *prog)
+{
+ if (!prog->gpl_compatible) {
+ bpf_log(vlog,
+ "LSM programs must have a GPL compatible license\n");
+ return -EINVAL;
+ }
+
+ if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {
+ bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
+ prog->aux->attach_btf_id, prog->aux->attach_func_name);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Mask for all the currently supported BPRM option flags */
+#define BPF_F_BPRM_OPTS_MASK BPF_F_BPRM_SECUREEXEC
+
+BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags)
+{
+ if (flags & ~BPF_F_BPRM_OPTS_MASK)
+ return -EINVAL;
+
+ bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC);
+ return 0;
+}
+
+BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm)
+
+static const struct bpf_func_proto bpf_bprm_opts_set_proto = {
+ .func = bpf_bprm_opts_set,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0],
+ .arg2_type = ARG_ANYTHING,
+};
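+
+/* BPF-side usage sketch (illustrative; the program name and attach point are
+ * hypothetical, using libbpf's BPF_PROG macro): an LSM program can mark an
+ * exec as secure with
+ *
+ *	SEC("lsm/bprm_creds_for_exec")
+ *	int BPF_PROG(secure_exec, struct linux_binprm *bprm)
+ *	{
+ *		bpf_bprm_opts_set(bprm, BPF_F_BPRM_SECUREEXEC);
+ *		return 0;
+ *	}
+ */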
+
+BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size)
+{
+ return ima_inode_hash(inode, dst, size);
+}
+
+static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog)
+{
+ return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id);
+}
+
+BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode)
+
+static const struct bpf_func_proto bpf_ima_inode_hash_proto = {
+ .func = bpf_ima_inode_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0],
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE,
+ .allowed = bpf_ima_inode_hash_allowed,
+};
+
+static const struct bpf_func_proto *
+bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_inode_storage_get:
+ return &bpf_inode_storage_get_proto;
+ case BPF_FUNC_inode_storage_delete:
+ return &bpf_inode_storage_delete_proto;
+#ifdef CONFIG_NET
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#endif /* CONFIG_NET */
+ case BPF_FUNC_spin_lock:
+ return &bpf_spin_lock_proto;
+ case BPF_FUNC_spin_unlock:
+ return &bpf_spin_unlock_proto;
+ case BPF_FUNC_bprm_opts_set:
+ return &bpf_bprm_opts_set_proto;
+ case BPF_FUNC_ima_inode_hash:
+ return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL;
+ default:
+ return tracing_prog_func_proto(func_id, prog);
+ }
+}
+
+/* The set of hooks which are called without pagefaults disabled and are allowed
+ * to "sleep" and thus can be used for sleepable BPF programs.
+ */
+BTF_SET_START(sleepable_lsm_hooks)
+BTF_ID(func, bpf_lsm_bpf)
+BTF_ID(func, bpf_lsm_bpf_map)
+BTF_ID(func, bpf_lsm_bpf_map_alloc_security)
+BTF_ID(func, bpf_lsm_bpf_map_free_security)
+BTF_ID(func, bpf_lsm_bpf_prog)
+BTF_ID(func, bpf_lsm_bprm_check_security)
+BTF_ID(func, bpf_lsm_bprm_committed_creds)
+BTF_ID(func, bpf_lsm_bprm_committing_creds)
+BTF_ID(func, bpf_lsm_bprm_creds_for_exec)
+BTF_ID(func, bpf_lsm_bprm_creds_from_file)
+BTF_ID(func, bpf_lsm_capget)
+BTF_ID(func, bpf_lsm_capset)
+BTF_ID(func, bpf_lsm_cred_prepare)
+BTF_ID(func, bpf_lsm_file_ioctl)
+BTF_ID(func, bpf_lsm_file_lock)
+BTF_ID(func, bpf_lsm_file_open)
+BTF_ID(func, bpf_lsm_file_receive)
+
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_inet_conn_established)
+#endif /* CONFIG_SECURITY_NETWORK */
+
+BTF_ID(func, bpf_lsm_inode_create)
+BTF_ID(func, bpf_lsm_inode_free_security)
+BTF_ID(func, bpf_lsm_inode_getattr)
+BTF_ID(func, bpf_lsm_inode_getxattr)
+BTF_ID(func, bpf_lsm_inode_mknod)
+BTF_ID(func, bpf_lsm_inode_need_killpriv)
+BTF_ID(func, bpf_lsm_inode_post_setxattr)
+BTF_ID(func, bpf_lsm_inode_readlink)
+BTF_ID(func, bpf_lsm_inode_rename)
+BTF_ID(func, bpf_lsm_inode_rmdir)
+BTF_ID(func, bpf_lsm_inode_setattr)
+BTF_ID(func, bpf_lsm_inode_setxattr)
+BTF_ID(func, bpf_lsm_inode_symlink)
+BTF_ID(func, bpf_lsm_inode_unlink)
+BTF_ID(func, bpf_lsm_kernel_module_request)
+BTF_ID(func, bpf_lsm_kernfs_init_security)
+
+#ifdef CONFIG_KEYS
+BTF_ID(func, bpf_lsm_key_free)
+#endif /* CONFIG_KEYS */
+
+BTF_ID(func, bpf_lsm_mmap_file)
+BTF_ID(func, bpf_lsm_netlink_send)
+BTF_ID(func, bpf_lsm_path_notify)
+BTF_ID(func, bpf_lsm_release_secctx)
+BTF_ID(func, bpf_lsm_sb_alloc_security)
+BTF_ID(func, bpf_lsm_sb_eat_lsm_opts)
+BTF_ID(func, bpf_lsm_sb_kern_mount)
+BTF_ID(func, bpf_lsm_sb_mount)
+BTF_ID(func, bpf_lsm_sb_remount)
+BTF_ID(func, bpf_lsm_sb_set_mnt_opts)
+BTF_ID(func, bpf_lsm_sb_show_options)
+BTF_ID(func, bpf_lsm_sb_statfs)
+BTF_ID(func, bpf_lsm_sb_umount)
+BTF_ID(func, bpf_lsm_settime)
+
+#ifdef CONFIG_SECURITY_NETWORK
+BTF_ID(func, bpf_lsm_socket_accept)
+BTF_ID(func, bpf_lsm_socket_bind)
+BTF_ID(func, bpf_lsm_socket_connect)
+BTF_ID(func, bpf_lsm_socket_create)
+BTF_ID(func, bpf_lsm_socket_getpeername)
+BTF_ID(func, bpf_lsm_socket_getpeersec_dgram)
+BTF_ID(func, bpf_lsm_socket_getsockname)
+BTF_ID(func, bpf_lsm_socket_getsockopt)
+BTF_ID(func, bpf_lsm_socket_listen)
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_recvmsg)
+BTF_ID(func, bpf_lsm_socket_sendmsg)
+BTF_ID(func, bpf_lsm_socket_shutdown)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+#endif /* CONFIG_SECURITY_NETWORK */
+
+BTF_ID(func, bpf_lsm_syslog)
+BTF_ID(func, bpf_lsm_task_alloc)
+BTF_ID(func, bpf_lsm_task_getsecid_subj)
+BTF_ID(func, bpf_lsm_task_getsecid_obj)
+BTF_ID(func, bpf_lsm_task_prctl)
+BTF_ID(func, bpf_lsm_task_setscheduler)
+BTF_ID(func, bpf_lsm_task_to_inode)
+BTF_SET_END(sleepable_lsm_hooks)
+
+bool bpf_lsm_is_sleepable_hook(u32 btf_id)
+{
+ return btf_id_set_contains(&sleepable_lsm_hooks, btf_id);
+}
+
+const struct bpf_prog_ops lsm_prog_ops = {
+};
+
+const struct bpf_verifier_ops lsm_verifier_ops = {
+ .get_func_proto = bpf_lsm_func_proto,
+ .is_valid_access = btf_ctx_access,
+};
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 042f95534f86..21069dbe9138 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -23,11 +23,12 @@ enum bpf_struct_ops_state {
struct bpf_struct_ops_value {
BPF_STRUCT_OPS_COMMON_VALUE;
- char data[0] ____cacheline_aligned_in_smp;
+ char data[] ____cacheline_aligned_in_smp;
};
struct bpf_struct_ops_map {
struct bpf_map map;
+ struct rcu_head rcu;
const struct bpf_struct_ops *st_ops;
/* protect map_update */
struct mutex lock;
@@ -92,6 +93,9 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
+#ifdef CONFIG_NET
+ .test_run = bpf_struct_ops_test_run,
+#endif
};
static const struct btf_type *module_type;
@@ -161,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
break;
}
- if (btf_member_bitfield_size(t, member)) {
+ if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
@@ -292,14 +296,13 @@ static int check_zero_holes(const struct btf_type *t, void *data)
const struct btf_type *mtype;
for_each_member(i, t, member) {
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- NULL, NULL);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
if (IS_ERR(mtype))
return PTR_ERR(mtype);
prev_mend = moff + msize;
@@ -312,6 +315,20 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return 0;
}
+int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
+ struct bpf_prog *prog,
+ const struct btf_func_model *model,
+ void *image, void *image_end)
+{
+ u32 flags;
+
+ tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
+ tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+ flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
+ return arch_prepare_bpf_trampoline(NULL, image, image_end,
+ model, flags, tprogs, NULL);
+}
+
static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
@@ -320,9 +337,10 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_struct_ops_value *uvalue, *kvalue;
const struct btf_member *member;
const struct btf_type *t = st_ops->type;
+ struct bpf_tramp_progs *tprogs = NULL;
void *udata, *kdata;
int prog_fd, err = 0;
- void *image;
+ void *image, *image_end;
u32 i;
if (flags)
@@ -343,6 +361,10 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
if (uvalue->state || refcount_read(&uvalue->refcnt))
return -EINVAL;
+ tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
+ if (!tprogs)
+ return -ENOMEM;
+
uvalue = (struct bpf_struct_ops_value *)st_map->uvalue;
kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue;
@@ -358,13 +380,14 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
+ image_end = st_map->image + PAGE_SIZE;
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
u32 moff;
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@@ -391,8 +414,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
u32 msize;
mtype = btf_type_by_id(btf_vmlinux, member->type);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- NULL, NULL);
+ mtype = btf_resolve_size(btf_vmlinux, mtype, &msize);
if (IS_ERR(mtype)) {
err = PTR_ERR(mtype);
goto reset_unlock;
@@ -425,10 +447,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto reset_unlock;
}
- err = arch_prepare_bpf_trampoline(image,
- st_map->image + PAGE_SIZE,
- &st_ops->func_models[i], 0,
- &prog, 1, NULL, 0, NULL);
+ err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
+ &st_ops->func_models[i],
+ image, image_end);
if (err < 0)
goto reset_unlock;
@@ -469,6 +490,7 @@ reset_unlock:
memset(uvalue, 0, map->value_size);
memset(kvalue, 0, map->value_size);
unlock:
+ kfree(tprogs);
mutex_unlock(&st_map->lock);
return err;
}
@@ -482,13 +504,21 @@ static int bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
prev_state = cmpxchg(&st_map->kvalue.state,
BPF_STRUCT_OPS_STATE_INUSE,
BPF_STRUCT_OPS_STATE_TOBEFREE);
- if (prev_state == BPF_STRUCT_OPS_STATE_INUSE) {
+ switch (prev_state) {
+ case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops->unreg(&st_map->kvalue.data);
if (refcount_dec_and_test(&st_map->kvalue.refcnt))
bpf_map_put(map);
+ return 0;
+ case BPF_STRUCT_OPS_STATE_TOBEFREE:
+ return -EINPROGRESS;
+ case BPF_STRUCT_OPS_STATE_INIT:
+ return -ENOENT;
+ default:
+ WARN_ON_ONCE(1);
+ /* Should never happen. Treat it as not found. */
+ return -ENOENT;
}
-
- return 0;
}
static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key,
@@ -534,14 +564,12 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
const struct bpf_struct_ops *st_ops;
- size_t map_total_size, st_map_size;
+ size_t st_map_size;
struct bpf_struct_ops_map *st_map;
const struct btf_type *t, *vt;
- struct bpf_map_memory mem;
struct bpf_map *map;
- int err;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
@@ -559,20 +587,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
* struct bpf_struct_ops_tcp_congestions_ops
*/
(vt->size - sizeof(struct bpf_struct_ops_value));
- map_total_size = st_map_size +
- /* uvalue */
- sizeof(vt->size) +
- /* struct bpf_progs **progs */
- btf_type_vlen(t) * sizeof(struct bpf_prog *);
- err = bpf_map_charge_init(&mem, map_total_size);
- if (err < 0)
- return ERR_PTR(err);
st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE);
- if (!st_map) {
- bpf_map_charge_finish(&mem);
+ if (!st_map)
return ERR_PTR(-ENOMEM);
- }
+
st_map->st_ops = st_ops;
map = &st_map->map;
@@ -583,18 +602,17 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
st_map->image = bpf_jit_alloc_exec(PAGE_SIZE);
if (!st_map->uvalue || !st_map->progs || !st_map->image) {
bpf_struct_ops_map_free(map);
- bpf_map_charge_finish(&mem);
return ERR_PTR(-ENOMEM);
}
mutex_init(&st_map->lock);
set_vm_flush_reset_perms(st_map->image);
bpf_map_init_from_attr(map, attr);
- bpf_map_charge_move(&map->memory, &mem);
return map;
}
+static int bpf_struct_ops_map_btf_id;
const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_alloc_check = bpf_struct_ops_map_alloc_check,
.map_alloc = bpf_struct_ops_map_alloc,
@@ -604,6 +622,8 @@ const struct bpf_map_ops bpf_struct_ops_map_ops = {
.map_delete_elem = bpf_struct_ops_map_delete_elem,
.map_update_elem = bpf_struct_ops_map_update_elem,
.map_seq_show_elem = bpf_struct_ops_map_seq_show_elem,
+ .map_btf_name = "bpf_struct_ops_map",
+ .map_btf_id = &bpf_struct_ops_map_btf_id,
};
/* "const void *" because some subsystem is
@@ -618,6 +638,14 @@ bool bpf_struct_ops_get(const void *kdata)
return refcount_inc_not_zero(&kvalue->refcnt);
}
+static void bpf_struct_ops_put_rcu(struct rcu_head *head)
+{
+ struct bpf_struct_ops_map *st_map;
+
+ st_map = container_of(head, struct bpf_struct_ops_map, rcu);
+ bpf_map_put(&st_map->map);
+}
+
void bpf_struct_ops_put(const void *kdata)
{
struct bpf_struct_ops_value *kvalue;
@@ -628,6 +656,17 @@ void bpf_struct_ops_put(const void *kdata)
st_map = container_of(kvalue, struct bpf_struct_ops_map,
kvalue);
- bpf_map_put(&st_map->map);
+ /* The struct_ops's function may switch to another struct_ops.
+ *
+ * For example, bpf_tcp_cc_x->init() may switch to
+ * another tcp_cc_y by calling
+ * setsockopt(TCP_CONGESTION, "tcp_cc_y").
+ * During the switch, bpf_struct_ops_put(tcp_cc_x) is called
+ * and its map->refcnt may reach 0, which would then free its
+ * trampoline image while tcp_cc_x is still running.
+ *
+ * Thus, an rcu grace period is needed here.
+ */
+ call_rcu(&st_map->rcu, bpf_struct_ops_put_rcu);
}
}
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
index 066d83ea1c99..5678a9ddf817 100644
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ b/kernel/bpf/bpf_struct_ops_types.h
@@ -2,6 +2,9 @@
/* internal file - do not include directly */
#ifdef CONFIG_BPF_JIT
+#ifdef CONFIG_NET
+BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
+#endif
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
new file mode 100644
index 000000000000..5da7bed0f5f6
--- /dev/null
+++ b/kernel/bpf/bpf_task_storage.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2020 Facebook
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/pid.h>
+#include <linux/sched.h>
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/bpf_local_storage.h>
+#include <linux/filter.h>
+#include <uapi/linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/fdtable.h>
+#include <linux/rcupdate_trace.h>
+
+DEFINE_BPF_STORAGE_CACHE(task_cache);
+
+static DEFINE_PER_CPU(int, bpf_task_storage_busy);
+
+static void bpf_task_storage_lock(void)
+{
+ migrate_disable();
+ __this_cpu_inc(bpf_task_storage_busy);
+}
+
+static void bpf_task_storage_unlock(void)
+{
+ __this_cpu_dec(bpf_task_storage_busy);
+ migrate_enable();
+}
+
+static bool bpf_task_storage_trylock(void)
+{
+ migrate_disable();
+ if (unlikely(__this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
+ __this_cpu_dec(bpf_task_storage_busy);
+ migrate_enable();
+ return false;
+ }
+ return true;
+}
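+
+/* Sketch of the recursion the per-cpu busy counter guards against
+ * (hypothetical call chain, for illustration only):
+ *
+ *	bpf_task_storage_get()
+ *	  bpf_task_storage_trylock()		// counter 0 -> 1, proceeds
+ *	    raw_spin_lock_irqsave(&local_storage->lock, ...)
+ *	      <another BPF prog runs on this cpu, e.g. via tracing>
+ *	        bpf_task_storage_delete()
+ *	          bpf_task_storage_trylock()	// counter 1 -> 2, returns false
+ *
+ * The inner call backs off (e.g. with -EBUSY) instead of deadlocking on the
+ * local_storage->lock already held on this cpu.
+ */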
+
+static struct bpf_local_storage __rcu **task_storage_ptr(void *owner)
+{
+ struct task_struct *task = owner;
+
+ return &task->bpf_storage;
+}
+
+static struct bpf_local_storage_data *
+task_storage_lookup(struct task_struct *task, struct bpf_map *map,
+ bool cacheit_lockit)
+{
+ struct bpf_local_storage *task_storage;
+ struct bpf_local_storage_map *smap;
+
+ task_storage =
+ rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
+ if (!task_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit);
+}
+
+void bpf_task_storage_free(struct task_struct *task)
+{
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage *local_storage;
+ bool free_task_storage = false;
+ struct hlist_node *n;
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ local_storage = rcu_dereference(task->bpf_storage);
+ if (!local_storage) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /* Neither the bpf_prog nor the bpf-map's syscall
+ * could be modifying the local_storage->list now.
+ * Thus, no elem can be added-to or deleted-from the
+ * local_storage->list by the bpf_prog or by the bpf-map's syscall.
+ *
+ * It is racing with bpf_local_storage_map_free() alone
+ * when unlinking elem from the local_storage->list and
+ * the map's bucket->list.
+ */
+ bpf_task_storage_lock();
+ raw_spin_lock_irqsave(&local_storage->lock, flags);
+ hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) {
+ /* Always unlink from map before unlinking from
+ * local_storage.
+ */
+ bpf_selem_unlink_map(selem);
+ free_task_storage = bpf_selem_unlink_storage_nolock(
+ local_storage, selem, false);
+ }
+ raw_spin_unlock_irqrestore(&local_storage->lock, flags);
+ bpf_task_storage_unlock();
+ rcu_read_unlock();
+
+ /* free_task_storage should always be true as long as
+ * local_storage->list was non-empty.
+ */
+ if (free_task_storage)
+ kfree_rcu(local_storage, rcu);
+}
+
+static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+
+ /* We should be in an RCU read side critical section; it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ sdata = task_storage_lookup(task, map, true);
+ bpf_task_storage_unlock();
+ put_pid(pid);
+ return sdata ? sdata->data : NULL;
+out:
+ put_pid(pid);
+ return ERR_PTR(err);
+}
+
+static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ /* We should be in an RCU read side critical section; it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ sdata = bpf_local_storage_update(
+ task, (struct bpf_local_storage_map *)map, value, map_flags);
+ bpf_task_storage_unlock();
+
+ err = PTR_ERR_OR_ZERO(sdata);
+out:
+ put_pid(pid);
+ return err;
+}
+
+static int task_storage_delete(struct task_struct *task, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = task_storage_lookup(task, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata));
+
+ return 0;
+}
+
+static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct task_struct *task;
+ unsigned int f_flags;
+ struct pid *pid;
+ int fd, err;
+
+ fd = *(int *)key;
+ pid = pidfd_get_pid(fd, &f_flags);
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
+
+ /* We should be in an RCU read side critical section; it should be safe
+ * to call pid_task.
+ */
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ bpf_task_storage_lock();
+ err = task_storage_delete(task, map);
+ bpf_task_storage_unlock();
+out:
+ put_pid(pid);
+ return err;
+}
+
+BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return (unsigned long)NULL;
+
+ if (!task)
+ return (unsigned long)NULL;
+
+ if (!bpf_task_storage_trylock())
+ return (unsigned long)NULL;
+
+ sdata = task_storage_lookup(task, map, true);
+ if (sdata)
+ goto unlock;
+
+ /* only allocate new storage when the task is refcounted */
+ if (refcount_read(&task->usage) &&
+ (flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
+ sdata = bpf_local_storage_update(
+ task, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST);
+
+unlock:
+ bpf_task_storage_unlock();
+ return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL :
+ (unsigned long)sdata->data;
+}
+
+BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
+ task)
+{
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!task)
+ return -EINVAL;
+
+ if (!bpf_task_storage_trylock())
+ return -EBUSY;
+
+ /* This helper must only be called from places where the lifetime of the task
+ * is guaranteed, either by being refcounted or by being protected
+ * by an RCU read-side critical section.
+ */
+ ret = task_storage_delete(task, map);
+ bpf_task_storage_unlock();
+ return ret;
+}
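+
+/* BPF-side usage sketch (illustrative only; the "task_cnt" map and the value
+ * layout are made up): with a BPF_MAP_TYPE_TASK_STORAGE map declared in the
+ * usual libbpf style, a tracing or LSM program can do
+ *
+ *	__u64 *cnt;
+ *
+ *	cnt = bpf_task_storage_get(&task_cnt, bpf_get_current_task_btf(),
+ *				   NULL, BPF_LOCAL_STORAGE_GET_F_CREATE);
+ *	if (cnt)
+ *		__sync_fetch_and_add(cnt, 1);
+ *	...
+ *	bpf_task_storage_delete(&task_cnt, bpf_get_current_task_btf());
+ */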
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = bpf_local_storage_map_alloc(attr);
+ if (IS_ERR(smap))
+ return ERR_CAST(smap);
+
+ smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache);
+ return &smap->map;
+}
+
+static void task_storage_map_free(struct bpf_map *map)
+{
+ struct bpf_local_storage_map *smap;
+
+ smap = (struct bpf_local_storage_map *)map;
+ bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx);
+ bpf_local_storage_map_free(smap, &bpf_task_storage_busy);
+}
+
+static int task_storage_map_btf_id;
+const struct bpf_map_ops task_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = task_storage_map_alloc,
+ .map_free = task_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_pid_task_storage_lookup_elem,
+ .map_update_elem = bpf_pid_task_storage_update_elem,
+ .map_delete_elem = bpf_pid_task_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_name = "bpf_local_storage_map",
+ .map_btf_id = &task_storage_map_btf_id,
+ .map_owner_storage_ptr = task_storage_ptr,
+};
+
+const struct bpf_func_proto bpf_task_storage_get_proto = {
+ .func = bpf_task_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_task_storage_delete_proto = {
+ .func = bpf_task_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID,
+ .arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
+};
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 787140095e58..33bb8ae4a804 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -18,9 +18,14 @@
#include <linux/sort.h>
#include <linux/bpf_verifier.h>
#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/skmsg.h>
#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
#include <net/sock.h>
+#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@@ -47,7 +52,7 @@
* The BTF type section contains a list of 'struct btf_type' objects.
* Each one describes a C type. Recall from the above section
* that a 'struct btf_type' object could be immediately followed by extra
- * data in order to desribe some particular C types.
+ * data in order to describe some particular C types.
*
* type_id:
* ~~~~~~~
@@ -169,7 +174,7 @@
#define BITS_ROUNDUP_BYTES(bits) \
(BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
-#define BTF_INFO_MASK 0x8f00ffff
+#define BTF_INFO_MASK 0x9f00ffff
#define BTF_INT_MASK 0x0fffffff
#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE)
#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET)
@@ -185,11 +190,6 @@
i < btf_type_vlen(struct_type); \
i++, member++)
-#define for_each_vsi(i, struct_type, member) \
- for (i = 0, member = btf_type_var_secinfo(struct_type); \
- i < btf_type_vlen(struct_type); \
- i++, member++)
-
#define for_each_vsi_from(i, from, struct_type, member) \
for (i = from, member = btf_type_var_secinfo(struct_type) + from; \
i < btf_type_vlen(struct_type); \
@@ -206,12 +206,19 @@ struct btf {
const char *strings;
void *nohdr_data;
struct btf_header hdr;
- u32 nr_types;
+ u32 nr_types; /* includes VOID for base BTF */
u32 types_size;
u32 data_size;
refcount_t refcnt;
u32 id;
struct rcu_head rcu;
+
+ /* split BTF support */
+ struct btf *base_btf;
+ u32 start_id; /* first type ID in this BTF (0 for base BTF) */
+ u32 start_str_off; /* first string offset (0 for base BTF) */
+ char name[MODULE_NAME_LEN];
+ bool kernel_btf;
};
enum verifier_phase {
@@ -274,13 +281,101 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_FUNC_PROTO] = "FUNC_PROTO",
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
+ [BTF_KIND_FLOAT] = "FLOAT",
+ [BTF_KIND_DECL_TAG] = "DECL_TAG",
+ [BTF_KIND_TYPE_TAG] = "TYPE_TAG",
};
-static const char *btf_type_str(const struct btf_type *t)
+const char *btf_type_str(const struct btf_type *t)
{
return btf_kind_str[BTF_INFO_KIND(t->info)];
}
+/* Chunk size we use in safe copy of data to be shown. */
+#define BTF_SHOW_OBJ_SAFE_SIZE 32
+
+/*
+ * This is the maximum size of a base type value (equivalent to a
+ * 128-bit int); if we are at the end of our safe buffer and have
+ * less than 16 bytes space we can't be assured of being able
+ * to copy the next type safely, so in such cases we will initiate
+ * a new copy.
+ */
+#define BTF_SHOW_OBJ_BASE_TYPE_SIZE 16
+
+/* Type name size */
+#define BTF_SHOW_NAME_SIZE 80
+
+/*
+ * Common data to all BTF show operations. Private show functions can add
+ * their own data to a structure containing a struct btf_show and consult it
+ * in the show callback. See btf_type_show() below.
+ *
+ * One challenge with showing nested data is we want to skip 0-valued
+ * data, but in order to figure out whether a nested object is all zeros
+ * we need to walk through it. As a result, we need to make two passes
+ * when handling structs, unions and arrays; the first pass simply looks
+ * for nonzero data, while the second actually does the display. The first
+ * pass is signalled by show->state.depth_check being set, and if we
+ * encounter a non-zero value we set show->state.depth_to_show to
+ * the depth at which we encountered it. When we have completed the
+ * first pass, we will know if anything needs to be displayed if
+ * depth_to_show > depth. See btf_[struct,array]_show() for the
+ * implementation of this.
+ *
+ * Another problem is we want to ensure the data for display is safe to
+ * access. To support this, the anonymous "struct {} obj" tracks the data
+ * object and our safe copy of it. We copy portions of the data needed
+ * to the object "copy" buffer, but because its size is limited to
+ * BTF_SHOW_OBJ_SAFE_SIZE bytes, multiple copies may be required as we
+ * traverse larger objects for display.
+ *
+ * The various data type show functions all start with a call to
+ * btf_show_start_type() which returns a pointer to the safe copy
+ * of the data needed (or if BTF_SHOW_UNSAFE is specified, to the
+ * raw data itself). btf_show_obj_safe() is responsible for
+ * using copy_from_kernel_nofault() to update the safe data if necessary
+ * as we traverse the object's data. skbuff-like semantics are
+ * used:
+ *
+ * - obj.head points to the start of the toplevel object for display
+ * - obj.size is the size of the toplevel object
+ * - obj.data points to the current point in the original data at
+ * which our safe data starts. obj.data will advance as we copy
+ * portions of the data.
+ *
+ * In most cases a single copy will suffice, but larger data structures
+ * such as "struct task_struct" will require many copies. The logic in
+ * btf_show_obj_safe() handles the logic that determines if a new
+ * copy_from_kernel_nofault() is needed.
+ */
+struct btf_show {
+ u64 flags;
+ void *target; /* target of show operation (seq file, buffer) */
+ void (*showfn)(struct btf_show *show, const char *fmt, va_list args);
+ const struct btf *btf;
+ /* below are used during iteration */
+ struct {
+ u8 depth;
+ u8 depth_to_show;
+ u8 depth_check;
+ u8 array_member:1,
+ array_terminated:1;
+ u16 array_encoding;
+ u32 type_id;
+ int status; /* non-zero for error */
+ const struct btf_type *type;
+ const struct btf_member *member;
+ char name[BTF_SHOW_NAME_SIZE]; /* space for member name/type */
+ } state;
+ struct {
+ u32 size;
+ void *head;
+ void *data;
+ u8 safe[BTF_SHOW_OBJ_SAFE_SIZE];
+ } obj;
+};
+
struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
@@ -297,9 +392,9 @@ struct btf_kind_operations {
const struct btf_type *member_type);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
- void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+ void (*show)(const struct btf *btf, const struct btf_type *t,
u32 type_id, void *data, u8 bits_offsets,
- struct seq_file *m);
+ struct btf_show *show);
};
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
@@ -325,6 +420,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
return true;
}
@@ -352,16 +448,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
return !t || btf_type_nosize(t);
}
-/* union is only a special case of struct:
- * all its offsetof(member) == 0
- */
-static bool btf_type_is_struct(const struct btf_type *t)
-{
- u8 kind = BTF_INFO_KIND(t->info);
-
- return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
-}
-
static bool __btf_type_is_struct(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
@@ -372,24 +458,43 @@ static bool btf_type_is_array(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
}
-static bool btf_type_is_var(const struct btf_type *t)
+static bool btf_type_is_datasec(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
-static bool btf_type_is_datasec(const struct btf_type *t)
+static bool btf_type_is_decl_tag(const struct btf_type *t)
{
- return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
+ return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
+}
+
+static bool btf_type_is_decl_tag_target(const struct btf_type *t)
+{
+ return btf_type_is_func(t) || btf_type_is_struct(t) ||
+ btf_type_is_var(t) || btf_type_is_typedef(t);
+}
+
+u32 btf_nr_types(const struct btf *btf)
+{
+ u32 total = 0;
+
+ while (btf) {
+ total += btf->nr_types;
+ btf = btf->base_btf;
+ }
+
+ return total;
}
s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind)
{
const struct btf_type *t;
const char *tname;
- u32 i;
+ u32 i, total;
- for (i = 1; i <= btf->nr_types; i++) {
- t = btf->types[i];
+ total = btf_nr_types(btf);
+ for (i = 1; i < total; i++) {
+ t = btf_type_by_id(btf, i);
if (BTF_INFO_KIND(t->info) != kind)
continue;
@@ -447,6 +552,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
static bool btf_type_is_resolve_source_only(const struct btf_type *t)
{
return btf_type_is_var(t) ||
+ btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -473,6 +579,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
+ btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@@ -485,6 +592,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
+ case BTF_KIND_FLOAT:
return true;
}
@@ -525,9 +633,9 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
return (const struct btf_var *)(t + 1);
}
-static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t)
+static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
{
- return (const struct btf_var_secinfo *)(t + 1);
+ return (const struct btf_decl_tag *)(t + 1);
}
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
@@ -537,8 +645,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
{
- return BTF_STR_OFFSET_VALID(offset) &&
- offset < btf->hdr.str_len;
+ if (!BTF_STR_OFFSET_VALID(offset))
+ return false;
+
+ while (offset < btf->start_str_off)
+ btf = btf->base_btf;
+
+ offset -= btf->start_str_off;
+ return offset < btf->hdr.str_len;
}
static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
@@ -552,10 +666,22 @@ static bool __btf_name_char_ok(char c, bool first, bool dot_ok)
return true;
}
+static const char *btf_str_by_offset(const struct btf *btf, u32 offset)
+{
+ while (offset < btf->start_str_off)
+ btf = btf->base_btf;
+
+ offset -= btf->start_str_off;
+ if (offset < btf->hdr.str_len)
+ return &btf->strings[offset];
+
+ return NULL;
+}
+
static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok)
{
/* offset must be valid */
- const char *src = &btf->strings[offset];
+ const char *src = btf_str_by_offset(btf, offset);
const char *src_limit;
if (!__btf_name_char_ok(*src, true, dot_ok))
@@ -588,27 +714,28 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset)
static const char *__btf_name_by_offset(const struct btf *btf, u32 offset)
{
+ const char *name;
+
if (!offset)
return "(anon)";
- else if (offset < btf->hdr.str_len)
- return &btf->strings[offset];
- else
- return "(invalid-name-offset)";
+
+ name = btf_str_by_offset(btf, offset);
+ return name ?: "(invalid-name-offset)";
}
const char *btf_name_by_offset(const struct btf *btf, u32 offset)
{
- if (offset < btf->hdr.str_len)
- return &btf->strings[offset];
-
- return NULL;
+ return btf_str_by_offset(btf, offset);
}
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
{
- if (type_id > btf->nr_types)
- return NULL;
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+ type_id -= btf->start_id;
+ if (type_id >= btf->nr_types)
+ return NULL;
return btf->types[type_id];
}
@@ -676,6 +803,487 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
return true;
}
+/* Similar to btf_type_skip_modifiers() but does not skip typedefs. */
+static const struct btf_type *btf_type_skip_qualifiers(const struct btf *btf,
+ u32 id)
+{
+ const struct btf_type *t = btf_type_by_id(btf, id);
+
+ while (btf_type_is_modifier(t) &&
+ BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF) {
+ t = btf_type_by_id(btf, t->type);
+ }
+
+ return t;
+}
+
+#define BTF_SHOW_MAX_ITER 10
+
+#define BTF_KIND_BIT(kind) (1ULL << kind)
+
+/*
+ * Populate show->state.name with type name information.
+ * Format of type name is
+ *
+ * [.member_name = ] (type_name)
+ */
+static const char *btf_show_name(struct btf_show *show)
+{
+ /* BTF_SHOW_MAX_ITER array suffixes "[]" */
+ const char *array_suffixes = "[][][][][][][][][][]";
+ const char *array_suffix = &array_suffixes[strlen(array_suffixes)];
+ /* BTF_SHOW_MAX_ITER pointer suffixes "*" */
+ const char *ptr_suffixes = "**********";
+ const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
+ const char *name = NULL, *prefix = "", *parens = "";
+ const struct btf_member *m = show->state.member;
+ const struct btf_type *t;
+ const struct btf_array *array;
+ u32 id = show->state.type_id;
+ const char *member = NULL;
+ bool show_member = false;
+ u64 kinds = 0;
+ int i;
+
+ show->state.name[0] = '\0';
+
+ /*
+ * Don't show type name if we're showing an array member;
+ * in that case we show the array type so don't need to repeat
+ * ourselves for each member.
+ */
+ if (show->state.array_member)
+ return "";
+
+ /* Retrieve member name, if any. */
+ if (m) {
+ member = btf_name_by_offset(show->btf, m->name_off);
+ show_member = strlen(member) > 0;
+ id = m->type;
+ }
+
+ /*
+ * Start with type_id, as we have resolved the struct btf_type *
+ * via btf_modifier_show() past the parent typedef to the child
+ * struct, int etc it is defined as. In such cases, the type_id
+ * still represents the starting type while the struct btf_type *
+ * in our show->state points at the resolved type of the typedef.
+ */
+ t = btf_type_by_id(show->btf, id);
+ if (!t)
+ return "";
+
+ /*
+ * The goal here is to build up the right number of pointer and
+ * array suffixes while ensuring the type name for a typedef
+ * is represented. Along the way we accumulate a list of
+ * BTF kinds we have encountered, since these will inform later
+ * display; for example, pointer types will not require an
+ * opening "{" for struct, we will just display the pointer value.
+ *
+ * We also want to accumulate the right number of pointer or array
+ * indices in the format string while iterating until we get to
+ * the typedef/pointee/array member target type.
+ *
+ * We start by pointing at the end of pointer and array suffix
+ * strings; as we accumulate pointers and arrays we move the pointer
+ * or array string backwards so it will show the expected number of
+ * '*' or '[]' for the type. At most BTF_SHOW_MAX_ITER levels of nesting
+ * of pointers, arrays and typedefs are supported as a precaution.
+ *
+ * We also want to get the typedef name while proceeding to resolve the
+ * type it points to, so that we can add parentheses if it is a
+ * "typedef struct" etc.
+ */
+ for (i = 0; i < BTF_SHOW_MAX_ITER; i++) {
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ if (!name)
+ name = btf_name_by_offset(show->btf,
+ t->name_off);
+ kinds |= BTF_KIND_BIT(BTF_KIND_TYPEDEF);
+ id = t->type;
+ break;
+ case BTF_KIND_ARRAY:
+ kinds |= BTF_KIND_BIT(BTF_KIND_ARRAY);
+ parens = "[";
+ if (!t)
+ return "";
+ array = btf_type_array(t);
+ if (array_suffix > array_suffixes)
+ array_suffix -= 2;
+ id = array->type;
+ break;
+ case BTF_KIND_PTR:
+ kinds |= BTF_KIND_BIT(BTF_KIND_PTR);
+ if (ptr_suffix > ptr_suffixes)
+ ptr_suffix -= 1;
+ id = t->type;
+ break;
+ default:
+ id = 0;
+ break;
+ }
+ if (!id)
+ break;
+ t = btf_type_skip_qualifiers(show->btf, id);
+ }
+ /* We may not be able to represent this type; bail to be safe */
+ if (i == BTF_SHOW_MAX_ITER)
+ return "";
+
+ if (!name)
+ name = btf_name_by_offset(show->btf, t->name_off);
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ prefix = BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT ?
+ "struct" : "union";
+ /* if it's an array of struct/union, parens is already set */
+ if (!(kinds & (BTF_KIND_BIT(BTF_KIND_ARRAY))))
+ parens = "{";
+ break;
+ case BTF_KIND_ENUM:
+ prefix = "enum";
+ break;
+ default:
+ break;
+ }
+
+ /* pointer does not require parens */
+ if (kinds & BTF_KIND_BIT(BTF_KIND_PTR))
+ parens = "";
+ /* typedef does not require struct/union/enum prefix */
+ if (kinds & BTF_KIND_BIT(BTF_KIND_TYPEDEF))
+ prefix = "";
+
+ if (!name)
+ name = "";
+
+ /* Even if we don't want type name info, we want parentheses etc */
+ if (show->flags & BTF_SHOW_NONAME)
+ snprintf(show->state.name, sizeof(show->state.name), "%s",
+ parens);
+ else
+ snprintf(show->state.name, sizeof(show->state.name),
+ "%s%s%s(%s%s%s%s%s%s)%s",
+ /* first 3 strings comprise ".member = " */
+ show_member ? "." : "",
+ show_member ? member : "",
+ show_member ? " = " : "",
+ /* ...next is our prefix (struct, enum, etc) */
+ prefix,
+ strlen(prefix) > 0 && strlen(name) > 0 ? " " : "",
+ /* ...this is the type name itself */
+ name,
+ /* ...suffixed by the appropriate '*', '[]' suffixes */
+ strlen(ptr_suffix) > 0 ? " " : "", ptr_suffix,
+ array_suffix, parens);
+
+ return show->state.name;
+}
+
+static const char *__btf_show_indent(struct btf_show *show)
+{
+ const char *indents = " ";
+ const char *indent = &indents[strlen(indents)];
+
+ if ((indent - show->state.depth) >= indents)
+ return indent - show->state.depth;
+ return indents;
+}
+
+static const char *btf_show_indent(struct btf_show *show)
+{
+ return show->flags & BTF_SHOW_COMPACT ? "" : __btf_show_indent(show);
+}
+
+static const char *btf_show_newline(struct btf_show *show)
+{
+ return show->flags & BTF_SHOW_COMPACT ? "" : "\n";
+}
+
+static const char *btf_show_delim(struct btf_show *show)
+{
+ if (show->state.depth == 0)
+ return "";
+
+ if ((show->flags & BTF_SHOW_COMPACT) && show->state.type &&
+ BTF_INFO_KIND(show->state.type->info) == BTF_KIND_UNION)
+ return "|";
+
+ return ",";
+}
+
+__printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
+{
+ va_list args;
+
+ if (!show->state.depth_check) {
+ va_start(args, fmt);
+ show->showfn(show, fmt, args);
+ va_end(args);
+ }
+}
+
+/* Macros are used here as btf_show_type_value[s]() prepends and appends
+ * format specifiers to the format specifier passed in; these do the work of
+ * adding indentation, delimiters etc while the caller simply has to specify
+ * the type value(s) in the format specifier + value(s).
+ */
+#define btf_show_type_value(show, fmt, value) \
+ do { \
+ if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \
+ show->state.depth == 0) { \
+ btf_show(show, "%s%s" fmt "%s%s", \
+ btf_show_indent(show), \
+ btf_show_name(show), \
+ value, btf_show_delim(show), \
+ btf_show_newline(show)); \
+ if (show->state.depth > show->state.depth_to_show) \
+ show->state.depth_to_show = show->state.depth; \
+ } \
+ } while (0)
+
+#define btf_show_type_values(show, fmt, ...) \
+ do { \
+ btf_show(show, "%s%s" fmt "%s%s", btf_show_indent(show), \
+ btf_show_name(show), \
+ __VA_ARGS__, btf_show_delim(show), \
+ btf_show_newline(show)); \
+ if (show->state.depth > show->state.depth_to_show) \
+ show->state.depth_to_show = show->state.depth; \
+ } while (0)
+
+/* How much is left to copy to safe buffer after @data? */
+static int btf_show_obj_size_left(struct btf_show *show, void *data)
+{
+ return show->obj.head + show->obj.size - data;
+}
+
+/* Is object pointed to by @data of @size already copied to our safe buffer? */
+static bool btf_show_obj_is_safe(struct btf_show *show, void *data, int size)
+{
+ return data >= show->obj.data &&
+ (data + size) < (show->obj.data + BTF_SHOW_OBJ_SAFE_SIZE);
+}
+
+/*
+ * If object pointed to by @data of @size falls within our safe buffer, return
+ * the equivalent pointer to the same safe data. Assumes
+ * copy_from_kernel_nofault() has already happened and our safe buffer is
+ * populated.
+ */
+static void *__btf_show_obj_safe(struct btf_show *show, void *data, int size)
+{
+ if (btf_show_obj_is_safe(show, data, size))
+ return show->obj.safe + (data - show->obj.data);
+ return NULL;
+}
+
+/*
+ * Return a safe-to-access version of data pointed to by @data.
+ * We do this by copying the relevant amount of information
+ * to the struct btf_show obj.safe buffer using copy_from_kernel_nofault().
+ *
+ * If BTF_SHOW_UNSAFE is specified, just return data as-is; no
+ * safe copy is needed.
+ *
+ * Otherwise we need to determine if we have the required amount
+ * of data (determined by the @data pointer and the size of the
+ * largest base type we can encounter (represented by
+ * BTF_SHOW_OBJ_BASE_TYPE_SIZE). Having that much data ensures
+ * that we will be able to print some of the current object,
+ * and if more is needed a copy will be triggered.
+ * Some objects such as structs will not fit into the buffer;
+ * in such cases, additional copies may be needed as we iterate
+ * over their members.
+ *
+ * btf_show_obj_safe() is used to return a safe buffer for
+ * btf_show_start_type(); this ensures that as we recurse into
+ * nested types we always have safe data for the given type.
+ * This approach is somewhat wasteful; it's possible for example
+ * that when iterating over a large union we'll end up copying the
+ * same data repeatedly, but the goal is safety, not performance.
+ * We use stack data as opposed to per-CPU buffers because the
+ * iteration over a type can take some time, and preemption handling
+ * would greatly complicate use of the safe buffer.
+ */
+static void *btf_show_obj_safe(struct btf_show *show,
+ const struct btf_type *t,
+ void *data)
+{
+ const struct btf_type *rt;
+ int size_left, size;
+ void *safe = NULL;
+
+ if (show->flags & BTF_SHOW_UNSAFE)
+ return data;
+
+ rt = btf_resolve_size(show->btf, t, &size);
+ if (IS_ERR(rt)) {
+ show->state.status = PTR_ERR(rt);
+ return NULL;
+ }
+
+ /*
+ * Is this the toplevel object? If so, set the total object size and
+ * initialize pointers. Otherwise check if we still fall within
+ * our safe object data.
+ */
+ if (show->state.depth == 0) {
+ show->obj.size = size;
+ show->obj.head = data;
+ } else {
+ /*
+ * If the size of the current object is > our remaining
+ * safe buffer, we _may_ need to do a new copy. However,
+ * consider the case of a nested struct; its size pushes
+ * us over the safe buffer limit, but showing any individual
+ * struct members does not. In such cases, we don't need
+ * to initiate a fresh copy yet; however, we definitely need
+ * at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes left
+ * in our buffer, regardless of the current object size.
+ * The logic here is that as we resolve types we will
+ * hit a base type at some point, and we need to be sure
+ * the next chunk of data is safely available to display
+ * that type info. We cannot rely on the size of
+ * the current object here because it may be much larger
+ * than our current buffer (e.g. task_struct is 8k).
+ * All we want to do here is ensure that we can print the
+ * next basic type, which we can if either
+ * - the current type size is within the safe buffer; or
+ * - at least BTF_SHOW_OBJ_BASE_TYPE_SIZE bytes are left in
+ * the safe buffer.
+ */
+ safe = __btf_show_obj_safe(show, data,
+ min(size,
+ BTF_SHOW_OBJ_BASE_TYPE_SIZE));
+ }
+
+ /*
+ * We need a new copy to our safe object, either because we haven't
+ * yet copied and are initializing safe data, or because the data
+ * we want falls outside the boundaries of the safe object.
+ */
+ if (!safe) {
+ size_left = btf_show_obj_size_left(show, data);
+ if (size_left > BTF_SHOW_OBJ_SAFE_SIZE)
+ size_left = BTF_SHOW_OBJ_SAFE_SIZE;
+ show->state.status = copy_from_kernel_nofault(show->obj.safe,
+ data, size_left);
+ if (!show->state.status) {
+ show->obj.data = data;
+ safe = show->obj.safe;
+ }
+ }
+
+ return safe;
+}
+
+/*
+ * Set the type we are starting to show and return a safe data pointer
+ * to be used for showing the associated data.
+ */
+static void *btf_show_start_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id, void *data)
+{
+ show->state.type = t;
+ show->state.type_id = type_id;
+ show->state.name[0] = '\0';
+
+ return btf_show_obj_safe(show, t, data);
+}
+
+static void btf_show_end_type(struct btf_show *show)
+{
+ show->state.type = NULL;
+ show->state.type_id = 0;
+ show->state.name[0] = '\0';
+}
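Taken together these helpers define the protocol every show callback follows; a hedged sketch mirroring the btf_int_show() pattern that appears later in this patch (the function name is hypothetical):

/* Sketch: start/show/end for a scalar kind.  btf_show_start_type()
 * records the type being shown and returns a pointer into the safe
 * copy; if that fails, nothing is printed.
 */
static void example_scalar_show(const struct btf *btf, const struct btf_type *t,
				u32 type_id, void *data, struct btf_show *show)
{
	void *safe_data = btf_show_start_type(show, t, type_id, data);

	if (!safe_data)
		return;

	btf_show_type_value(show, "%u", *(u32 *)safe_data);
	btf_show_end_type(show);
}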
+
+static void *btf_show_start_aggr_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id, void *data)
+{
+ void *safe_data = btf_show_start_type(show, t, type_id, data);
+
+ if (!safe_data)
+ return safe_data;
+
+ btf_show(show, "%s%s%s", btf_show_indent(show),
+ btf_show_name(show),
+ btf_show_newline(show));
+ show->state.depth++;
+ return safe_data;
+}
+
+static void btf_show_end_aggr_type(struct btf_show *show,
+ const char *suffix)
+{
+ show->state.depth--;
+ btf_show(show, "%s%s%s%s", btf_show_indent(show), suffix,
+ btf_show_delim(show), btf_show_newline(show));
+ btf_show_end_type(show);
+}
+
+static void btf_show_start_member(struct btf_show *show,
+ const struct btf_member *m)
+{
+ show->state.member = m;
+}
+
+static void btf_show_start_array_member(struct btf_show *show)
+{
+ show->state.array_member = 1;
+ btf_show_start_member(show, NULL);
+}
+
+static void btf_show_end_member(struct btf_show *show)
+{
+ show->state.member = NULL;
+}
+
+static void btf_show_end_array_member(struct btf_show *show)
+{
+ show->state.array_member = 0;
+ btf_show_end_member(show);
+}
+
+static void *btf_show_start_array_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id,
+ u16 array_encoding,
+ void *data)
+{
+ show->state.array_encoding = array_encoding;
+ show->state.array_terminated = 0;
+ return btf_show_start_aggr_type(show, t, type_id, data);
+}
+
+static void btf_show_end_array_type(struct btf_show *show)
+{
+ show->state.array_encoding = 0;
+ show->state.array_terminated = 0;
+ btf_show_end_aggr_type(show, "]");
+}
+
+static void *btf_show_start_struct_type(struct btf_show *show,
+ const struct btf_type *t,
+ u32 type_id,
+ void *data)
+{
+ return btf_show_start_aggr_type(show, t, type_id, data);
+}
+
+static void btf_show_end_struct_type(struct btf_show *show)
+{
+ btf_show_end_aggr_type(show, "}");
+}
+
__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
const char *fmt, ...)
{
@@ -846,17 +1454,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
{
struct btf *btf = env->btf;
- /* < 2 because +1 for btf_void which is always in btf->types[0].
- * btf_void is not accounted in btf->nr_types because btf_void
- * does not come from the BTF file.
- */
- if (btf->types_size - btf->nr_types < 2) {
+ if (btf->types_size == btf->nr_types) {
/* Expand 'types' array */
struct btf_type **new_types;
u32 expand_by, new_size;
- if (btf->types_size == BTF_MAX_TYPE) {
+ if (btf->start_id + btf->types_size == BTF_MAX_TYPE) {
btf_verifier_log(env, "Exceeded max num of types");
return -E2BIG;
}
@@ -870,18 +1474,23 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
if (!new_types)
return -ENOMEM;
- if (btf->nr_types == 0)
- new_types[0] = &btf_void;
- else
+ if (btf->nr_types == 0) {
+ if (!btf->base_btf) {
+ /* lazily init VOID type */
+ new_types[0] = &btf_void;
+ btf->nr_types++;
+ }
+ } else {
memcpy(new_types, btf->types,
- sizeof(*btf->types) * (btf->nr_types + 1));
+ sizeof(*btf->types) * btf->nr_types);
+ }
kvfree(btf->types);
btf->types = new_types;
btf->types_size = new_size;
}
- btf->types[++(btf->nr_types)] = t;
+ btf->types[btf->nr_types++] = t;
return 0;
}
@@ -938,6 +1547,11 @@ static void btf_free_rcu(struct rcu_head *rcu)
btf_free(btf);
}
+void btf_get(struct btf *btf)
+{
+ refcount_inc(&btf->refcnt);
+}
+
void btf_put(struct btf *btf)
{
if (btf && refcount_dec_and_test(&btf->refcnt)) {
@@ -954,18 +1568,17 @@ static int env_resolve_init(struct btf_verifier_env *env)
u32 *resolved_ids = NULL;
u8 *visit_states = NULL;
- /* +1 for btf_void */
- resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes),
+ resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes),
GFP_KERNEL | __GFP_NOWARN);
if (!resolved_sizes)
goto nomem;
- resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids),
+ resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids),
GFP_KERNEL | __GFP_NOWARN);
if (!resolved_ids)
goto nomem;
- visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states),
+ visit_states = kvcalloc(nr_types, sizeof(*visit_states),
GFP_KERNEL | __GFP_NOWARN);
if (!visit_states)
goto nomem;
@@ -1017,21 +1630,27 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
static bool env_type_is_resolved(const struct btf_verifier_env *env,
u32 type_id)
{
- return env->visit_states[type_id] == RESOLVED;
+ /* base BTF types should be resolved by now */
+ if (type_id < env->btf->start_id)
+ return true;
+
+ return env->visit_states[type_id - env->btf->start_id] == RESOLVED;
}
static int env_stack_push(struct btf_verifier_env *env,
const struct btf_type *t, u32 type_id)
{
+ const struct btf *btf = env->btf;
struct resolve_vertex *v;
if (env->top_stack == MAX_RESOLVE_DEPTH)
return -E2BIG;
- if (env->visit_states[type_id] != NOT_VISITED)
+ if (type_id < btf->start_id
+ || env->visit_states[type_id - btf->start_id] != NOT_VISITED)
return -EEXIST;
- env->visit_states[type_id] = VISITED;
+ env->visit_states[type_id - btf->start_id] = VISITED;
v = &env->stack[env->top_stack++];
v->t = t;
@@ -1061,6 +1680,7 @@ static void env_stack_pop_resolved(struct btf_verifier_env *env,
u32 type_id = env->stack[--(env->top_stack)].type_id;
struct btf *btf = env->btf;
+ type_id -= btf->start_id; /* adjust to local type id */
btf->resolved_sizes[type_id] = resolved_size;
btf->resolved_ids[type_id] = resolved_type_id;
env->visit_states[type_id] = RESOLVED;
@@ -1078,23 +1698,27 @@ static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
* *type_size: (x * y * sizeof(u32)). Hence, *type_size always
* corresponds to the return type.
* *elem_type: u32
+ * *elem_id: id of u32
* *total_nelems: (x * y). Hence, individual elem size is
* (*type_size / *total_nelems)
+ * *type_id: id of type if it's changed within the function, 0 if not
*
* type: is not an array (e.g. const struct X)
* return type: type "struct X"
* *type_size: sizeof(struct X)
* *elem_type: same as return type ("struct X")
+ * *elem_id: 0
* *total_nelems: 1
+ * *type_id: id of type if it's changed within the function, 0 if not
*/
-const struct btf_type *
-btf_resolve_size(const struct btf *btf, const struct btf_type *type,
- u32 *type_size, const struct btf_type **elem_type,
- u32 *total_nelems)
+static const struct btf_type *
+__btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size, const struct btf_type **elem_type,
+ u32 *elem_id, u32 *total_nelems, u32 *type_id)
{
const struct btf_type *array_type = NULL;
- const struct btf_array *array;
- u32 i, size, nelems = 1;
+ const struct btf_array *array = NULL;
+ u32 i, size, nelems = 1, id = 0;
for (i = 0; i < MAX_RESOLVE_DEPTH; i++) {
switch (BTF_INFO_KIND(type->info)) {
@@ -1103,6 +1727,7 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_STRUCT:
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
+ case BTF_KIND_FLOAT:
size = type->size;
goto resolved;
@@ -1115,6 +1740,8 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
+ case BTF_KIND_TYPE_TAG:
+ id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@@ -1145,18 +1772,45 @@ resolved:
*total_nelems = nelems;
if (elem_type)
*elem_type = type;
+ if (elem_id)
+ *elem_id = array ? array->type : 0;
+ if (type_id && id)
+ *type_id = id;
return array_type ? : type;
}
+const struct btf_type *
+btf_resolve_size(const struct btf *btf, const struct btf_type *type,
+ u32 *type_size)
+{
+ return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL);
+}
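A hedged worked example of the out-parameters documented above, assuming a BTF graph describing "u32 a[2][3]" (the helper name is hypothetical):

/* Sketch: the walk multiplies the array dimensions (2 * 3 = 6) and
 * ends on the base element u32 (4 bytes), so *type_size comes back
 * as 24 and *total_nelems as 6; the outer array type is returned.
 */
static int example_resolve_array(const struct btf *btf,
				 const struct btf_type *arr_type)
{
	const struct btf_type *elem, *ret;
	u32 size, nelems, elem_id, changed_id = 0;

	ret = __btf_resolve_size(btf, arr_type, &size, &elem, &elem_id,
				 &nelems, &changed_id);
	if (IS_ERR(ret))
		return PTR_ERR(ret);
	/* size == 24, nelems == 6, elem describes u32 */
	return 0;
}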
+
+static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id)
+{
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+
+ return btf->resolved_ids[type_id - btf->start_id];
+}
+
/* The input param "type_id" must point to a needs_resolve type */
static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
u32 *type_id)
{
- *type_id = btf->resolved_ids[*type_id];
+ *type_id = btf_resolved_type_id(btf, *type_id);
return btf_type_by_id(btf, *type_id);
}
+static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id)
+{
+ while (type_id < btf->start_id)
+ btf = btf->base_btf;
+
+ return btf->resolved_sizes[type_id - btf->start_id];
+}
+
const struct btf_type *btf_type_id_size(const struct btf *btf,
u32 *type_id, u32 *ret_size)
{
@@ -1171,7 +1825,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
if (btf_type_has_size(size_type)) {
size = size_type->size;
} else if (btf_type_is_array(size_type)) {
- size = btf->resolved_sizes[size_type_id];
+ size = btf_resolved_type_size(btf, size_type_id);
} else if (btf_type_is_ptr(size_type)) {
size = sizeof(void *);
} else {
@@ -1179,14 +1833,14 @@ const struct btf_type *btf_type_id_size(const struct btf *btf,
!btf_type_is_var(size_type)))
return NULL;
- size_type_id = btf->resolved_ids[size_type_id];
+ size_type_id = btf_resolved_type_id(btf, size_type_id);
size_type = btf_type_by_id(btf, size_type_id);
if (btf_type_nosize_or_null(size_type))
return NULL;
else if (btf_type_has_size(size_type))
size = size_type->size;
else if (btf_type_is_array(size_type))
- size = btf->resolved_sizes[size_type_id];
+ size = btf_resolved_type_size(btf, size_type_id);
else if (btf_type_is_ptr(size_type))
size = sizeof(void *);
else
@@ -1220,7 +1874,7 @@ static int btf_df_check_kflag_member(struct btf_verifier_env *env,
return -EINVAL;
}
-/* Used for ptr, array and struct/union type members.
+/* Used for ptr, array, struct/union and float type members.
* int, enum and modifier types have their specific callback functions.
*/
static int btf_generic_check_kflag_member(struct btf_verifier_env *env,
@@ -1249,11 +1903,11 @@ static int btf_df_resolve(struct btf_verifier_env *env,
return -EINVAL;
}
-static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offsets,
- struct seq_file *m)
+static void btf_df_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct btf_show *show)
{
- seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+ btf_show(show, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
}
static int btf_int_check_member(struct btf_verifier_env *env,
@@ -1426,7 +2080,7 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
-static void btf_int128_print(struct seq_file *m, void *data)
+static void btf_int128_print(struct btf_show *show, void *data)
{
/* data points to a __int128 number.
* Suppose
@@ -1445,9 +2099,10 @@ static void btf_int128_print(struct seq_file *m, void *data)
lower_num = *(u64 *)data;
#endif
if (upper_num == 0)
- seq_printf(m, "0x%llx", lower_num);
+ btf_show_type_value(show, "0x%llx", lower_num);
else
- seq_printf(m, "0x%llx%016llx", upper_num, lower_num);
+ btf_show_type_values(show, "0x%llx%016llx", upper_num,
+ lower_num);
}
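The same split-and-print idea in a standalone userspace sketch (editorial, little-endian branch assumed):

#include <stdio.h>
#include <stdint.h>

/* Print a 128-bit value as two 64-bit halves, as btf_int128_print()
 * does above: suppress the upper half when it is zero, otherwise
 * zero-pad the lower half to 16 hex digits.
 */
static void print_u128_le(const void *data)
{
	uint64_t lower = *(const uint64_t *)data;
	uint64_t upper = *(const uint64_t *)((const char *)data + 8);

	if (!upper)
		printf("0x%llx", (unsigned long long)lower);
	else
		printf("0x%llx%016llx", (unsigned long long)upper,
		       (unsigned long long)lower);
}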
static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
@@ -1491,8 +2146,8 @@ static void btf_int128_shift(u64 *print_num, u16 left_shift_bits,
#endif
}
-static void btf_bitfield_seq_show(void *data, u8 bits_offset,
- u8 nr_bits, struct seq_file *m)
+static void btf_bitfield_show(void *data, u8 bits_offset,
+ u8 nr_bits, struct btf_show *show)
{
u16 left_shift_bits, right_shift_bits;
u8 nr_copy_bytes;
@@ -1512,14 +2167,14 @@ static void btf_bitfield_seq_show(void *data, u8 bits_offset,
right_shift_bits = BITS_PER_U128 - nr_bits;
btf_int128_shift(print_num, left_shift_bits, right_shift_bits);
- btf_int128_print(m, print_num);
+ btf_int128_print(show, print_num);
}
-static void btf_int_bits_seq_show(const struct btf *btf,
- const struct btf_type *t,
- void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_int_bits_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct btf_show *show)
{
u32 int_data = btf_type_int(t);
u8 nr_bits = BTF_INT_BITS(int_data);
@@ -1532,55 +2187,77 @@ static void btf_int_bits_seq_show(const struct btf *btf,
total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
- btf_bitfield_seq_show(data, bits_offset, nr_bits, m);
+ btf_bitfield_show(data, bits_offset, nr_bits, show);
}
-static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_int_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
u32 int_data = btf_type_int(t);
u8 encoding = BTF_INT_ENCODING(int_data);
bool sign = encoding & BTF_INT_SIGNED;
u8 nr_bits = BTF_INT_BITS(int_data);
+ void *safe_data;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
if (bits_offset || BTF_INT_OFFSET(int_data) ||
BITS_PER_BYTE_MASKED(nr_bits)) {
- btf_int_bits_seq_show(btf, t, data, bits_offset, m);
- return;
+ btf_int_bits_show(btf, t, safe_data, bits_offset, show);
+ goto out;
}
switch (nr_bits) {
case 128:
- btf_int128_print(m, data);
+ btf_int128_print(show, safe_data);
break;
case 64:
if (sign)
- seq_printf(m, "%lld", *(s64 *)data);
+ btf_show_type_value(show, "%lld", *(s64 *)safe_data);
else
- seq_printf(m, "%llu", *(u64 *)data);
+ btf_show_type_value(show, "%llu", *(u64 *)safe_data);
break;
case 32:
if (sign)
- seq_printf(m, "%d", *(s32 *)data);
+ btf_show_type_value(show, "%d", *(s32 *)safe_data);
else
- seq_printf(m, "%u", *(u32 *)data);
+ btf_show_type_value(show, "%u", *(u32 *)safe_data);
break;
case 16:
if (sign)
- seq_printf(m, "%d", *(s16 *)data);
+ btf_show_type_value(show, "%d", *(s16 *)safe_data);
else
- seq_printf(m, "%u", *(u16 *)data);
+ btf_show_type_value(show, "%u", *(u16 *)safe_data);
break;
case 8:
+ if (show->state.array_encoding == BTF_INT_CHAR) {
+ /* check for null terminator */
+ if (show->state.array_terminated)
+ break;
+ if (*(char *)data == '\0') {
+ show->state.array_terminated = 1;
+ break;
+ }
+ if (isprint(*(char *)data)) {
+ btf_show_type_value(show, "'%c'",
+ *(char *)safe_data);
+ break;
+ }
+ }
if (sign)
- seq_printf(m, "%d", *(s8 *)data);
+ btf_show_type_value(show, "%d", *(s8 *)safe_data);
else
- seq_printf(m, "%u", *(u8 *)data);
+ btf_show_type_value(show, "%u", *(u8 *)safe_data);
break;
default:
- btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ btf_int_bits_show(btf, t, safe_data, bits_offset, show);
+ break;
}
+out:
+ btf_show_end_type(show);
}
static const struct btf_kind_operations int_ops = {
@@ -1589,7 +2266,7 @@ static const struct btf_kind_operations int_ops = {
.check_member = btf_int_check_member,
.check_kflag_member = btf_int_check_kflag_member,
.log_details = btf_int_log,
- .seq_show = btf_int_seq_show,
+ .show = btf_int_show,
};
static int btf_modifier_check_member(struct btf_verifier_env *env,
@@ -1672,6 +2349,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
+ const char *value;
+
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@@ -1687,7 +2366,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- /* typedef type must have a valid name, and other ref types,
+ /* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@@ -1696,6 +2375,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
+ } else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
@@ -1853,34 +2538,44 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
return 0;
}
-static void btf_modifier_seq_show(const struct btf *btf,
- const struct btf_type *t,
- u32 type_id, void *data,
- u8 bits_offset, struct seq_file *m)
+static void btf_modifier_show(const struct btf *btf,
+ const struct btf_type *t,
+ u32 type_id, void *data,
+ u8 bits_offset, struct btf_show *show)
{
if (btf->resolved_ids)
t = btf_type_id_resolve(btf, &type_id);
else
t = btf_type_skip_modifiers(btf, type_id, NULL);
- btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+ btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
}
-static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_var_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
t = btf_type_id_resolve(btf, &type_id);
- btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+ btf_type_ops(t)->show(btf, t, type_id, data, bits_offset, show);
}
-static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_ptr_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
- /* It is a hashed value */
- seq_printf(m, "%p", *(void **)data);
+ void *safe_data;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ /* It is a hashed value unless BTF_SHOW_PTR_RAW is specified */
+ if (show->flags & BTF_SHOW_PTR_RAW)
+ btf_show_type_value(show, "0x%px", *(void **)safe_data);
+ else
+ btf_show_type_value(show, "0x%p", *(void **)safe_data);
+ btf_show_end_type(show);
}
static void btf_ref_type_log(struct btf_verifier_env *env,
@@ -1895,7 +2590,7 @@ static struct btf_kind_operations modifier_ops = {
.check_member = btf_modifier_check_member,
.check_kflag_member = btf_modifier_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_modifier_seq_show,
+ .show = btf_modifier_show,
};
static struct btf_kind_operations ptr_ops = {
@@ -1904,7 +2599,7 @@ static struct btf_kind_operations ptr_ops = {
.check_member = btf_ptr_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_ptr_seq_show,
+ .show = btf_ptr_show,
};
static s32 btf_fwd_check_meta(struct btf_verifier_env *env,
@@ -1945,7 +2640,7 @@ static struct btf_kind_operations fwd_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_fwd_type_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static int btf_array_check_member(struct btf_verifier_env *env,
@@ -2104,28 +2799,90 @@ static void btf_array_log(struct btf_verifier_env *env,
array->type, array->index_type, array->nelems);
}
-static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void __btf_array_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_array *array = btf_type_array(t);
const struct btf_kind_operations *elem_ops;
const struct btf_type *elem_type;
- u32 i, elem_size, elem_type_id;
+ u32 i, elem_size = 0, elem_type_id;
+ u16 encoding = 0;
elem_type_id = array->type;
- elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ elem_type = btf_type_skip_modifiers(btf, elem_type_id, NULL);
+ if (elem_type && btf_type_has_size(elem_type))
+ elem_size = elem_type->size;
+
+ if (elem_type && btf_type_is_int(elem_type)) {
+ u32 int_type = btf_type_int(elem_type);
+
+ encoding = BTF_INT_ENCODING(int_type);
+
+ /*
+ * BTF_INT_CHAR encoding never seems to be set for
+ * char arrays, so if the element size is 1 and the element
+ * is printable, show it as a char.
+ */
+ if (elem_size == 1)
+ encoding = BTF_INT_CHAR;
+ }
+
+ if (!btf_show_start_array_type(show, t, type_id, encoding, data))
+ return;
+
+ if (!elem_type)
+ goto out;
elem_ops = btf_type_ops(elem_type);
- seq_puts(m, "[");
+
for (i = 0; i < array->nelems; i++) {
- if (i)
- seq_puts(m, ",");
- elem_ops->seq_show(btf, elem_type, elem_type_id, data,
- bits_offset, m);
+ btf_show_start_array_member(show);
+
+ elem_ops->show(btf, elem_type, elem_type_id, data,
+ bits_offset, show);
data += elem_size;
+
+ btf_show_end_array_member(show);
+
+ if (show->state.array_terminated)
+ break;
+ }
+out:
+ btf_show_end_array_type(show);
+}
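A hedged illustration of the char-array heuristic above (struct name is hypothetical): size-1 elements are shown as characters and iteration stops once array_terminated is set at the first NUL, so only the populated prefix is printed.

/* Hypothetical input: "hello" in a 16-byte buffer is shown as
 * 'h','e','l','l','o' rather than as 16 integer slots.
 */
struct example_named {
	char comm[16];
};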
+
+static void btf_array_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *m = show->state.member;
+
+ /*
+ * First check if any members would be shown (are non-zero).
+ * See comments above "struct btf_show" definition for more
+ * details on how this works at a high-level.
+ */
+ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
+ if (!show->state.depth_check) {
+ show->state.depth_check = show->state.depth + 1;
+ show->state.depth_to_show = 0;
+ }
+ __btf_array_show(btf, t, type_id, data, bits_offset, show);
+ show->state.member = m;
+
+ if (show->state.depth_check != show->state.depth + 1)
+ return;
+ show->state.depth_check = 0;
+
+ if (show->state.depth_to_show <= show->state.depth)
+ return;
+ /*
+ * Reaching here indicates we have recursed and found
+ * non-zero array member(s).
+ */
}
- seq_puts(m, "]");
+ __btf_array_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations array_ops = {
@@ -2134,7 +2891,7 @@ static struct btf_kind_operations array_ops = {
.check_member = btf_array_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_array_log,
- .seq_show = btf_array_seq_show,
+ .show = btf_array_show,
};
static int btf_struct_check_member(struct btf_verifier_env *env,
@@ -2213,7 +2970,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- offset = btf_member_bit_offset(t, member);
+ offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
@@ -2320,52 +3077,104 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-/* find 'struct bpf_spin_lock' in map value.
- * return >= 0 offset if found
- * and < 0 in case of error
- */
-int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
{
const struct btf_member *member;
u32 i, off = -ENOENT;
- if (!__btf_type_is_struct(t))
- return -EINVAL;
-
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
if (!__btf_type_is_struct(member_type))
continue;
- if (member_type->size != sizeof(struct bpf_spin_lock))
+ if (member_type->size != sz)
continue;
- if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
- "bpf_spin_lock"))
+ if (strcmp(__btf_name_by_offset(btf, member_type->name_off), name))
continue;
if (off != -ENOENT)
- /* only one 'struct bpf_spin_lock' is allowed */
+ /* only one such field is allowed */
return -E2BIG;
- off = btf_member_bit_offset(t, member);
+ off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
off /= 8;
- if (off % __alignof__(struct bpf_spin_lock))
- /* valid struct bpf_spin_lock will be 4 byte aligned */
+ if (off % align)
return -EINVAL;
}
return off;
}
-static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
+{
+ const struct btf_var_secinfo *vsi;
+ u32 i, off = -ENOENT;
+
+ for_each_vsi(i, t, vsi) {
+ const struct btf_type *var = btf_type_by_id(btf, vsi->type);
+ const struct btf_type *var_type = btf_type_by_id(btf, var->type);
+
+ if (!__btf_type_is_struct(var_type))
+ continue;
+ if (var_type->size != sz)
+ continue;
+ if (vsi->size != sz)
+ continue;
+ if (strcmp(__btf_name_by_offset(btf, var_type->name_off), name))
+ continue;
+ if (off != -ENOENT)
+ /* only one such field is allowed */
+ return -E2BIG;
+ off = vsi->offset;
+ if (off % align)
+ return -EINVAL;
+ }
+ return off;
+}
+
+static int btf_find_field(const struct btf *btf, const struct btf_type *t,
+ const char *name, int sz, int align)
+{
+
+ if (__btf_type_is_struct(t))
+ return btf_find_struct_field(btf, t, name, sz, align);
+ else if (btf_type_is_datasec(t))
+ return btf_find_datasec_var(btf, t, name, sz, align);
+ return -EINVAL;
+}
+
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+ return btf_find_field(btf, t, "bpf_spin_lock",
+ sizeof(struct bpf_spin_lock),
+ __alignof__(struct bpf_spin_lock));
+}
+
+int btf_find_timer(const struct btf *btf, const struct btf_type *t)
+{
+ return btf_find_field(btf, t, "bpf_timer",
+ sizeof(struct bpf_timer),
+ __alignof__(struct bpf_timer));
+}
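A hedged usage sketch (hypothetical caller) showing how the return value of the wrappers above is interpreted:

/* Sketch: validating a map value type.  -ENOENT means no
 * bpf_spin_lock field, -E2BIG means more than one, other negative
 * values indicate malformed BTF; >= 0 is the field's byte offset.
 */
static int example_check_map_value(const struct btf *btf,
				   const struct btf_type *value_type)
{
	int off = btf_find_spin_lock(btf, value_type);

	if (off == -ENOENT)
		return 0;	/* no spin lock in this value type */
	if (off < 0)
		return off;	/* e.g. -E2BIG: more than one lock */

	/* off is the byte offset of the bpf_spin_lock in the value */
	return 0;
}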
+
+static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
- const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
const struct btf_member *member;
+ void *safe_data;
u32 i;
- seq_puts(m, "{");
+ safe_data = btf_show_start_struct_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
for_each_member(i, t, member) {
const struct btf_type *member_type = btf_type_by_id(btf,
member->type);
@@ -2374,23 +3183,65 @@ static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
u32 bytes_offset;
u8 bits8_offset;
- if (i)
- seq_puts(m, seq);
+ btf_show_start_member(show, member);
- member_offset = btf_member_bit_offset(t, member);
- bitfield_size = btf_member_bitfield_size(t, member);
+ member_offset = __btf_member_bit_offset(t, member);
+ bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
- btf_bitfield_seq_show(data + bytes_offset, bits8_offset,
- bitfield_size, m);
+ safe_data = btf_show_start_type(show, member_type,
+ member->type,
+ data + bytes_offset);
+ if (safe_data)
+ btf_bitfield_show(safe_data,
+ bits8_offset,
+ bitfield_size, show);
+ btf_show_end_type(show);
} else {
ops = btf_type_ops(member_type);
- ops->seq_show(btf, member_type, member->type,
- data + bytes_offset, bits8_offset, m);
+ ops->show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, show);
}
+
+ btf_show_end_member(show);
}
- seq_puts(m, "}");
+
+ btf_show_end_struct_type(show);
+}
+
+static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_member *m = show->state.member;
+
+ /*
+ * First check if any members would be shown (are non-zero).
+ * See comments above "struct btf_show" definition for more
+ * details on how this works at a high level.
+ */
+ if (show->state.depth > 0 && !(show->flags & BTF_SHOW_ZERO)) {
+ if (!show->state.depth_check) {
+ show->state.depth_check = show->state.depth + 1;
+ show->state.depth_to_show = 0;
+ }
+ __btf_struct_show(btf, t, type_id, data, bits_offset, show);
+ /* Restore saved member data here */
+ show->state.member = m;
+ if (show->state.depth_check != show->state.depth + 1)
+ return;
+ show->state.depth_check = 0;
+
+ if (show->state.depth_to_show <= show->state.depth)
+ return;
+ /*
+ * Reaching here indicates we have recursed and found
+ * non-zero child values.
+ */
+ }
+
+ __btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations struct_ops = {
@@ -2399,7 +3250,7 @@ static struct btf_kind_operations struct_ops = {
.check_member = btf_struct_check_member,
.check_kflag_member = btf_generic_check_kflag_member,
.log_details = btf_struct_log,
- .seq_show = btf_struct_seq_show,
+ .show = btf_struct_show,
};
static int btf_enum_check_member(struct btf_verifier_env *env,
@@ -2418,7 +3269,7 @@ static int btf_enum_check_member(struct btf_verifier_env *env,
struct_size = struct_type->size;
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
- if (struct_size - bytes_offset < sizeof(int)) {
+ if (struct_size - bytes_offset < member_type->size) {
btf_verifier_log_member(env, struct_type, member,
"Member exceeds struct_size");
return -EINVAL;
@@ -2530,24 +3381,35 @@ static void btf_enum_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
- u32 type_id, void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_enum *enums = btf_type_enum(t);
u32 i, nr_enums = btf_type_vlen(t);
- int v = *(int *)data;
+ void *safe_data;
+ int v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(int *)safe_data;
for (i = 0; i < nr_enums; i++) {
- if (v == enums[i].val) {
- seq_printf(m, "%s",
- __btf_name_by_offset(btf,
- enums[i].name_off));
- return;
- }
+ if (v != enums[i].val)
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
}
- seq_printf(m, "%d", v);
+ btf_show_type_value(show, "%d", v);
+ btf_show_end_type(show);
}
static struct btf_kind_operations enum_ops = {
@@ -2556,7 +3418,7 @@ static struct btf_kind_operations enum_ops = {
.check_member = btf_enum_check_member,
.check_kflag_member = btf_enum_check_kflag_member,
.log_details = btf_enum_log,
- .seq_show = btf_enum_seq_show,
+ .show = btf_enum_show,
};
static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
@@ -2635,7 +3497,7 @@ static struct btf_kind_operations func_proto_ops = {
* BTF_KIND_FUNC_PROTO cannot be directly referred by
* a struct's member.
*
- * It should be a funciton pointer instead.
+ * It should be a function pointer instead.
* (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO)
*
* Hence, there is no btf_func_check_member().
@@ -2643,7 +3505,7 @@ static struct btf_kind_operations func_proto_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_func_proto_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static s32 btf_func_check_meta(struct btf_verifier_env *env,
@@ -2677,7 +3539,7 @@ static struct btf_kind_operations func_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_ref_type_log,
- .seq_show = btf_df_seq_show,
+ .show = btf_df_show,
};
static s32 btf_var_check_meta(struct btf_verifier_env *env,
@@ -2741,7 +3603,7 @@ static const struct btf_kind_operations var_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_var_log,
- .seq_show = btf_var_seq_show,
+ .show = btf_var_show,
};
static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
@@ -2760,11 +3622,6 @@ static s32 btf_datasec_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (!btf_type_vlen(t)) {
- btf_verifier_log_type(env, t, "vlen == 0");
- return -EINVAL;
- }
-
if (!t->size) {
btf_verifier_log_type(env, t, "size == 0");
return -EINVAL;
@@ -2867,24 +3724,28 @@ static void btf_datasec_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
-static void btf_datasec_seq_show(const struct btf *btf,
- const struct btf_type *t, u32 type_id,
- void *data, u8 bits_offset,
- struct seq_file *m)
+static void btf_datasec_show(const struct btf *btf,
+ const struct btf_type *t, u32 type_id,
+ void *data, u8 bits_offset,
+ struct btf_show *show)
{
const struct btf_var_secinfo *vsi;
const struct btf_type *var;
u32 i;
- seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off));
+ if (!btf_show_start_type(show, t, type_id, data))
+ return;
+
+ btf_show_type_value(show, "section (\"%s\") = {",
+ __btf_name_by_offset(btf, t->name_off));
for_each_vsi(i, t, vsi) {
var = btf_type_by_id(btf, vsi->type);
if (i)
- seq_puts(m, ",");
- btf_type_ops(var)->seq_show(btf, var, vsi->type,
- data + vsi->offset, bits_offset, m);
+ btf_show(show, ",");
+ btf_type_ops(var)->show(btf, var, vsi->type,
+ data + vsi->offset, bits_offset, show);
}
- seq_puts(m, "}");
+ btf_show_end_type(show);
}
static const struct btf_kind_operations datasec_ops = {
@@ -2893,7 +3754,186 @@ static const struct btf_kind_operations datasec_ops = {
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_datasec_log,
- .seq_show = btf_datasec_seq_show,
+ .show = btf_datasec_show,
+};
+
+static s32 btf_float_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ if (t->size != 2 && t->size != 4 && t->size != 8 && t->size != 12 &&
+ t->size != 16) {
+ btf_verifier_log_type(env, t, "Invalid type_size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static int btf_float_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u64 start_offset_bytes;
+ u64 end_offset_bytes;
+ u64 misalign_bits;
+ u64 align_bytes;
+ u64 align_bits;
+
+ /* Different architectures have different alignment requirements, so
+ * here we check only for a reasonable minimum. This way we ensure
+ * that types after CO-RE can pass the kernel BTF verifier.
+ */
+ align_bytes = min_t(u64, sizeof(void *), member_type->size);
+ align_bits = align_bytes * BITS_PER_BYTE;
+ div64_u64_rem(member->offset, align_bits, &misalign_bits);
+ if (misalign_bits) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not properly aligned");
+ return -EINVAL;
+ }
+
+ start_offset_bytes = member->offset / BITS_PER_BYTE;
+ end_offset_bytes = start_offset_bytes + member_type->size;
+ if (end_offset_bytes > struct_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
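A hedged worked example of the minimum-alignment rule (userspace sketch, assuming a 64-bit kernel where sizeof(void *) == 8): for a 16-byte long double member, align_bytes = min(8, 16) = 8, so the member's bit offset must be a multiple of 64.

#include <stdbool.h>
#include <stdint.h>

/* Illustration only: the same check in plain integer arithmetic. */
static bool float_member_ok(uint32_t bit_offset, uint32_t member_size,
			    uint32_t struct_size)
{
	uint64_t align_bytes = member_size < 8 ? member_size : 8;
	uint64_t align_bits = align_bytes * 8;

	if (bit_offset % align_bits)
		return false;

	return bit_offset / 8 + member_size <= struct_size;
}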
+
+static void btf_float_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u", t->size);
+}
+
+static const struct btf_kind_operations float_ops = {
+ .check_meta = btf_float_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_float_check_member,
+ .check_kflag_member = btf_generic_check_kflag_member,
+ .log_details = btf_float_log,
+ .show = btf_df_show,
+};
+
+static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_decl_tag *tag;
+ u32 meta_needed = sizeof(*tag);
+ s32 component_idx;
+ const char *value;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ value = btf_name_by_offset(env->btf, t->name_off);
+ if (!value || !value[0]) {
+ btf_verifier_log_type(env, t, "Invalid value");
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (btf_type_kflag(t)) {
+ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
+ return -EINVAL;
+ }
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx < -1) {
+ btf_verifier_log_type(env, t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static int btf_decl_tag_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ s32 component_idx;
+ u32 vlen;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type || !btf_type_is_decl_tag_target(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ component_idx = btf_type_decl_tag(t)->component_idx;
+ if (component_idx != -1) {
+ if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_struct(next_type)) {
+ vlen = btf_type_vlen(next_type);
+ } else {
+ /* next_type should be a function */
+ next_type = btf_type_by_id(btf, next_type->type);
+ vlen = btf_type_vlen(next_type);
+ }
+
+ if ((u32)component_idx >= vlen) {
+ btf_verifier_log_type(env, v->t, "Invalid component_idx");
+ return -EINVAL;
+ }
+ }
+
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
+static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t)
+{
+ btf_verifier_log(env, "type=%u component_idx=%d", t->type,
+ btf_type_decl_tag(t)->component_idx);
+}
+
+static const struct btf_kind_operations decl_tag_ops = {
+ .check_meta = btf_decl_tag_check_meta,
+ .resolve = btf_decl_tag_resolve,
+ .check_member = btf_df_check_member,
+ .check_kflag_member = btf_df_check_kflag_member,
+ .log_details = btf_decl_tag_log,
+ .show = btf_df_show,
};
static int btf_func_proto_check(struct btf_verifier_env *env,
@@ -3029,6 +4069,9 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_FUNC_PROTO] = &func_proto_ops,
[BTF_KIND_VAR] = &var_ops,
[BTF_KIND_DATASEC] = &datasec_ops,
+ [BTF_KIND_FLOAT] = &float_ops,
+ [BTF_KIND_DECL_TAG] = &decl_tag_ops,
+ [BTF_KIND_TYPE_TAG] = &modifier_ops,
};
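Similarly hedged, the source construct behind BTF_KIND_TYPE_TAG registered above (the __user_tag macro is hypothetical; clang's btf_type_tag attribute is assumed). Because a type tag merely wraps another type, it reuses the modifier ops:

#define __user_tag __attribute__((btf_type_tag("user")))

struct example_req {
	int __user_tag *uptr;	/* BTF chain: PTR -> TYPE_TAG("user") -> int */
};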
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -3083,7 +4126,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
cur = btf->nohdr_data + hdr->type_off;
end = cur + hdr->type_len;
- env->log_type_id = 1;
+ env->log_type_id = btf->base_btf ? btf->start_id : 1;
while (cur < end) {
struct btf_type *t = cur;
s32 meta_size;
@@ -3110,8 +4153,12 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return false;
if (btf_type_is_struct(t) || btf_type_is_datasec(t))
- return !btf->resolved_ids[type_id] &&
- !btf->resolved_sizes[type_id];
+ return !btf_resolved_type_id(btf, type_id) &&
+ !btf_resolved_type_size(btf, type_id);
+
+ if (btf_type_is_decl_tag(t))
+ return btf_resolved_type_id(btf, type_id) &&
+ !btf_resolved_type_size(btf, type_id);
if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
btf_type_is_var(t)) {
@@ -3131,7 +4178,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
return elem_type && !btf_type_is_modifier(elem_type) &&
(array->nelems * elem_size ==
- btf->resolved_sizes[type_id]);
+ btf_resolved_type_size(btf, type_id));
}
return false;
@@ -3173,7 +4220,8 @@ static int btf_resolve(struct btf_verifier_env *env,
static int btf_check_all_types(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
- u32 type_id;
+ const struct btf_type *t;
+ u32 type_id, i;
int err;
err = env_resolve_init(env);
@@ -3181,8 +4229,9 @@ static int btf_check_all_types(struct btf_verifier_env *env)
return err;
env->phase++;
- for (type_id = 1; type_id <= btf->nr_types; type_id++) {
- const struct btf_type *t = btf_type_by_id(btf, type_id);
+ for (i = btf->base_btf ? 0 : 1; i < btf->nr_types; i++) {
+ type_id = btf->start_id + i;
+ t = btf_type_by_id(btf, type_id);
env->log_type_id = type_id;
if (btf_type_needs_resolve(t) &&
@@ -3219,7 +4268,7 @@ static int btf_parse_type_sec(struct btf_verifier_env *env)
return -EINVAL;
}
- if (!hdr->type_len) {
+ if (!env->btf->base_btf && !hdr->type_len) {
btf_verifier_log(env, "No type found");
return -EINVAL;
}
@@ -3246,13 +4295,18 @@ static int btf_parse_str_sec(struct btf_verifier_env *env)
return -EINVAL;
}
- if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
- start[0] || end[-1]) {
+ btf->strings = start;
+
+ if (btf->base_btf && !hdr->str_len)
+ return 0;
+ if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) {
+ btf_verifier_log(env, "Invalid string section");
+ return -EINVAL;
+ }
+ if (!btf->base_btf && start[0]) {
btf_verifier_log(env, "Invalid string section");
return -EINVAL;
}
-
- btf->strings = start;
return 0;
}
@@ -3381,7 +4435,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return -ENOTSUPP;
}
- if (btf_data_size == hdr->hdr_len) {
+ if (!btf->base_btf && btf_data_size == hdr->hdr_len) {
btf_verifier_log(env, "No data");
return -EINVAL;
}
@@ -3393,7 +4447,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env)
return 0;
}
-static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+static struct btf *btf_parse(bpfptr_t btf_data, u32 btf_data_size,
u32 log_level, char __user *log_ubuf, u32 log_size)
{
struct btf_verifier_env *env = NULL;
@@ -3419,8 +4473,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
log->len_total = log_size;
/* log attributes have to be sane */
- if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
- !log->level || !log->ubuf) {
+ if (!bpf_verifier_log_attr_valid(log)) {
err = -EINVAL;
goto errout;
}
@@ -3442,7 +4495,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
btf->data = data;
btf->data_size = btf_data_size;
- if (copy_from_user(data, btf_data, btf_data_size)) {
+ if (copy_from_bpfptr(data, btf_data, btf_data_size)) {
err = -EFAULT;
goto errout;
}
@@ -3477,11 +4530,12 @@ errout:
return ERR_PTR(err);
}
-extern char __weak _binary__btf_vmlinux_bin_start[];
-extern char __weak _binary__btf_vmlinux_bin_end[];
+extern char __weak __start_BTF[];
+extern char __weak __stop_BTF[];
extern struct btf *btf_vmlinux;
#define BPF_MAP_TYPE(_id, _ops)
+#define BPF_LINK_TYPE(_id, _name)
static union {
struct bpf_ctx_convert {
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
@@ -3508,9 +4562,10 @@ static u8 bpf_ctx_convert_map[] = {
0, /* avoid empty array */
};
#undef BPF_MAP_TYPE
+#undef BPF_LINK_TYPE
static const struct btf_member *
-btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
+btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf,
const struct btf_type *t, enum bpf_prog_type prog_type,
int arg)
{
@@ -3533,8 +4588,6 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
* is not supported yet.
* BPF_PROG_TYPE_RAW_TRACEPOINT is fine.
*/
- if (log->level & BPF_LOG_LEVEL)
- bpf_log(log, "arg#%d type is not a struct\n", arg);
return NULL;
}
tname = btf_name_by_offset(btf, t->name_off);
@@ -3569,6 +4622,41 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, struct btf *btf,
return ctx_type;
}
+static const struct bpf_map_ops * const btf_vmlinux_map_ops[] = {
+#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type)
+#define BPF_LINK_TYPE(_id, _name)
+#define BPF_MAP_TYPE(_id, _ops) \
+ [_id] = &_ops,
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_LINK_TYPE
+#undef BPF_MAP_TYPE
+};
+
+static int btf_vmlinux_map_ids_init(const struct btf *btf,
+ struct bpf_verifier_log *log)
+{
+ const struct bpf_map_ops *ops;
+ int i, btf_id;
+
+ for (i = 0; i < ARRAY_SIZE(btf_vmlinux_map_ops); ++i) {
+ ops = btf_vmlinux_map_ops[i];
+ if (!ops || (!ops->map_btf_name && !ops->map_btf_id))
+ continue;
+ if (!ops->map_btf_name || !ops->map_btf_id) {
+ bpf_log(log, "map type %d is misconfigured\n", i);
+ return -EINVAL;
+ }
+ btf_id = btf_find_by_name_kind(btf, ops->map_btf_name,
+ BTF_KIND_STRUCT);
+ if (btf_id < 0)
+ return btf_id;
+ *ops->map_btf_id = btf_id;
+ }
+
+ return 0;
+}
+
static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
struct btf *btf,
const struct btf_type *t,
@@ -3584,12 +4672,15 @@ static int btf_translate_to_vmlinux(struct bpf_verifier_log *log,
return kern_ctx_type->type;
}
+BTF_ID_LIST(bpf_ctx_convert_btf_id)
+BTF_ID(struct, bpf_ctx_convert)
+
struct btf *btf_parse_vmlinux(void)
{
struct btf_verifier_env *env = NULL;
struct bpf_verifier_log *log;
struct btf *btf = NULL;
- int err, i;
+ int err;
env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
if (!env)
@@ -3605,9 +4696,10 @@ struct btf *btf_parse_vmlinux(void)
}
env->btf = btf;
- btf->data = _binary__btf_vmlinux_bin_start;
- btf->data_size = _binary__btf_vmlinux_bin_end -
- _binary__btf_vmlinux_bin_start;
+ btf->data = __start_BTF;
+ btf->data_size = __stop_BTF - __start_BTF;
+ btf->kernel_btf = true;
+ snprintf(btf->name, sizeof(btf->name), "vmlinux");
err = btf_parse_hdr(env);
if (err)
@@ -3623,27 +4715,90 @@ struct btf *btf_parse_vmlinux(void)
if (err)
goto errout;
- /* find struct bpf_ctx_convert for type checking later */
- for (i = 1; i <= btf->nr_types; i++) {
- const struct btf_type *t;
- const char *tname;
+ /* btf_parse_vmlinux() runs under bpf_verifier_lock */
+ bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
- t = btf_type_by_id(btf, i);
- if (!__btf_type_is_struct(t))
- continue;
- tname = __btf_name_by_offset(btf, t->name_off);
- if (!strcmp(tname, "bpf_ctx_convert")) {
- /* btf_parse_vmlinux() runs under bpf_verifier_lock */
- bpf_ctx_convert.t = t;
- break;
- }
+ /* find bpf map structs for map_ptr access checking */
+ err = btf_vmlinux_map_ids_init(btf, log);
+ if (err < 0)
+ goto errout;
+
+ bpf_struct_ops_init(btf, log);
+
+ refcount_set(&btf->refcnt, 1);
+
+ err = btf_alloc_id(btf);
+ if (err)
+ goto errout;
+
+ btf_verifier_env_free(env);
+ return btf;
+
+errout:
+ btf_verifier_env_free(env);
+ if (btf) {
+ kvfree(btf->types);
+ kfree(btf);
}
- if (i > btf->nr_types) {
- err = -ENOENT;
+ return ERR_PTR(err);
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+
+static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf = NULL, *base_btf;
+ int err;
+
+ base_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(base_btf))
+ return base_btf;
+ if (!base_btf)
+ return ERR_PTR(-EINVAL);
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ log->level = BPF_LOG_KERNEL;
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
goto errout;
}
+ env->btf = btf;
- bpf_struct_ops_init(btf, log);
+ btf->base_btf = base_btf;
+ btf->start_id = base_btf->nr_types;
+ btf->start_str_off = base_btf->hdr.str_len;
+ btf->kernel_btf = true;
+ snprintf(btf->name, sizeof(btf->name), "%s", module_name);
+
+ btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!btf->data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+ memcpy(btf->data, data, data_size);
+ btf->data_size = data_size;
+
+ err = btf_parse_hdr(env);
+ if (err)
+ goto errout;
+
+ btf->nohdr_data = btf->data + btf->hdr.hdr_len;
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_check_all_metas(env);
+ if (err)
+ goto errout;
btf_verifier_env_free(env);
refcount_set(&btf->refcnt, 1);
@@ -3652,24 +4807,26 @@ struct btf *btf_parse_vmlinux(void)
errout:
btf_verifier_env_free(env);
if (btf) {
+ kvfree(btf->data);
kvfree(btf->types);
kfree(btf);
}
return ERR_PTR(err);
}
+#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
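A hedged sketch of how split (module) BTF dereferences a type ID once base_btf and start_id are set as above: IDs below start_id belong to the base vmlinux BTF, the rest index the module's own types array, mirroring helpers such as btf_resolved_type_id() earlier in this file.

/* Illustration only (hypothetical helper name). */
static const struct btf_type *example_split_type_by_id(const struct btf *btf,
						       u32 type_id)
{
	while (type_id < btf->start_id)
		btf = btf->base_btf;

	return btf->types[type_id - btf->start_id];
}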
+
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
{
- struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
- if (tgt_prog) {
+ if (tgt_prog)
return tgt_prog->aux->btf;
- } else {
- return btf_vmlinux;
- }
+ else
+ return prog->aux->attach_btf;
}
-static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
+static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* t comes in already as a pointer */
t = btf_type_by_id(btf, t->type);
@@ -3678,8 +4835,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
t = btf_type_by_id(btf, t->type);
- /* char, signed char, unsigned char */
- return btf_type_is_int(t) && t->size == 1;
+ return btf_type_is_int(t);
}
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
@@ -3687,13 +4843,13 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const struct btf_type *t = prog->aux->attach_func_proto;
- struct bpf_prog *tgt_prog = prog->aux->linked_prog;
+ struct bpf_prog *tgt_prog = prog->aux->dst_prog;
struct btf *btf = bpf_prog_get_target_btf(prog);
const char *tname = prog->aux->attach_func_name;
struct bpf_verifier_log *log = info->log;
const struct btf_param *args;
u32 nr_args, arg;
- int ret;
+ int i, ret;
if (off % 8) {
bpf_log(log, "func '%s' offset %d is not multiple of 8\n",
@@ -3702,35 +4858,74 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
arg = off / 8;
args = (const struct btf_param *)(t + 1);
- /* if (t == NULL) Fall back to default BPF prog with 5 u64 arguments */
- nr_args = t ? btf_type_vlen(t) : 5;
+ /* if (t == NULL) fall back to the default BPF prog with
+ * MAX_BPF_FUNC_REG_ARGS u64 arguments.
+ */
+ nr_args = t ? btf_type_vlen(t) : MAX_BPF_FUNC_REG_ARGS;
if (prog->aux->attach_btf_trace) {
/* skip first 'void *__data' argument in btf_trace_##name typedef */
args++;
nr_args--;
}
- if (prog->expected_attach_type == BPF_TRACE_FEXIT &&
- arg == nr_args) {
- if (!t)
- /* Default prog with 5 args. 6th arg is retval. */
- return true;
- /* function return type */
- t = btf_type_by_id(btf, t->type);
- } else if (arg >= nr_args) {
+ if (arg > nr_args) {
bpf_log(log, "func '%s' doesn't have %d-th argument\n",
tname, arg + 1);
return false;
+ }
+
+ if (arg == nr_args) {
+ switch (prog->expected_attach_type) {
+ case BPF_LSM_MAC:
+ case BPF_TRACE_FEXIT:
+ /* When LSM programs are attached to void LSM hooks
+ * they use FEXIT trampolines and when attached to
+ * int LSM hooks, they use MODIFY_RETURN trampolines.
+ *
+ * While the LSM programs are BPF_MODIFY_RETURN-like
+ * the check:
+ *
+ * if (ret_type != 'int')
+ * return -EINVAL;
+ *
+ * is _not_ done here. This is still safe as LSM hooks
+ * have only void and int return types.
+ */
+ if (!t)
+ return true;
+ t = btf_type_by_id(btf, t->type);
+ break;
+ case BPF_MODIFY_RETURN:
+ /* For now the BPF_MODIFY_RETURN can only be attached to
+ * functions that return an int.
+ */
+ if (!t)
+ return false;
+
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+ if (!btf_type_is_small_int(t)) {
+ bpf_log(log,
+ "ret type %s not allowed for fmod_ret\n",
+ btf_kind_str[BTF_INFO_KIND(t->info)]);
+ return false;
+ }
+ break;
+ default:
+ bpf_log(log, "func '%s' doesn't have %d-th argument\n",
+ tname, arg + 1);
+ return false;
+ }
} else {
if (!t)
- /* Default prog with 5 args */
+ /* Default prog with MAX_BPF_FUNC_REG_ARGS args */
return true;
t = btf_type_by_id(btf, args[arg].type);
}
+
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_small_int(t) || btf_type_is_enum(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -3741,6 +4936,21 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
btf_kind_str[BTF_INFO_KIND(t->info)]);
return false;
}
+
+ /* check for a PTR_TO_BUF | PTR_MAYBE_NULL ctx argument (rdonly or rdwr buffer) */
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+ u32 type, flag;
+
+ type = base_type(ctx_arg_info->reg_type);
+ flag = type_flag(ctx_arg_info->reg_type);
+ if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
+ (flag & PTR_MAYBE_NULL)) {
+ info->reg_type = ctx_arg_info->reg_type;
+ return true;
+ }
+ }
+
if (t->type == 0)
/* This is a pointer to void.
* It is the same as scalar from the verifier safety pov.
@@ -3748,15 +4958,38 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
- if (is_string_ptr(btf, t))
+ if (is_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
- info->reg_type = PTR_TO_BTF_ID;
+ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+ const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
+ if (ctx_arg_info->offset == off) {
+ if (!ctx_arg_info->btf_id) {
+ bpf_log(log,"invalid btf_id for context argument offset %u\n", off);
+ return false;
+ }
+
+ info->reg_type = ctx_arg_info->reg_type;
+ info->btf = btf_vmlinux;
+ info->btf_id = ctx_arg_info->btf_id;
+ return true;
+ }
+ }
+
+ info->reg_type = PTR_TO_BTF_ID;
if (tgt_prog) {
- ret = btf_translate_to_vmlinux(log, btf, t, tgt_prog->type, arg);
+ enum bpf_prog_type tgt_type;
+
+ if (tgt_prog->type == BPF_PROG_TYPE_EXT)
+ tgt_type = tgt_prog->aux->saved_dst_prog_type;
+ else
+ tgt_type = tgt_prog->type;
+
+ ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg);
if (ret > 0) {
+ info->btf = btf_vmlinux;
info->btf_id = ret;
return true;
} else {
@@ -3764,6 +4997,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
}
}
+ info->btf = btf;
info->btf_id = t->type;
t = btf_type_by_id(btf, t->type);
/* skip modifiers */
@@ -3783,24 +5017,66 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
return true;
}
-int btf_struct_access(struct bpf_verifier_log *log,
- const struct btf_type *t, int off, int size,
- enum bpf_access_type atype,
- u32 *next_btf_id)
+enum bpf_struct_walk_result {
+ /* < 0 error */
+ WALK_SCALAR = 0,
+ WALK_PTR,
+ WALK_STRUCT,
+};
+
+static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int off, int size,
+ u32 *next_btf_id)
{
u32 i, moff, mtrue_end, msize = 0, total_nelems = 0;
const struct btf_type *mtype, *elem_type = NULL;
const struct btf_member *member;
const char *tname, *mname;
+ u32 vlen, elem_id, mid;
again:
- tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
+ tname = __btf_name_by_offset(btf, t->name_off);
if (!btf_type_is_struct(t)) {
bpf_log(log, "Type '%s' is not a struct\n", tname);
return -EINVAL;
}
+ vlen = btf_type_vlen(t);
if (off + size > t->size) {
+ /* If the last element is a variable-size array, we may
+ * need to relax the rule.
+ */
+ struct btf_array *array_elem;
+
+ if (vlen == 0)
+ goto error;
+
+ member = btf_type_member(t) + vlen - 1;
+ mtype = btf_type_skip_modifiers(btf, member->type,
+ NULL);
+ if (!btf_type_is_array(mtype))
+ goto error;
+
+ array_elem = (struct btf_array *)(mtype + 1);
+ if (array_elem->nelems != 0)
+ goto error;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ if (off < moff)
+ goto error;
+
+ /* Only allow structures for now; this can be relaxed
+ * for other types later.
+ */
+ t = btf_type_skip_modifiers(btf, array_elem->type,
+ NULL);
+ if (!btf_type_is_struct(t))
+ goto error;
+
+ off = (off - moff) % t->size;
+ goto again;
+
+error:
bpf_log(log, "access beyond struct %s at off %u size %u\n",
tname, off, size);
return -EACCES;
@@ -3808,14 +5084,14 @@ again:
for_each_member(i, t, member) {
/* offset of the field in bytes */
- moff = btf_member_bit_offset(t, member) / 8;
+ moff = __btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
- if (btf_member_bitfield_size(t, member)) {
- u32 end_bit = btf_member_bit_offset(t, member) +
- btf_member_bitfield_size(t, member);
+ if (__btf_member_bitfield_size(t, member)) {
+ u32 end_bit = __btf_member_bit_offset(t, member) +
+ __btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
@@ -3827,7 +5103,7 @@ again:
*/
if (off <= moff &&
BITS_ROUNDUP_BYTES(end_bit) <= off + size)
- return SCALAR_VALUE;
+ return WALK_SCALAR;
/* off may be accessing a following member
*
@@ -3849,11 +5125,13 @@ again:
break;
/* type of the field */
- mtype = btf_type_by_id(btf_vmlinux, member->type);
- mname = __btf_name_by_offset(btf_vmlinux, member->name_off);
+ mid = member->type;
+ mtype = btf_type_by_id(btf, member->type);
+ mname = __btf_name_by_offset(btf, member->name_off);
- mtype = btf_resolve_size(btf_vmlinux, mtype, &msize,
- &elem_type, &total_nelems);
+ mtype = __btf_resolve_size(btf, mtype, &msize,
+ &elem_type, &elem_id, &total_nelems,
+ &mid);
if (IS_ERR(mtype)) {
bpf_log(log, "field %s doesn't have size\n", mname);
return -EFAULT;
@@ -3867,7 +5145,7 @@ again:
if (btf_type_is_array(mtype)) {
u32 elem_idx;
- /* btf_resolve_size() above helps to
+ /* __btf_resolve_size() above helps to
* linearize a multi-dimensional array.
*
* The logic here is treating an array
@@ -3915,6 +5193,7 @@ again:
elem_idx = (off - moff) / msize;
moff += elem_idx * msize;
mtype = elem_type;
+ mid = elem_id;
}
/* the 'off' we're looking for is either equal to start
@@ -3924,6 +5203,12 @@ again:
/* our field must be inside that union or struct */
t = mtype;
+ /* return if the offset matches the member offset */
+ if (off == moff) {
+ *next_btf_id = mid;
+ return WALK_STRUCT;
+ }
+
/* adjust offset we're looking for */
off -= moff;
goto again;
@@ -3939,11 +5224,10 @@ again:
mname, moff, tname, off, size);
return -EACCES;
}
-
- stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id);
+ stype = btf_type_skip_modifiers(btf, mtype->type, &id);
if (btf_type_is_struct(stype)) {
*next_btf_id = id;
- return PTR_TO_BTF_ID;
+ return WALK_PTR;
}
}
@@ -3960,102 +5244,98 @@ again:
return -EACCES;
}
- return SCALAR_VALUE;
+ return WALK_SCALAR;
}
bpf_log(log, "struct %s doesn't have field at offset %d\n", tname, off);
return -EINVAL;
}
-static int __btf_resolve_helper_id(struct bpf_verifier_log *log, void *fn,
- int arg)
+int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf,
+ const struct btf_type *t, int off, int size,
+ enum bpf_access_type atype __maybe_unused,
+ u32 *next_btf_id)
{
- char fnname[KSYM_SYMBOL_LEN + 4] = "btf_";
- const struct btf_param *args;
- const struct btf_type *t;
- const char *tname, *sym;
- u32 btf_id, i;
-
- if (IS_ERR(btf_vmlinux)) {
- bpf_log(log, "btf_vmlinux is malformed\n");
- return -EINVAL;
- }
+ int err;
+ u32 id;
- sym = kallsyms_lookup((long)fn, NULL, NULL, NULL, fnname + 4);
- if (!sym) {
- bpf_log(log, "kernel doesn't have kallsyms\n");
- return -EFAULT;
- }
+ do {
+ err = btf_struct_walk(log, btf, t, off, size, &id);
- for (i = 1; i <= btf_vmlinux->nr_types; i++) {
- t = btf_type_by_id(btf_vmlinux, i);
- if (BTF_INFO_KIND(t->info) != BTF_KIND_TYPEDEF)
- continue;
- tname = __btf_name_by_offset(btf_vmlinux, t->name_off);
- if (!strcmp(tname, fnname))
+ switch (err) {
+ case WALK_PTR:
+ /* If we found the pointer or scalar on t+off,
+ * we're done.
+ */
+ *next_btf_id = id;
+ return PTR_TO_BTF_ID;
+ case WALK_SCALAR:
+ return SCALAR_VALUE;
+ case WALK_STRUCT:
+ /* We found a nested struct, so continue the search
+ * by diving into it. At this point the offset is
+ * aligned with the new type, so set it to 0.
+ */
+ t = btf_type_by_id(btf, id);
+ off = 0;
break;
- }
- if (i > btf_vmlinux->nr_types) {
- bpf_log(log, "helper %s type is not found\n", fnname);
- return -ENOENT;
- }
-
- t = btf_type_by_id(btf_vmlinux, t->type);
- if (!btf_type_is_ptr(t))
- return -EFAULT;
- t = btf_type_by_id(btf_vmlinux, t->type);
- if (!btf_type_is_func_proto(t))
- return -EFAULT;
+ default:
+ /* It's either an error or an unknown return value,
+ * so scream and leave.
+ */
+ if (WARN_ONCE(err > 0, "unknown btf_struct_walk return value"))
+ return -EINVAL;
+ return err;
+ }
+ } while (t);
- args = (const struct btf_param *)(t + 1);
- if (arg >= btf_type_vlen(t)) {
- bpf_log(log, "bpf helper %s doesn't have %d-th argument\n",
- fnname, arg);
- return -EINVAL;
- }
+ return -EINVAL;
+}
- t = btf_type_by_id(btf_vmlinux, args[arg].type);
- if (!btf_type_is_ptr(t) || !t->type) {
- /* anything but the pointer to struct is a helper config bug */
- bpf_log(log, "ARG_PTR_TO_BTF is misconfigured\n");
- return -EFAULT;
- }
- btf_id = t->type;
- t = btf_type_by_id(btf_vmlinux, t->type);
- /* skip modifiers */
- while (btf_type_is_modifier(t)) {
- btf_id = t->type;
- t = btf_type_by_id(btf_vmlinux, t->type);
- }
- if (!btf_type_is_struct(t)) {
- bpf_log(log, "ARG_PTR_TO_BTF is not a struct\n");
- return -EFAULT;
- }
- bpf_log(log, "helper %s arg%d has btf_id %d struct %s\n", fnname + 4,
- arg, btf_id, __btf_name_by_offset(btf_vmlinux, t->name_off));
- return btf_id;
+/* Check that two BTF types, each specified as a BTF object + id, are exactly
+ * the same. Trivial ID check is not enough due to module BTFs, because we can
+ * end up with two different module BTFs, but IDs point to the common type in
+ * vmlinux BTF.
+ */
+static bool btf_types_are_same(const struct btf *btf1, u32 id1,
+ const struct btf *btf2, u32 id2)
+{
+ if (id1 != id2)
+ return false;
+ if (btf1 == btf2)
+ return true;
+ return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2);
}
-int btf_resolve_helper_id(struct bpf_verifier_log *log,
- const struct bpf_func_proto *fn, int arg)
+bool btf_struct_ids_match(struct bpf_verifier_log *log,
+ const struct btf *btf, u32 id, int off,
+ const struct btf *need_btf, u32 need_type_id)
{
- int *btf_id = &fn->btf_id[arg];
- int ret;
+ const struct btf_type *type;
+ int err;
- if (fn->arg_type[arg] != ARG_PTR_TO_BTF_ID)
- return -EINVAL;
+ /* Are we already done? */
+ if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id))
+ return true;
- ret = READ_ONCE(*btf_id);
- if (ret)
- return ret;
- /* ok to race the search. The result is the same */
- ret = __btf_resolve_helper_id(log, fn->func, arg);
- if (!ret) {
- /* Function argument cannot be type 'void' */
- bpf_log(log, "BTF resolution bug\n");
- return -EFAULT;
+again:
+ type = btf_type_by_id(btf, id);
+ if (!type)
+ return false;
+ err = btf_struct_walk(log, btf, type, off, 1, &id);
+ if (err != WALK_STRUCT)
+ return false;
+
+ /* We found a nested struct object. If it matches
+ * the requested ID, we're done. Otherwise, continue
+ * the search with offset 0 in the new type.
+ */
+ if (!btf_types_are_same(btf, id, need_btf, need_type_id)) {
+ off = 0;
+ goto again;
}
- WRITE_ONCE(*btf_id, ret);
- return ret;
+
+ return true;
}
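
A minimal userspace sketch (not part of this patch) of the idea behind btf_struct_ids_match(): a pointer into an outer struct can satisfy a check against an embedded struct's type when the embedded member starts at the accessed offset. The types below are hypothetical; only the offset reasoning mirrors the walk above.

#include <assert.h>
#include <stddef.h>

/* 'outer' embeds 'inner' at offset 0, so a pointer to outer at off == 0 also
 * points at an inner object -- exactly the case btf_struct_ids_match()
 * accepts after one WALK_STRUCT step.
 */
struct inner { int a; int b; };
struct outer { struct inner hdr; long tail; };

int main(void)
{
	struct outer o = { .hdr = { 1, 2 }, .tail = 3 };
	struct inner *i = (struct inner *)&o;	/* off == 0 into 'outer' */

	assert(offsetof(struct outer, hdr) == 0);
	assert(i->a == 1 && i->b == 2);
	return 0;
}
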
static int __get_type_size(struct btf *btf, u32 btf_id,
@@ -4070,7 +5350,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
while (t && btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (!t) {
- *bad_type = btf->types[0];
+ *bad_type = btf_type_by_id(btf, 0);
return -EINVAL;
}
if (btf_type_is_ptr(t))
@@ -4095,12 +5375,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
if (!func) {
/* BTF function prototype doesn't match the verifier types.
- * Fall back to 5 u64 args.
+ * Fall back to MAX_BPF_FUNC_REG_ARGS u64 args.
*/
- for (i = 0; i < 5; i++)
+ for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++)
m->arg_size[i] = 8;
m->ret_size = 8;
- m->nr_args = 5;
+ m->nr_args = MAX_BPF_FUNC_REG_ARGS;
return 0;
}
args = (const struct btf_param *)(func + 1);
@@ -4121,6 +5401,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
m->ret_size = ret;
for (i = 0; i < nargs; i++) {
+ if (i == nargs - 1 && args[i].type == 0) {
+ bpf_log(log,
+ "The function %s with variable args is unsupported.\n",
+ tname);
+ return -EINVAL;
+ }
ret = __get_type_size(btf, args[i].type, &t);
if (ret < 0) {
bpf_log(log,
@@ -4128,6 +5414,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]);
return -EINVAL;
}
+ if (ret == 0) {
+ bpf_log(log,
+ "The function %s has malformed void argument.\n",
+ tname);
+ return -EINVAL;
+ }
m->arg_size[i] = ret;
}
m->nr_args = nargs;
@@ -4253,7 +5545,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
}
/* Compare BTFs of given program with BTF of target program */
-int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
+int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog,
struct btf *btf2, const struct btf_type *t2)
{
struct btf *btf1 = prog->aux->btf;
@@ -4261,7 +5553,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
u32 btf_id = 0;
if (!prog->aux->func_info) {
- bpf_log(&env->log, "Program extension requires BTF\n");
+ bpf_log(log, "Program extension requires BTF\n");
return -EINVAL;
}
@@ -4273,104 +5565,245 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog,
if (!t1 || !btf_type_is_func(t1))
return -EFAULT;
- return btf_check_func_type_match(&env->log, btf1, t1, btf2, t2);
+ return btf_check_func_type_match(log, btf1, t1, btf2, t2);
}
-/* Compare BTF of a function with given bpf_reg_state.
- * Returns:
- * EFAULT - there is a verifier bug. Abort verification.
- * EINVAL - there is a type mismatch or BTF is not available.
- * 0 - BTF matches with what bpf_reg_state expects.
- * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
- */
-int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *reg)
+static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
+#ifdef CONFIG_NET
+ [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK],
+ [PTR_TO_SOCK_COMMON] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ [PTR_TO_TCP_SOCK] = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+#endif
+};
+
+/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
+static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
+ const struct btf *btf,
+ const struct btf_type *t, int rec)
{
- struct bpf_verifier_log *log = &env->log;
- struct bpf_prog *prog = env->prog;
- struct btf *btf = prog->aux->btf;
- const struct btf_param *args;
- const struct btf_type *t;
- u32 i, nargs, btf_id;
- const char *tname;
+ const struct btf_type *member_type;
+ const struct btf_member *member;
+ u32 i;
- if (!prog->aux->func_info)
- return -EINVAL;
+ if (!btf_type_is_struct(t))
+ return false;
- btf_id = prog->aux->func_info[subprog].type_id;
- if (!btf_id)
- return -EFAULT;
+ for_each_member(i, t, member) {
+ const struct btf_array *array;
- if (prog->aux->func_info_aux[subprog].unreliable)
- return -EINVAL;
+ member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+ if (btf_type_is_struct(member_type)) {
+ if (rec >= 3) {
+ bpf_log(log, "max struct nesting depth exceeded\n");
+ return false;
+ }
+ if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
+ return false;
+ continue;
+ }
+ if (btf_type_is_array(member_type)) {
+ array = btf_type_array(member_type);
+ if (!array->nelems)
+ return false;
+ member_type = btf_type_skip_modifiers(btf, array->type, NULL);
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ continue;
+ }
+ if (!btf_type_is_scalar(member_type))
+ return false;
+ }
+ return true;
+}
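
Hypothetical argument types (a sketch, not from the patch) illustrating what the check above accepts for kfunc pointer-to-mem arguments: scalars and fixed-size arrays of scalars pass, pointer members and flexible arrays (nelems == 0) do not, and struct nesting deeper than four levels is rejected.

/* Accepted: only scalars and a fixed-size scalar array, shallow nesting. */
struct ok_leaf  { int a; char buf[8]; };
struct ok_arg   { struct ok_leaf leaf; long b; };

/* Rejected: contains a pointer member. */
struct bad_ptr  { int a; void *p; };

/* Rejected: the flexible array member has nelems == 0. */
struct bad_flex { int len; int data[]; };

int main(void)
{
	return (int)(sizeof(struct ok_arg) + sizeof(struct bad_ptr) +
		     sizeof(struct bad_flex)) * 0;
}
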
- t = btf_type_by_id(btf, btf_id);
+static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs,
+ bool ptr_to_mem_ok)
+{
+ struct bpf_verifier_log *log = &env->log;
+ bool is_kfunc = btf_is_kernel(btf);
+ const char *func_name, *ref_tname;
+ const struct btf_type *t, *ref_t;
+ const struct btf_param *args;
+ u32 i, nargs, ref_id;
+
+ t = btf_type_by_id(btf, func_id);
if (!t || !btf_type_is_func(t)) {
/* These checks were already done by the verifier while loading
- * struct bpf_func_info
+ * struct bpf_func_info or in add_kfunc_call().
*/
- bpf_log(log, "BTF of func#%d doesn't point to KIND_FUNC\n",
- subprog);
+ bpf_log(log, "BTF of func_id %u doesn't point to KIND_FUNC\n",
+ func_id);
return -EFAULT;
}
- tname = btf_name_by_offset(btf, t->name_off);
+ func_name = btf_name_by_offset(btf, t->name_off);
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
- bpf_log(log, "Invalid BTF of func %s\n", tname);
+ bpf_log(log, "Invalid BTF of func %s\n", func_name);
return -EFAULT;
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
- if (nargs > 5) {
- bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs);
- goto out;
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ bpf_log(log, "Function %s has %d > %d args\n", func_name, nargs,
+ MAX_BPF_FUNC_REG_ARGS);
+ return -EINVAL;
}
+
/* check that BTF function arguments match actual types that the
* verifier sees.
*/
for (i = 0; i < nargs; i++) {
- t = btf_type_by_id(btf, args[i].type);
- while (btf_type_is_modifier(t))
- t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
- if (reg[i + 1].type == SCALAR_VALUE)
+ u32 regno = i + 1;
+ struct bpf_reg_state *reg = &regs[regno];
+
+ t = btf_type_skip_modifiers(btf, args[i].type, NULL);
+ if (btf_type_is_scalar(t)) {
+ if (reg->type == SCALAR_VALUE)
continue;
- bpf_log(log, "R%d is not a scalar\n", i + 1);
- goto out;
+ bpf_log(log, "R%d is not a scalar\n", regno);
+ return -EINVAL;
}
- if (btf_type_is_ptr(t)) {
- if (reg[i + 1].type == SCALAR_VALUE) {
- bpf_log(log, "R%d is not a pointer\n", i + 1);
- goto out;
- }
+
+ if (!btf_type_is_ptr(t)) {
+ bpf_log(log, "Unrecognized arg#%d type %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
+ }
+
+ ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
+ ref_tname = btf_name_by_offset(btf, ref_t->name_off);
+ if (btf_get_prog_ctx_type(log, btf, t,
+ env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
- if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) {
- if (reg[i + 1].type != PTR_TO_CTX) {
+ if (reg->type != PTR_TO_CTX) {
+ bpf_log(log,
+ "arg#%d expected pointer to ctx, but got %s\n",
+ i, btf_type_str(t));
+ return -EINVAL;
+ }
+ if (check_ctx_reg(env, reg, regno))
+ return -EINVAL;
+ } else if (is_kfunc && (reg->type == PTR_TO_BTF_ID || reg2btf_ids[reg->type])) {
+ const struct btf_type *reg_ref_t;
+ const struct btf *reg_btf;
+ const char *reg_ref_tname;
+ u32 reg_ref_id;
+
+ if (!btf_type_is_struct(ref_t)) {
+ bpf_log(log, "kernel function %s args#%d pointer type %s %s is not supported\n",
+ func_name, i, btf_type_str(ref_t),
+ ref_tname);
+ return -EINVAL;
+ }
+
+ if (reg->type == PTR_TO_BTF_ID) {
+ reg_btf = reg->btf;
+ reg_ref_id = reg->btf_id;
+ } else {
+ reg_btf = btf_vmlinux;
+ reg_ref_id = *reg2btf_ids[reg->type];
+ }
+
+ reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
+ &reg_ref_id);
+ reg_ref_tname = btf_name_by_offset(reg_btf,
+ reg_ref_t->name_off);
+ if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
+ reg->off, btf, ref_id)) {
+ bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
+ func_name, i,
+ btf_type_str(ref_t), ref_tname,
+ regno, btf_type_str(reg_ref_t),
+ reg_ref_tname);
+ return -EINVAL;
+ }
+ } else if (ptr_to_mem_ok) {
+ const struct btf_type *resolve_ret;
+ u32 type_size;
+
+ if (is_kfunc) {
+ /* Permit a pointer to mem, but only when the argument
+ * type is a pointer to a scalar, or to a struct composed
+ * (recursively) of scalars.
+ */
+ if (!btf_type_is_scalar(ref_t) &&
+ !__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
bpf_log(log,
- "arg#%d expected pointer to ctx, but got %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
+ "arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
+ i, btf_type_str(ref_t), ref_tname);
+ return -EINVAL;
}
- if (check_ctx_reg(env, &reg[i + 1], i + 1))
- goto out;
- continue;
}
+
+ resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
+ if (IS_ERR(resolve_ret)) {
+ bpf_log(log,
+ "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(ref_t), ref_tname,
+ PTR_ERR(resolve_ret));
+ return -EINVAL;
+ }
+
+ if (check_mem_reg(env, reg, regno, type_size))
+ return -EINVAL;
+ } else {
+ bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
+ is_kfunc ? "kernel " : "", func_name, func_id);
+ return -EINVAL;
}
- bpf_log(log, "Unrecognized arg#%d type %s\n",
- i, btf_kind_str[BTF_INFO_KIND(t->info)]);
- goto out;
}
+
return 0;
-out:
+}
+
+/* Compare BTF of a function with given bpf_reg_state.
+ * Returns:
+ * EFAULT - there is a verifier bug. Abort verification.
+ * EINVAL - there is a type mismatch or BTF is not available.
+ * 0 - BTF matches with what bpf_reg_state expects.
+ * Only PTR_TO_CTX and SCALAR_VALUE states are recognized.
+ */
+int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
+ struct bpf_reg_state *regs)
+{
+ struct bpf_prog *prog = env->prog;
+ struct btf *btf = prog->aux->btf;
+ bool is_global;
+ u32 btf_id;
+ int err;
+
+ if (!prog->aux->func_info)
+ return -EINVAL;
+
+ btf_id = prog->aux->func_info[subprog].type_id;
+ if (!btf_id)
+ return -EFAULT;
+
+ if (prog->aux->func_info_aux[subprog].unreliable)
+ return -EINVAL;
+
+ is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+
/* Compiler optimizations can remove arguments from static functions
+ * or a mismatched type can be passed into a global function.
+ * In such cases, mark the function as unreliable from the BTF point of view.
*/
- prog->aux->func_info_aux[subprog].unreliable = true;
- return -EINVAL;
+ if (err)
+ prog->aux->func_info_aux[subprog].unreliable = true;
+ return err;
+}
+
+int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
+ const struct btf *btf, u32 func_id,
+ struct bpf_reg_state *regs)
+{
+ return btf_check_func_arg_match(env, btf, func_id, regs, true);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -4381,14 +5814,14 @@ out:
* (either PTR_TO_CTX or SCALAR_VALUE).
*/
int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
- struct bpf_reg_state *reg)
+ struct bpf_reg_state *regs)
{
struct bpf_verifier_log *log = &env->log;
struct bpf_prog *prog = env->prog;
enum bpf_prog_type prog_type = prog->type;
struct btf *btf = prog->aux->btf;
const struct btf_param *args;
- const struct btf_type *t;
+ const struct btf_type *t, *ref_t;
u32 i, nargs, btf_id;
const char *tname;
@@ -4424,7 +5857,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return -EFAULT;
}
if (prog_type == BPF_PROG_TYPE_EXT)
- prog_type = prog->aux->linked_prog->type;
+ prog_type = prog->aux->dst_prog->type;
t = btf_type_by_id(btf, t->type);
if (!t || !btf_type_is_func_proto(t)) {
@@ -4433,9 +5866,9 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
}
args = (const struct btf_param *)(t + 1);
nargs = btf_type_vlen(t);
- if (nargs > 5) {
- bpf_log(log, "Global function %s() with %d > 5 args. Buggy compiler.\n",
- tname, nargs);
+ if (nargs > MAX_BPF_FUNC_REG_ARGS) {
+ bpf_log(log, "Global function %s() with %d > %d args. Buggy compiler.\n",
+ tname, nargs, MAX_BPF_FUNC_REG_ARGS);
return -EINVAL;
}
/* check that function returns int */
@@ -4452,16 +5885,35 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
* Only PTR_TO_CTX and SCALAR are supported atm.
*/
for (i = 0; i < nargs; i++) {
+ struct bpf_reg_state *reg = &regs[i + 1];
+
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
if (btf_type_is_int(t) || btf_type_is_enum(t)) {
- reg[i + 1].type = SCALAR_VALUE;
+ reg->type = SCALAR_VALUE;
continue;
}
- if (btf_type_is_ptr(t) &&
- btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
- reg[i + 1].type = PTR_TO_CTX;
+ if (btf_type_is_ptr(t)) {
+ if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) {
+ reg->type = PTR_TO_CTX;
+ continue;
+ }
+
+ t = btf_type_skip_modifiers(btf, t->type, NULL);
+
+ ref_t = btf_resolve_size(btf, t, &reg->mem_size);
+ if (IS_ERR(ref_t)) {
+ bpf_log(log,
+ "arg#%d reference type('%s %s') size cannot be determined: %ld\n",
+ i, btf_type_str(t), btf_name_by_offset(btf, t->name_off),
+ PTR_ERR(ref_t));
+ return -EINVAL;
+ }
+
+ reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
+ reg->id = ++env->id_gen;
+
continue;
}
bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n",
@@ -4471,12 +5923,93 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return 0;
}
+static void btf_type_show(const struct btf *btf, u32 type_id, void *obj,
+ struct btf_show *show)
+{
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ show->btf = btf;
+ memset(&show->state, 0, sizeof(show->state));
+ memset(&show->obj, 0, sizeof(show->obj));
+
+ btf_type_ops(t)->show(btf, t, type_id, obj, 0, show);
+}
+
+static void btf_seq_show(struct btf_show *show, const char *fmt,
+ va_list args)
+{
+ seq_vprintf((struct seq_file *)show->target, fmt, args);
+}
+
+int btf_type_seq_show_flags(const struct btf *btf, u32 type_id,
+ void *obj, struct seq_file *m, u64 flags)
+{
+ struct btf_show sseq;
+
+ sseq.target = m;
+ sseq.showfn = btf_seq_show;
+ sseq.flags = flags;
+
+ btf_type_show(btf, type_id, obj, &sseq);
+
+ return sseq.state.status;
+}
+
void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
struct seq_file *m)
{
- const struct btf_type *t = btf_type_by_id(btf, type_id);
+ (void) btf_type_seq_show_flags(btf, type_id, obj, m,
+ BTF_SHOW_NONAME | BTF_SHOW_COMPACT |
+ BTF_SHOW_ZERO | BTF_SHOW_UNSAFE);
+}
+
+struct btf_show_snprintf {
+ struct btf_show show;
+ int len_left; /* space left in string */
+ int len; /* length we would have written */
+};
+
+static void btf_snprintf_show(struct btf_show *show, const char *fmt,
+ va_list args)
+{
+ struct btf_show_snprintf *ssnprintf = (struct btf_show_snprintf *)show;
+ int len;
+
+ len = vsnprintf(show->target, ssnprintf->len_left, fmt, args);
+
+ if (len < 0) {
+ ssnprintf->len_left = 0;
+ ssnprintf->len = len;
+ } else if (len > ssnprintf->len_left) {
+ /* no space, drive on to get length we would have written */
+ ssnprintf->len_left = 0;
+ ssnprintf->len += len;
+ } else {
+ ssnprintf->len_left -= len;
+ ssnprintf->len += len;
+ show->target += len;
+ }
+}
+
+int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj,
+ char *buf, int len, u64 flags)
+{
+ struct btf_show_snprintf ssnprintf;
+
+ ssnprintf.show.target = buf;
+ ssnprintf.show.flags = flags;
+ ssnprintf.show.showfn = btf_snprintf_show;
+ ssnprintf.len_left = len;
+ ssnprintf.len = 0;
+
+ btf_type_show(btf, type_id, obj, (struct btf_show *)&ssnprintf);
- btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+ /* If we encountered an error, return it. */
+ if (ssnprintf.show.state.status)
+ return ssnprintf.show.state.status;
+
+ /* Otherwise return length we would have written */
+ return ssnprintf.len;
}
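
btf_type_snprintf_show() deliberately mirrors the snprintf() contract: unless an error status was recorded, it returns the length it would have written even when the buffer was too small. A hedged userspace sketch of the resulting probe-then-allocate pattern, using plain snprintf() since the kernel helper is not callable here:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* first call probes the needed length, second call fills the buffer */
	int need = snprintf(NULL, 0, "struct %s { int %s; }", "foo", "bar");
	char *buf = malloc(need + 1);

	if (!buf)
		return 1;
	snprintf(buf, need + 1, "struct %s { int %s; }", "foo", "bar");
	puts(buf);
	free(buf);
	return 0;
}
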
#ifdef CONFIG_PROC_FS
@@ -4506,12 +6039,12 @@ static int __btf_new_fd(struct btf *btf)
return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC);
}
-int btf_new_fd(const union bpf_attr *attr)
+int btf_new_fd(const union bpf_attr *attr, bpfptr_t uattr)
{
struct btf *btf;
int ret;
- btf = btf_parse(u64_to_user_ptr(attr->btf),
+ btf = btf_parse(make_bpfptr(attr->btf, uattr.is_kernel),
attr->btf_size, attr->btf_log_level,
u64_to_user_ptr(attr->btf_log_buf),
attr->btf_log_size);
@@ -4564,15 +6097,18 @@ int btf_get_info_by_fd(const struct btf *btf,
union bpf_attr __user *uattr)
{
struct bpf_btf_info __user *uinfo;
- struct bpf_btf_info info = {};
+ struct bpf_btf_info info;
u32 info_copy, btf_copy;
void __user *ubtf;
- u32 uinfo_len;
+ char __user *uname;
+ u32 uinfo_len, uname_len, name_len;
+ int ret = 0;
uinfo = u64_to_user_ptr(attr->info.info);
uinfo_len = attr->info.info_len;
info_copy = min_t(u32, uinfo_len, sizeof(info));
+ memset(&info, 0, sizeof(info));
if (copy_from_user(&info, uinfo, info_copy))
return -EFAULT;
@@ -4583,11 +6119,37 @@ int btf_get_info_by_fd(const struct btf *btf,
return -EFAULT;
info.btf_size = btf->data_size;
+ info.kernel_btf = btf->kernel_btf;
+
+ uname = u64_to_user_ptr(info.name);
+ uname_len = info.name_len;
+ if (!uname ^ !uname_len)
+ return -EINVAL;
+
+ name_len = strlen(btf->name);
+ info.name_len = name_len;
+
+ if (uname) {
+ if (uname_len >= name_len + 1) {
+ if (copy_to_user(uname, btf->name, name_len + 1))
+ return -EFAULT;
+ } else {
+ char zero = '\0';
+
+ if (copy_to_user(uname, btf->name, uname_len - 1))
+ return -EFAULT;
+ if (put_user(zero, uname + uname_len - 1))
+ return -EFAULT;
+ /* let user-space know the buffer was too short */
+ ret = -ENOSPC;
+ }
+ }
+
if (copy_to_user(uinfo, &info, info_copy) ||
put_user(info_copy, &uattr->info.info_len))
return -EFAULT;
- return 0;
+ return ret;
}
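
A sketch of how user space can drive the name handling above: pass name == NULL first to learn name_len, then retry with a buffer of name_len + 1 so the kernel copies the full name instead of truncating and returning -ENOSPC. The btf_fd is assumed to come from elsewhere (e.g. BPF_BTF_GET_FD_BY_ID); the fields follow the UAPI struct bpf_btf_info.

#include <linux/bpf.h>
#include <stdlib.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Returns a malloc'ed BTF object name ("vmlinux" or a module name), or NULL. */
static char *btf_obj_name(int btf_fd)
{
	struct bpf_btf_info info;
	union bpf_attr attr;
	char *name;
	__u32 sz;

	memset(&info, 0, sizeof(info));	/* name == NULL, name_len == 0 */
	memset(&attr, 0, sizeof(attr));
	attr.info.bpf_fd = btf_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	/* first call: kernel fills in info.name_len */
	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return NULL;

	sz = info.name_len + 1;		/* room for the terminating NUL */
	name = calloc(1, sz);
	if (!name)
		return NULL;
	info.name = (__u64)(unsigned long)name;
	info.name_len = sz;		/* tell the kernel our buffer size */

	/* second call: full name is copied, no -ENOSPC truncation */
	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr))) {
		free(name);
		return NULL;
	}
	return name;
}
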
int btf_get_fd_by_id(u32 id)
@@ -4611,7 +6173,674 @@ int btf_get_fd_by_id(u32 id)
return fd;
}
-u32 btf_id(const struct btf *btf)
+u32 btf_obj_id(const struct btf *btf)
{
return btf->id;
}
+
+bool btf_is_kernel(const struct btf *btf)
+{
+ return btf->kernel_btf;
+}
+
+bool btf_is_module(const struct btf *btf)
+{
+ return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0;
+}
+
+static int btf_id_cmp_func(const void *a, const void *b)
+{
+ const int *pa = a, *pb = b;
+
+ return *pa - *pb;
+}
+
+bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
+{
+ return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
+}
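
btf_id_set_contains() relies on the ids array being kept sorted, which is what makes the bsearch() above valid. A standalone model of the same lookup; the comparator here uses the overflow-safe form rather than the kernel's *pa - *pb:

#include <assert.h>
#include <stdlib.h>

static int id_cmp(const void *a, const void *b)
{
	const unsigned int *pa = a, *pb = b;

	return (*pa > *pb) - (*pa < *pb);
}

int main(void)
{
	/* the array must stay sorted for bsearch() to be valid */
	unsigned int ids[] = { 7, 13, 42, 108, 4096 };
	unsigned int key = 42;

	assert(bsearch(&key, ids, sizeof(ids) / sizeof(ids[0]),
		       sizeof(ids[0]), id_cmp) != NULL);
	key = 99;
	assert(bsearch(&key, ids, sizeof(ids) / sizeof(ids[0]),
		       sizeof(ids[0]), id_cmp) == NULL);
	return 0;
}
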
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+struct btf_module {
+ struct list_head list;
+ struct module *module;
+ struct btf *btf;
+ struct bin_attribute *sysfs_attr;
+};
+
+static LIST_HEAD(btf_modules);
+static DEFINE_MUTEX(btf_module_mutex);
+
+static ssize_t
+btf_module_read(struct file *file, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t off, size_t len)
+{
+ const struct btf *btf = bin_attr->private;
+
+ memcpy(buf, btf->data + off, len);
+ return len;
+}
+
+static void purge_cand_cache(struct btf *btf);
+
+static int btf_module_notify(struct notifier_block *nb, unsigned long op,
+ void *module)
+{
+ struct btf_module *btf_mod, *tmp;
+ struct module *mod = module;
+ struct btf *btf;
+ int err = 0;
+
+ if (mod->btf_data_size == 0 ||
+ (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING))
+ goto out;
+
+ switch (op) {
+ case MODULE_STATE_COMING:
+ btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL);
+ if (!btf_mod) {
+ err = -ENOMEM;
+ goto out;
+ }
+ btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size);
+ if (IS_ERR(btf)) {
+ pr_warn("failed to validate module [%s] BTF: %ld\n",
+ mod->name, PTR_ERR(btf));
+ kfree(btf_mod);
+ err = PTR_ERR(btf);
+ goto out;
+ }
+ err = btf_alloc_id(btf);
+ if (err) {
+ btf_free(btf);
+ kfree(btf_mod);
+ goto out;
+ }
+
+ purge_cand_cache(NULL);
+ mutex_lock(&btf_module_mutex);
+ btf_mod->module = module;
+ btf_mod->btf = btf;
+ list_add(&btf_mod->list, &btf_modules);
+ mutex_unlock(&btf_module_mutex);
+
+ if (IS_ENABLED(CONFIG_SYSFS)) {
+ struct bin_attribute *attr;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr)
+ goto out;
+
+ sysfs_bin_attr_init(attr);
+ attr->attr.name = btf->name;
+ attr->attr.mode = 0444;
+ attr->size = btf->data_size;
+ attr->private = btf;
+ attr->read = btf_module_read;
+
+ err = sysfs_create_bin_file(btf_kobj, attr);
+ if (err) {
+ pr_warn("failed to register module [%s] BTF in sysfs: %d\n",
+ mod->name, err);
+ kfree(attr);
+ err = 0;
+ goto out;
+ }
+
+ btf_mod->sysfs_attr = attr;
+ }
+
+ break;
+ case MODULE_STATE_GOING:
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->module != module)
+ continue;
+
+ list_del(&btf_mod->list);
+ if (btf_mod->sysfs_attr)
+ sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
+ purge_cand_cache(btf_mod->btf);
+ btf_put(btf_mod->btf);
+ kfree(btf_mod->sysfs_attr);
+ kfree(btf_mod);
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+ break;
+ }
+out:
+ return notifier_from_errno(err);
+}
+
+static struct notifier_block btf_module_nb = {
+ .notifier_call = btf_module_notify,
+};
+
+static int __init btf_module_init(void)
+{
+ register_module_notifier(&btf_module_nb);
+ return 0;
+}
+
+fs_initcall(btf_module_init);
+#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */
+
+struct module *btf_try_get_module(const struct btf *btf)
+{
+ struct module *res = NULL;
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+ struct btf_module *btf_mod, *tmp;
+
+ mutex_lock(&btf_module_mutex);
+ list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) {
+ if (btf_mod->btf != btf)
+ continue;
+
+ if (try_module_get(btf_mod->module))
+ res = btf_mod->module;
+
+ break;
+ }
+ mutex_unlock(&btf_module_mutex);
+#endif
+
+ return res;
+}
+
+BPF_CALL_4(bpf_btf_find_by_name_kind, char *, name, int, name_sz, u32, kind, int, flags)
+{
+ struct btf *btf;
+ long ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (name_sz <= 1 || name[name_sz - 1])
+ return -EINVAL;
+
+ btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ ret = btf_find_by_name_kind(btf, name, kind);
+ /* ret is never zero, since btf_find_by_name_kind returns
+ * a positive btf_id or a negative error.
+ */
+ if (ret < 0) {
+ struct btf *mod_btf;
+ int id;
+
+ /* If name is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow, hence unlock/lock
+ * the IDR to avoid holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ ret = btf_find_by_name_kind(mod_btf, name, kind);
+ if (ret > 0) {
+ int btf_obj_fd;
+
+ btf_obj_fd = __btf_new_fd(mod_btf);
+ if (btf_obj_fd < 0) {
+ btf_put(mod_btf);
+ return btf_obj_fd;
+ }
+ return ret | (((u64)btf_obj_fd) << 32);
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(mod_btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ }
+ return ret;
+}
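
The positive return value above is packed: the type id sits in the low 32 bits and the module BTF object FD in the high 32 bits (zero when the type was found in vmlinux BTF); a negative value is an error. A small caller-side sketch of unpacking that encoding, with a fabricated value for illustration:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t ret = ((int64_t)5 << 32) | 1234;	/* fd 5, btf id 1234 */

	if (ret < 0) {
		fprintf(stderr, "lookup failed: %lld\n", (long long)ret);
		return 1;
	}
	printf("btf id %u in obj fd %u\n",
	       (uint32_t)ret, (uint32_t)(ret >> 32));
	return 0;
}
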
+
+const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
+ .func = bpf_btf_find_by_name_kind,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg2_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
+#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
+BTF_TRACING_TYPE_xxx
+#undef BTF_TRACING_TYPE
+
+/* BTF ID set registration API for modules */
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+
+void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
+ struct kfunc_btf_id_set *s)
+{
+ mutex_lock(&l->mutex);
+ list_add(&s->list, &l->list);
+ mutex_unlock(&l->mutex);
+}
+EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
+
+void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
+ struct kfunc_btf_id_set *s)
+{
+ mutex_lock(&l->mutex);
+ list_del_init(&s->list);
+ mutex_unlock(&l->mutex);
+}
+EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
+
+bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
+ struct module *owner)
+{
+ struct kfunc_btf_id_set *s;
+
+ mutex_lock(&klist->mutex);
+ list_for_each_entry(s, &klist->list, list) {
+ if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
+ mutex_unlock(&klist->mutex);
+ return true;
+ }
+ }
+ mutex_unlock(&klist->mutex);
+ return false;
+}
+
+#define DEFINE_KFUNC_BTF_ID_LIST(name) \
+ struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
+ __MUTEX_INITIALIZER(name.mutex) }; \
+ EXPORT_SYMBOL_GPL(name)
+
+DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
+DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
+
+#endif
+
+int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
+ const struct btf *targ_btf, __u32 targ_id)
+{
+ return -EOPNOTSUPP;
+}
+
+static bool bpf_core_is_flavor_sep(const char *s)
+{
+ /* check X___Y name pattern, where X and Y are not underscores */
+ return s[0] != '_' && /* X */
+ s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
+ s[4] != '_'; /* Y */
+}
+
+size_t bpf_core_essential_name_len(const char *name)
+{
+ size_t n = strlen(name);
+ int i;
+
+ for (i = n - 5; i >= 0; i--) {
+ if (bpf_core_is_flavor_sep(name + i))
+ return i + 1;
+ }
+ return n;
+}
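
A worked example of the ___flavor naming convention handled above, using a local copy of the same logic so it runs standalone: "task_struct___v2" and "task_struct" share the essential name "task_struct".

#include <assert.h>
#include <string.h>

/* local copies of the flavor logic, for illustration only */
static int is_flavor_sep(const char *s)
{
	return s[0] != '_' && s[1] == '_' && s[2] == '_' && s[3] == '_' &&
	       s[4] != '_';
}

static size_t essential_name_len(const char *name)
{
	size_t n = strlen(name);
	int i;

	for (i = (int)n - 5; i >= 0; i--)
		if (is_flavor_sep(name + i))
			return i + 1;
	return n;
}

int main(void)
{
	/* "task_struct___v2" is a flavor of "task_struct" */
	assert(essential_name_len("task_struct___v2") == strlen("task_struct"));
	/* no flavor separator -> full length */
	assert(essential_name_len("task_struct") == strlen("task_struct"));
	return 0;
}
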
+
+struct bpf_cand_cache {
+ const char *name;
+ u32 name_len;
+ u16 kind;
+ u16 cnt;
+ struct {
+ const struct btf *btf;
+ u32 id;
+ } cands[];
+};
+
+static void bpf_free_cands(struct bpf_cand_cache *cands)
+{
+ if (!cands->cnt)
+ /* empty candidate array was allocated on stack */
+ return;
+ kfree(cands);
+}
+
+static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
+{
+ kfree(cands->name);
+ kfree(cands);
+}
+
+#define VMLINUX_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
+
+#define MODULE_CAND_CACHE_SIZE 31
+static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
+
+static DEFINE_MUTEX(cand_cache_mutex);
+
+static void __print_cand_cache(struct bpf_verifier_log *log,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ bpf_log(log, "[%d]%s(", i, cc->name);
+ for (j = 0; j < cc->cnt; j++) {
+ bpf_log(log, "%d", cc->cands[j].id);
+ if (j < cc->cnt - 1)
+ bpf_log(log, " ");
+ }
+ bpf_log(log, "), ");
+ }
+}
+
+static void print_cand_cache(struct bpf_verifier_log *log)
+{
+ mutex_lock(&cand_cache_mutex);
+ bpf_log(log, "vmlinux_cand_cache:");
+ __print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ bpf_log(log, "\nmodule_cand_cache:");
+ __print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ bpf_log(log, "\n");
+ mutex_unlock(&cand_cache_mutex);
+}
+
+static u32 hash_cands(struct bpf_cand_cache *cands)
+{
+ return jhash(cands->name, cands->name_len, 0);
+}
+
+static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
+
+ if (cc && cc->name_len == cands->name_len &&
+ !strncmp(cc->name, cands->name, cands->name_len))
+ return cc;
+ return NULL;
+}
+
+static size_t sizeof_cands(int cnt)
+{
+ return offsetof(struct bpf_cand_cache, cands[cnt]);
+}
+
+static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
+ struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
+
+ if (*cc) {
+ bpf_free_cands_from_cache(*cc);
+ *cc = NULL;
+ }
+ new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* strdup the name, since it will stay in the cache.
+ * cands->name points to strings in the prog's BTF, and the prog can be unloaded.
+ */
+ new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
+ bpf_free_cands(cands);
+ if (!new_cands->name) {
+ kfree(new_cands);
+ return ERR_PTR(-ENOMEM);
+ }
+ *cc = new_cands;
+ return new_cands;
+}
+
+#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
+static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
+ int cache_size)
+{
+ struct bpf_cand_cache *cc;
+ int i, j;
+
+ for (i = 0; i < cache_size; i++) {
+ cc = cache[i];
+ if (!cc)
+ continue;
+ if (!btf) {
+ /* when a new module is loaded, purge all of module_cand_cache,
+ * since the new module might have candidates whose names
+ * match cached cands.
+ */
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ continue;
+ }
+ /* when a module is unloaded, purge cache entries
+ * that match the module's btf
+ */
+ for (j = 0; j < cc->cnt; j++)
+ if (cc->cands[j].btf == btf) {
+ bpf_free_cands_from_cache(cc);
+ cache[i] = NULL;
+ break;
+ }
+ }
+
+}
+
+static void purge_cand_cache(struct btf *btf)
+{
+ mutex_lock(&cand_cache_mutex);
+ __purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ mutex_unlock(&cand_cache_mutex);
+}
+#endif
+
+static struct bpf_cand_cache *
+bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
+ int targ_start_id)
+{
+ struct bpf_cand_cache *new_cands;
+ const struct btf_type *t;
+ const char *targ_name;
+ size_t targ_essent_len;
+ int n, i;
+
+ n = btf_nr_types(targ_btf);
+ for (i = targ_start_id; i < n; i++) {
+ t = btf_type_by_id(targ_btf, i);
+ if (btf_kind(t) != cands->kind)
+ continue;
+
+ targ_name = btf_name_by_offset(targ_btf, t->name_off);
+ if (!targ_name)
+ continue;
+
+ /* the resched point is before strncmp to make sure that a search
+ * for a non-existing name has a chance to schedule().
+ */
+ cond_resched();
+
+ if (strncmp(cands->name, targ_name, cands->name_len) != 0)
+ continue;
+
+ targ_essent_len = bpf_core_essential_name_len(targ_name);
+ if (targ_essent_len != cands->name_len)
+ continue;
+
+ /* most of the time there is only one candidate for a given kind+name pair */
+ new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
+ if (!new_cands) {
+ bpf_free_cands(cands);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memcpy(new_cands, cands, sizeof_cands(cands->cnt));
+ bpf_free_cands(cands);
+ cands = new_cands;
+ cands->cands[cands->cnt].btf = targ_btf;
+ cands->cands[cands->cnt].id = i;
+ cands->cnt++;
+ }
+ return cands;
+}
+
+static struct bpf_cand_cache *
+bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
+{
+ struct bpf_cand_cache *cands, *cc, local_cand = {};
+ const struct btf *local_btf = ctx->btf;
+ const struct btf_type *local_type;
+ const struct btf *main_btf;
+ size_t local_essent_len;
+ struct btf *mod_btf;
+ const char *name;
+ int id;
+
+ main_btf = bpf_get_btf_vmlinux();
+ if (IS_ERR(main_btf))
+ return ERR_CAST(main_btf);
+
+ local_type = btf_type_by_id(local_btf, local_type_id);
+ if (!local_type)
+ return ERR_PTR(-EINVAL);
+
+ name = btf_name_by_offset(local_btf, local_type->name_off);
+ if (str_is_empty(name))
+ return ERR_PTR(-EINVAL);
+ local_essent_len = bpf_core_essential_name_len(name);
+
+ cands = &local_cand;
+ cands->name = name;
+ cands->kind = btf_kind(local_type);
+ cands->name_len = local_essent_len;
+
+ cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ /* cands is a pointer to stack here */
+ if (cc) {
+ if (cc->cnt)
+ return cc;
+ goto check_modules;
+ }
+
+ /* Attempt to find target candidates in vmlinux BTF first */
+ cands = bpf_core_add_cands(cands, main_btf, 1);
+ if (IS_ERR(cands))
+ return ERR_CAST(cands);
+
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
+
+ /* populate cache even when cands->cnt == 0 */
+ cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
+ if (IS_ERR(cc))
+ return ERR_CAST(cc);
+
+ /* if vmlinux BTF has any candidate, don't go for module BTFs */
+ if (cc->cnt)
+ return cc;
+
+check_modules:
+ /* cands is a pointer to stack here and cands->cnt == 0 */
+ cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+ if (cc)
+ /* if cache has it return it even if cc->cnt == 0 */
+ return cc;
+
+ /* If candidate is not found in vmlinux's BTF then search in module's BTFs */
+ spin_lock_bh(&btf_idr_lock);
+ idr_for_each_entry(&btf_idr, mod_btf, id) {
+ if (!btf_is_module(mod_btf))
+ continue;
+ /* linear search could be slow, hence unlock/lock
+ * the IDR to avoid holding it for too long
+ */
+ btf_get(mod_btf);
+ spin_unlock_bh(&btf_idr_lock);
+ cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
+ if (IS_ERR(cands)) {
+ btf_put(mod_btf);
+ return ERR_CAST(cands);
+ }
+ spin_lock_bh(&btf_idr_lock);
+ btf_put(mod_btf);
+ }
+ spin_unlock_bh(&btf_idr_lock);
+ /* cands is a pointer to kmalloced memory here if cands->cnt > 0
+ * or a pointer to the stack if cands->cnt == 0.
+ * Copy it into the cache even when cands->cnt == 0 and
+ * return the result.
+ */
+ return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
+}
+
+int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
+ int relo_idx, void *insn)
+{
+ bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
+ struct bpf_core_cand_list cands = {};
+ struct bpf_core_spec *specs;
+ int err;
+
+ /* ~4k of temp memory is necessary to convert an LLVM spec like "0:1:0:5"
+ * into arrays of btf_ids of struct fields and array indices.
+ */
+ specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
+ if (!specs)
+ return -ENOMEM;
+
+ if (need_cands) {
+ struct bpf_cand_cache *cc;
+ int i;
+
+ mutex_lock(&cand_cache_mutex);
+ cc = bpf_core_find_cands(ctx, relo->type_id);
+ if (IS_ERR(cc)) {
+ bpf_log(ctx->log, "target candidate search failed for %d\n",
+ relo->type_id);
+ err = PTR_ERR(cc);
+ goto out;
+ }
+ if (cc->cnt) {
+ cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
+ if (!cands.cands) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ for (i = 0; i < cc->cnt; i++) {
+ bpf_log(ctx->log,
+ "CO-RE relocating %s %s: found target candidate [%d]\n",
+ btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
+ cands.cands[i].btf = cc->cands[i].btf;
+ cands.cands[i].id = cc->cands[i].id;
+ }
+ cands.len = cc->cnt;
+ /* cand_cache_mutex needs to span the cache lookup and
+ * the copy of the btf pointer into bpf_core_cand_list,
+ * since a module can be unloaded while bpf_core_apply_relo_insn
+ * is working with the module's btf.
+ */
+ }
+
+ err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
+ relo, relo_idx, ctx->btf, &cands, specs);
+out:
+ kfree(specs);
+ if (need_cands) {
+ kfree(cands.cands);
+ mutex_unlock(&cand_cache_mutex);
+ if (ctx->log->level & BPF_LOG_LEVEL2)
+ print_cand_cache(ctx->log);
+ }
+ return err;
+}
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 9a500fadbef5..514b4681a90a 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -19,7 +19,7 @@
#include "../cgroup/cgroup-internal.h"
-DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);
void cgroup_bpf_offline(struct cgroup *cgrp)
@@ -28,6 +28,78 @@ void cgroup_bpf_offline(struct cgroup *cgrp)
percpu_ref_kill(&cgrp->bpf.refcnt);
}
+static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[])
+{
+ enum bpf_cgroup_storage_type stype;
+
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_free(storages[stype]);
+}
+
+static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[],
+ struct bpf_cgroup_storage *new_storages[],
+ enum bpf_attach_type type,
+ struct bpf_prog *prog,
+ struct cgroup *cgrp)
+{
+ enum bpf_cgroup_storage_type stype;
+ struct bpf_cgroup_storage_key key;
+ struct bpf_map *map;
+
+ key.cgroup_inode_id = cgroup_id(cgrp);
+ key.attach_type = type;
+
+ for_each_cgroup_storage_type(stype) {
+ map = prog->aux->cgroup_storage[stype];
+ if (!map)
+ continue;
+
+ storages[stype] = cgroup_storage_lookup((void *)map, &key, false);
+ if (storages[stype])
+ continue;
+
+ storages[stype] = bpf_cgroup_storage_alloc(prog, stype);
+ if (IS_ERR(storages[stype])) {
+ bpf_cgroup_storages_free(new_storages);
+ return -ENOMEM;
+ }
+
+ new_storages[stype] = storages[stype];
+ }
+
+ return 0;
+}
+
+static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[],
+ struct bpf_cgroup_storage *src[])
+{
+ enum bpf_cgroup_storage_type stype;
+
+ for_each_cgroup_storage_type(stype)
+ dst[stype] = src[stype];
+}
+
+static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[],
+ struct cgroup *cgrp,
+ enum bpf_attach_type attach_type)
+{
+ enum bpf_cgroup_storage_type stype;
+
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_link(storages[stype], cgrp, attach_type);
+}
+
+/* Called when bpf_cgroup_link is auto-detached from dying cgroup.
+ * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It
+ * doesn't free link memory, which will eventually be done by bpf_link's
+ * release() callback, when its last FD is closed.
+ */
+static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link)
+{
+ cgroup_put(link->cgroup);
+ link->cgroup = NULL;
+}
+
/**
* cgroup_bpf_release() - put references of all bpf programs and
* release all cgroup bpf data
@@ -37,32 +109,38 @@ static void cgroup_bpf_release(struct work_struct *work)
{
struct cgroup *p, *cgrp = container_of(work, struct cgroup,
bpf.release_work);
- enum bpf_cgroup_storage_type stype;
struct bpf_prog_array *old_array;
- unsigned int type;
+ struct list_head *storages = &cgrp->bpf.storages;
+ struct bpf_cgroup_storage *storage, *stmp;
+
+ unsigned int atype;
mutex_lock(&cgroup_mutex);
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
- struct list_head *progs = &cgrp->bpf.progs[type];
- struct bpf_prog_list *pl, *tmp;
+ for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
+ struct list_head *progs = &cgrp->bpf.progs[atype];
+ struct bpf_prog_list *pl, *pltmp;
- list_for_each_entry_safe(pl, tmp, progs, node) {
+ list_for_each_entry_safe(pl, pltmp, progs, node) {
list_del(&pl->node);
- bpf_prog_put(pl->prog);
- for_each_cgroup_storage_type(stype) {
- bpf_cgroup_storage_unlink(pl->storage[stype]);
- bpf_cgroup_storage_free(pl->storage[stype]);
- }
+ if (pl->prog)
+ bpf_prog_put(pl->prog);
+ if (pl->link)
+ bpf_cgroup_link_auto_detach(pl->link);
kfree(pl);
- static_branch_dec(&cgroup_bpf_enabled_key);
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
old_array = rcu_dereference_protected(
- cgrp->bpf.effective[type],
+ cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
bpf_prog_array_free(old_array);
}
+ list_for_each_entry_safe(storage, stmp, storages, list_cg) {
+ bpf_cgroup_storage_unlink(storage);
+ bpf_cgroup_storage_free(storage);
+ }
+
mutex_unlock(&cgroup_mutex);
for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
@@ -85,6 +163,18 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
queue_work(system_wq, &cgrp->bpf.release_work);
}
+/* Get the underlying bpf_prog of a bpf_prog_list entry, regardless of whether
+ * it is attached through a link or a direct prog.
+ */
+static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
+{
+ if (pl->prog)
+ return pl->prog;
+ if (pl->link)
+ return pl->link->link.prog;
+ return NULL;
+}
+
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
@@ -94,7 +184,7 @@ static u32 prog_list_length(struct list_head *head)
u32 cnt = 0;
list_for_each_entry(pl, head, node) {
- if (!pl->prog)
+ if (!prog_list_prog(pl))
continue;
cnt++;
}
@@ -106,7 +196,7 @@ static u32 prog_list_length(struct list_head *head)
* if parent has overridable or multi-prog, allow attaching
*/
static bool hierarchy_allows_attach(struct cgroup *cgrp,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *p;
@@ -114,12 +204,12 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
if (!p)
return true;
do {
- u32 flags = p->bpf.flags[type];
+ u32 flags = p->bpf.flags[atype];
u32 cnt;
if (flags & BPF_F_ALLOW_MULTI)
return true;
- cnt = prog_list_length(&p->bpf.progs[type]);
+ cnt = prog_list_length(&p->bpf.progs[atype]);
WARN_ON_ONCE(cnt > 1);
if (cnt == 1)
return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -135,10 +225,10 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
* to programs in this cgroup
*/
static int compute_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_prog_array **array)
{
- enum bpf_cgroup_storage_type stype;
+ struct bpf_prog_array_item *item;
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
struct cgroup *p = cgrp;
@@ -146,8 +236,8 @@ static int compute_effective_progs(struct cgroup *cgrp,
/* count number of effective programs by walking parents */
do {
- if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
- cnt += prog_list_length(&p->bpf.progs[type]);
+ if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[atype]);
p = cgroup_parent(p);
} while (p);
@@ -159,17 +249,17 @@ static int compute_effective_progs(struct cgroup *cgrp,
cnt = 0;
p = cgrp;
do {
- if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- list_for_each_entry(pl, &p->bpf.progs[type], node) {
- if (!pl->prog)
+ list_for_each_entry(pl, &p->bpf.progs[atype], node) {
+ if (!prog_list_prog(pl))
continue;
- progs->items[cnt].prog = pl->prog;
- for_each_cgroup_storage_type(stype)
- progs->items[cnt].cgroup_storage[stype] =
- pl->storage[stype];
+ item = &progs->items[cnt];
+ item->prog = prog_list_prog(pl);
+ bpf_cgroup_storages_assign(item->cgroup_storage,
+ pl->storage);
cnt++;
}
} while ((p = cgroup_parent(p)));
@@ -179,10 +269,10 @@ static int compute_effective_progs(struct cgroup *cgrp,
}
static void activate_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type,
+ enum cgroup_bpf_attach_type atype,
struct bpf_prog_array *old_array)
{
- old_array = rcu_replace_pointer(cgrp->bpf.effective[type], old_array,
+ old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array,
lockdep_is_held(&cgroup_mutex));
/* free prog array after grace period, since __cgroup_bpf_run_*()
* might be still walking the array
@@ -215,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
for (i = 0; i < NR; i++)
INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+ INIT_LIST_HEAD(&cgrp->bpf.storages);
+
for (i = 0; i < NR; i++)
if (compute_effective_progs(cgrp, i, &arrays[i]))
goto cleanup;
@@ -227,13 +319,16 @@ cleanup:
for (i = 0; i < NR; i++)
bpf_prog_array_free(arrays[i]);
+ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p))
+ cgroup_bpf_put(p);
+
percpu_ref_exit(&cgrp->bpf.refcnt);
return -ENOMEM;
}
static int update_effective_progs(struct cgroup *cgrp,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup_subsys_state *css;
int err;
@@ -245,7 +340,7 @@ static int update_effective_progs(struct cgroup *cgrp,
if (percpu_ref_is_zero(&desc->bpf.refcnt))
continue;
- err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ err = compute_effective_progs(desc, atype, &desc->bpf.inactive);
if (err)
goto cleanup;
}
@@ -262,7 +357,7 @@ static int update_effective_progs(struct cgroup *cgrp,
continue;
}
- activate_effective_progs(desc, type, desc->bpf.inactive);
+ activate_effective_progs(desc, atype, desc->bpf.inactive);
desc->bpf.inactive = NULL;
}
@@ -284,39 +379,92 @@ cleanup:
#define BPF_CGROUP_MAX_PROGS 64
+static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
+ struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ struct bpf_prog *replace_prog,
+ bool allow_multi)
+{
+ struct bpf_prog_list *pl;
+
+ /* single-attach case */
+ if (!allow_multi) {
+ if (list_empty(progs))
+ return NULL;
+ return list_first_entry(progs, typeof(*pl), node);
+ }
+
+ list_for_each_entry(pl, progs, node) {
+ if (prog && pl->prog == prog && prog != replace_prog)
+ /* disallow attaching the same prog twice */
+ return ERR_PTR(-EINVAL);
+ if (link && pl->link == link)
+ /* disallow attaching the same link twice */
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* direct prog multi-attach w/ replacement case */
+ if (replace_prog) {
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog == replace_prog)
+ /* a match found */
+ return pl;
+ }
+ /* prog to replace not found for cgroup */
+ return ERR_PTR(-ENOENT);
+ }
+
+ return NULL;
+}
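
A simplified userspace model (plain array instead of list_head, hypothetical types) of the attach-entry rules above: single-attach always targets the first slot, multi-attach without a replacement appends (NULL), and replace mode must find the exact prog or fail with -ENOENT.

#include <assert.h>
#include <errno.h>
#include <stddef.h>

struct prog { int id; };
struct prog_list { struct prog *prog; };

#define ERR_RET(err) ((struct prog_list *)(long)(err))	/* kernel-style ERR_PTR */

/* Returns the entry to replace, NULL to append a new one, or ERR_RET(-errno). */
static struct prog_list *find_entry(struct prog_list *pl, int cnt,
				    struct prog *prog, struct prog *replace_prog,
				    int allow_multi)
{
	int i;

	if (!allow_multi)		/* single-attach: overwrite first entry */
		return cnt ? &pl[0] : NULL;

	for (i = 0; i < cnt; i++)	/* attaching the same prog twice is rejected */
		if (prog && pl[i].prog == prog && prog != replace_prog)
			return ERR_RET(-EINVAL);

	if (replace_prog) {		/* BPF_F_REPLACE: must find the old prog */
		for (i = 0; i < cnt; i++)
			if (pl[i].prog == replace_prog)
				return &pl[i];
		return ERR_RET(-ENOENT);
	}
	return NULL;			/* multi-attach: append */
}

int main(void)
{
	struct prog a = {1}, b = {2}, c = {3};
	struct prog_list progs[] = { { &a }, { &b } };

	assert(find_entry(progs, 2, &c, NULL, 1) == NULL);	/* append */
	assert(find_entry(progs, 2, &c, &b, 1) == &progs[1]);	/* replace b */
	assert(find_entry(progs, 2, &c, NULL, 0) == &progs[0]);	/* single */
	return 0;
}
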
+
/**
- * __cgroup_bpf_attach() - Attach the program to a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to attach
+ * @link: A link to attach
* @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
* @type: Type of attach operation
* @flags: Option flags
*
+ * Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_prog *replace_prog,
- enum bpf_attach_type type, u32 flags)
+static int __cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type, u32 flags)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
- struct list_head *progs = &cgrp->bpf.progs[type];
struct bpf_prog *old_prog = NULL;
- struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
- *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
- struct bpf_prog_list *pl, *replace_pl = NULL;
- enum bpf_cgroup_storage_type stype;
+ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_prog_list *pl;
+ struct list_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
/* invalid combination */
return -EINVAL;
+ if (link && (prog || replace_prog))
+ /* only either link or prog/replace_prog can be specified */
+ return -EINVAL;
+ if (!!replace_prog != !!(flags & BPF_F_REPLACE))
+ /* replace_prog implies BPF_F_REPLACE, and vice versa */
+ return -EINVAL;
- if (!hierarchy_allows_attach(cgrp, type))
+ atype = to_cgroup_bpf_attach_type(type);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+
+ if (!hierarchy_allows_attach(cgrp, atype))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[type] != saved_flags)
+ if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -326,178 +474,308 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
return -E2BIG;
- if (flags & BPF_F_ALLOW_MULTI) {
- list_for_each_entry(pl, progs, node) {
- if (pl->prog == prog)
- /* disallow attaching the same prog twice */
- return -EINVAL;
- if (pl->prog == replace_prog)
- replace_pl = pl;
- }
- if ((flags & BPF_F_REPLACE) && !replace_pl)
- /* prog to replace not found for cgroup */
- return -ENOENT;
- } else if (!list_empty(progs)) {
- replace_pl = list_first_entry(progs, typeof(*pl), node);
- }
+ pl = find_attach_entry(progs, prog, link, replace_prog,
+ flags & BPF_F_ALLOW_MULTI);
+ if (IS_ERR(pl))
+ return PTR_ERR(pl);
- for_each_cgroup_storage_type(stype) {
- storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
- if (IS_ERR(storage[stype])) {
- storage[stype] = NULL;
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
- return -ENOMEM;
- }
- }
+ if (bpf_cgroup_storages_alloc(storage, new_storage, type,
+ prog ? : link->link.prog, cgrp))
+ return -ENOMEM;
- if (replace_pl) {
- pl = replace_pl;
+ if (pl) {
old_prog = pl->prog;
- for_each_cgroup_storage_type(stype) {
- old_storage[stype] = pl->storage[stype];
- bpf_cgroup_storage_unlink(old_storage[stype]);
- }
} else {
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_free(storage[stype]);
+ bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
list_add_tail(&pl->node, progs);
}
pl->prog = prog;
- for_each_cgroup_storage_type(stype)
- pl->storage[stype] = storage[stype];
+ pl->link = link;
+ bpf_cgroup_storages_assign(pl->storage, storage);
+ cgrp->bpf.flags[atype] = saved_flags;
- cgrp->bpf.flags[type] = saved_flags;
-
- err = update_effective_progs(cgrp, type);
+ err = update_effective_progs(cgrp, atype);
if (err)
goto cleanup;
- static_branch_inc(&cgroup_bpf_enabled_key);
- for_each_cgroup_storage_type(stype) {
- if (!old_storage[stype])
- continue;
- bpf_cgroup_storage_free(old_storage[stype]);
- }
- if (old_prog) {
+ if (old_prog)
bpf_prog_put(old_prog);
- static_branch_dec(&cgroup_bpf_enabled_key);
- }
- for_each_cgroup_storage_type(stype)
- bpf_cgroup_storage_link(storage[stype], cgrp, type);
+ else
+ static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+ bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
cleanup:
- /* and cleanup the prog list */
- pl->prog = old_prog;
- for_each_cgroup_storage_type(stype) {
- bpf_cgroup_storage_free(pl->storage[stype]);
- pl->storage[stype] = old_storage[stype];
- bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
+ if (old_prog) {
+ pl->prog = old_prog;
+ pl->link = NULL;
}
- if (!replace_pl) {
+ bpf_cgroup_storages_free(new_storage);
+ if (!old_prog) {
list_del(&pl->node);
kfree(pl);
}
return err;
}
+static int cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type,
+ u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
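As an illustrative sketch only (libbpf userspace side, not part of this patch): the wrapper above is what a plain bpf_prog_attach() from userspace ultimately lands in; cgroup_fd, prog_fd and the BPF_CGROUP_INET_INGRESS attach type below are placeholder assumptions.

#include <bpf/bpf.h>
#include <linux/bpf.h>

/* Editor's example, not from the patch: multi-attach to a cgroup. */
static int attach_multi(int cgroup_fd, int prog_fd)
{
	/* BPF_F_ALLOW_MULTI keeps existing programs on the cgroup and
	 * appends this one to the progs list handled by
	 * __cgroup_bpf_attach() above.
	 */
	return bpf_prog_attach(prog_fd, cgroup_fd,
			       BPF_CGROUP_INET_INGRESS, BPF_F_ALLOW_MULTI);
}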
+
+/* Swap updated BPF program for given link in effective program arrays across
+ * all descendant cgroups. This function is guaranteed to succeed.
+ */
+static void replace_effective_prog(struct cgroup *cgrp,
+ enum cgroup_bpf_attach_type atype,
+ struct bpf_cgroup_link *link)
+{
+ struct bpf_prog_array_item *item;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct list_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[atype];
+ list_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+found:
+ BUG_ON(!cg);
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ item = &progs->items[pos];
+ WRITE_ONCE(item->prog, link->link.prog);
+ }
+}
+
/**
- * __cgroup_bpf_detach() - Detach the program from a cgroup, and
- * propagate the change to descendants
+ * __cgroup_bpf_replace() - Replace link's program and propagate the change
+ * to descendants
* @cgrp: The cgroup which descendants to traverse
- * @prog: A program to detach or NULL
- * @type: Type of detach operation
+ * @link: A link for which to replace BPF program
+ * @new_prog: Updated BPF program
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type)
+static int __cgroup_bpf_replace(struct cgroup *cgrp,
+ struct bpf_cgroup_link *link,
+ struct bpf_prog *new_prog)
{
- struct list_head *progs = &cgrp->bpf.progs[type];
- enum bpf_cgroup_storage_type stype;
- u32 flags = cgrp->bpf.flags[type];
- struct bpf_prog *old_prog = NULL;
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- int err;
+ struct list_head *progs;
+ bool found = false;
- if (flags & BPF_F_ALLOW_MULTI) {
- if (!prog)
- /* to detach MULTI prog the user has to specify valid FD
- * of the program to be detached
- */
- return -EINVAL;
- } else {
- if (list_empty(progs))
- /* report error when trying to detach and nothing is attached */
- return -ENOENT;
- }
+ atype = to_cgroup_bpf_attach_type(link->type);
+ if (atype < 0)
+ return -EINVAL;
- if (flags & BPF_F_ALLOW_MULTI) {
- /* find the prog and detach it */
- list_for_each_entry(pl, progs, node) {
- if (pl->prog != prog)
- continue;
- old_prog = prog;
- /* mark it deleted, so it's ignored while
- * recomputing effective
- */
- pl->prog = NULL;
+ progs = &cgrp->bpf.progs[atype];
+
+ if (link->link.prog->type != new_prog->type)
+ return -EINVAL;
+
+ list_for_each_entry(pl, progs, node) {
+ if (pl->link == link) {
+ found = true;
break;
}
- if (!old_prog)
- return -ENOENT;
- } else {
+ }
+ if (!found)
+ return -ENOENT;
+
+ old_prog = xchg(&link->link.prog, new_prog);
+ replace_effective_prog(cgrp, atype, link);
+ bpf_prog_put(old_prog);
+ return 0;
+}
+
+static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_cgroup_link *cg_link;
+ int ret;
+
+ cg_link = container_of(link, struct bpf_cgroup_link, link);
+
+ mutex_lock(&cgroup_mutex);
+ /* link might have been auto-released by dying cgroup, so fail */
+ if (!cg_link->cgroup) {
+ ret = -ENOLINK;
+ goto out_unlock;
+ }
+ if (old_prog && link->prog != old_prog) {
+ ret = -EPERM;
+ goto out_unlock;
+ }
+ ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog);
+out_unlock:
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
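A hedged sketch of the userspace path into cgroup_bpf_replace() via BPF_LINK_UPDATE; libbpf's bpf_link_update() is assumed, and link_fd/new_prog_fd/old_prog_fd are placeholder names. BPF_F_REPLACE maps onto the old_prog check above.

#include <bpf/bpf.h>
#include <linux/bpf.h>

/* Editor's example, not from the patch: atomically swap a link's program. */
static int swap_link_prog(int link_fd, int new_prog_fd, int old_prog_fd)
{
	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts,
			    .flags = BPF_F_REPLACE,
			    .old_prog_fd = old_prog_fd);

	/* kernel side: the .update_prog callback, i.e. cgroup_bpf_replace() */
	return bpf_link_update(link_fd, new_prog_fd, &opts);
}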
+
+static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
+ struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ bool allow_multi)
+{
+ struct bpf_prog_list *pl;
+
+ if (!allow_multi) {
+ if (list_empty(progs))
+ /* report error when trying to detach and nothing is attached */
+ return ERR_PTR(-ENOENT);
+
/* to maintain backward compatibility NONE and OVERRIDE cgroups
- * allow detaching with invalid FD (prog==NULL)
+ * allow detaching with invalid FD (prog==NULL) in legacy mode
*/
- pl = list_first_entry(progs, typeof(*pl), node);
- old_prog = pl->prog;
- pl->prog = NULL;
+ return list_first_entry(progs, typeof(*pl), node);
}
- err = update_effective_progs(cgrp, type);
+ if (!prog && !link)
+ /* to detach MULTI prog the user has to specify valid FD
+ * of the program or link to be detached
+ */
+ return ERR_PTR(-EINVAL);
+
+ /* find the prog or link and detach it */
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog == prog && pl->link == link)
+ return pl;
+ }
+ return ERR_PTR(-ENOENT);
+}
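To make the two branches above concrete, a hedged userspace sketch (not from the patch) using libbpf; cgroup_fd and prog_fd are placeholders.

#include <bpf/bpf.h>
#include <linux/bpf.h>

/* Editor's example, not from the patch: the two detach modes. */
static void detach_examples(int cgroup_fd, int prog_fd)
{
	/* NONE/OVERRIDE (legacy) mode: no program fd needed, the first
	 * and only entry is picked by find_detach_entry().
	 */
	bpf_prog_detach(cgroup_fd, BPF_CGROUP_INET_INGRESS);

	/* BPF_F_ALLOW_MULTI mode: the exact program must be named by fd. */
	bpf_prog_detach2(prog_fd, cgroup_fd, BPF_CGROUP_INET_INGRESS);
}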
+
+/**
+ * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @type: Type of detach operation
+ *
+ * At most one of @prog or @link can be non-NULL.
+ * Must be called with cgroup_mutex held.
+ */
+static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, enum bpf_attach_type type)
+{
+ enum cgroup_bpf_attach_type atype;
+ struct bpf_prog *old_prog;
+ struct bpf_prog_list *pl;
+ struct list_head *progs;
+ u32 flags;
+ int err;
+
+ atype = to_cgroup_bpf_attach_type(type);
+ if (atype < 0)
+ return -EINVAL;
+
+ progs = &cgrp->bpf.progs[atype];
+ flags = cgrp->bpf.flags[atype];
+
+ if (prog && link)
+ /* only one of prog or link can be specified */
+ return -EINVAL;
+
+ pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI);
+ if (IS_ERR(pl))
+ return PTR_ERR(pl);
+
+ /* mark it deleted, so it's ignored while recomputing effective */
+ old_prog = pl->prog;
+ pl->prog = NULL;
+ pl->link = NULL;
+
+ err = update_effective_progs(cgrp, atype);
if (err)
goto cleanup;
/* now can actually delete it from this cgroup list */
list_del(&pl->node);
- for_each_cgroup_storage_type(stype) {
- bpf_cgroup_storage_unlink(pl->storage[stype]);
- bpf_cgroup_storage_free(pl->storage[stype]);
- }
kfree(pl);
if (list_empty(progs))
/* last program was detached, reset flags to zero */
- cgrp->bpf.flags[type] = 0;
-
- bpf_prog_put(old_prog);
- static_branch_dec(&cgroup_bpf_enabled_key);
+ cgrp->bpf.flags[atype] = 0;
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key[atype]);
return 0;
cleanup:
- /* and restore back old_prog */
+ /* restore back prog or link */
pl->prog = old_prog;
+ pl->link = link;
return err;
}
+static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
/* Must be called with cgroup_mutex held to avoid races. */
-int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
- struct list_head *progs = &cgrp->bpf.progs[type];
- u32 flags = cgrp->bpf.flags[type];
+ enum cgroup_bpf_attach_type atype;
struct bpf_prog_array *effective;
+ struct list_head *progs;
+ struct bpf_prog *prog;
int cnt, ret = 0, i;
+ u32 flags;
+
+ atype = to_cgroup_bpf_attach_type(type);
+ if (atype < 0)
+ return -EINVAL;
- effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+ progs = &cgrp->bpf.progs[atype];
+ flags = cgrp->bpf.flags[atype];
+
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
lockdep_is_held(&cgroup_mutex));
if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
@@ -525,7 +803,8 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
i = 0;
list_for_each_entry(pl, progs, node) {
- id = pl->prog->aux->id;
+ prog = prog_list_prog(pl);
+ id = prog->aux->id;
if (copy_to_user(prog_ids + i, &id, sizeof(id)))
return -EFAULT;
if (++i == cnt)
@@ -535,6 +814,17 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret;
}
+static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
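For completeness, a hedged sketch of the querying side that reaches the wrapper above; libbpf's bpf_prog_query() is assumed and cgroup_fd is a placeholder.

#include <stdio.h>
#include <bpf/bpf.h>
#include <linux/bpf.h>

/* Editor's example, not from the patch: list programs attached to a cgroup. */
static void dump_attached(int cgroup_fd)
{
	__u32 prog_ids[64], attach_flags = 0, cnt = 64;

	/* Passing BPF_F_QUERY_EFFECTIVE as query_flags would return the
	 * effective array instead of only directly attached programs.
	 */
	if (bpf_prog_query(cgroup_fd, BPF_CGROUP_INET_EGRESS, 0,
			   &attach_flags, prog_ids, &cnt))
		return;

	for (__u32 i = 0; i < cnt; i++)
		printf("prog id %u\n", prog_ids[i]);
}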
+
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
@@ -555,8 +845,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
}
}
- ret = cgroup_bpf_attach(cgrp, prog, replace_prog, attr->attach_type,
- attr->attach_flags);
+ ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
+ attr->attach_type, attr->attach_flags);
if (replace_prog)
bpf_prog_put(replace_prog);
@@ -578,7 +868,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
if (IS_ERR(prog))
prog = NULL;
- ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
if (prog)
bpf_prog_put(prog);
@@ -586,6 +876,141 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
return ret;
}
+static void bpf_cgroup_link_release(struct bpf_link *link)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ struct cgroup *cg;
+
+ /* link might have been auto-detached by dying cgroup already,
+ * in that case our work is done here
+ */
+ if (!cg_link->cgroup)
+ return;
+
+ mutex_lock(&cgroup_mutex);
+
+ /* re-check cgroup under lock again */
+ if (!cg_link->cgroup) {
+ mutex_unlock(&cgroup_mutex);
+ return;
+ }
+
+ WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
+ cg_link->type));
+
+ cg = cg_link->cgroup;
+ cg_link->cgroup = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+
+ cgroup_put(cg);
+}
+
+static void bpf_cgroup_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+
+ kfree(cg_link);
+}
+
+static int bpf_cgroup_link_detach(struct bpf_link *link)
+{
+ bpf_cgroup_link_release(link);
+
+ return 0;
+}
+
+static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ seq_printf(seq,
+ "cgroup_id:\t%llu\n"
+ "attach_type:\t%d\n",
+ cg_id,
+ cg_link->type);
+}
+
+static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_cgroup_link *cg_link =
+ container_of(link, struct bpf_cgroup_link, link);
+ u64 cg_id = 0;
+
+ mutex_lock(&cgroup_mutex);
+ if (cg_link->cgroup)
+ cg_id = cgroup_id(cg_link->cgroup);
+ mutex_unlock(&cgroup_mutex);
+
+ info->cgroup.cgroup_id = cg_id;
+ info->cgroup.attach_type = cg_link->type;
+ return 0;
+}
+
+static const struct bpf_link_ops bpf_cgroup_link_lops = {
+ .release = bpf_cgroup_link_release,
+ .dealloc = bpf_cgroup_link_dealloc,
+ .detach = bpf_cgroup_link_detach,
+ .update_prog = cgroup_bpf_replace,
+ .show_fdinfo = bpf_cgroup_link_show_fdinfo,
+ .fill_link_info = bpf_cgroup_link_fill_link_info,
+};
+
+int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct bpf_cgroup_link *link;
+ struct cgroup *cgrp;
+ int err;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out_put_cgroup;
+ }
+ bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops,
+ prog);
+ link->cgroup = cgrp;
+ link->type = attr->link_create.attach_type;
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto out_put_cgroup;
+ }
+
+ err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
+ link->type, BPF_F_ALLOW_MULTI);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ goto out_put_cgroup;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+out_put_cgroup:
+ cgroup_put(cgrp);
+ return err;
+}
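A minimal sketch (not from the patch) of how cgroup_bpf_link_attach() is reached from userspace, assuming libbpf's bpf_link_create(); cgroup_fd, prog_fd and the attach type are placeholders.

#include <stddef.h>
#include <bpf/bpf.h>
#include <linux/bpf.h>

/* Editor's example, not from the patch: link-based cgroup attachment. */
static int attach_via_link(int cgroup_fd, int prog_fd)
{
	/* Links always behave like BPF_F_ALLOW_MULTI attachments (see the
	 * cgroup_bpf_attach() call above); closing the last fd referencing
	 * the link auto-detaches the program via bpf_cgroup_link_release().
	 */
	return bpf_link_create(prog_fd, cgroup_fd,
			       BPF_CGROUP_INET_SOCK_CREATE, NULL);
}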
+
int cgroup_bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
@@ -627,7 +1052,7 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
*/
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
unsigned int offset = skb->data - skb_network_header(skb);
struct sock *save_sk;
@@ -649,12 +1074,12 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
/* compute pointers for the bpf prog */
bpf_compute_and_save_data_end(skb, &saved_data_end);
- if (type == BPF_CGROUP_INET_EGRESS) {
+ if (atype == CGROUP_INET_EGRESS) {
ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
- cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+ cgrp->bpf.effective[atype], skb, __bpf_prog_run_save_cb);
} else {
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
- __bpf_prog_run_save_cb);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], skb,
+ __bpf_prog_run_save_cb);
ret = (ret == 1 ? 0 : -EPERM);
}
bpf_restore_data_end(skb, saved_data_end);
@@ -679,12 +1104,12 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
* and if it returned != 1 during execution. In all other cases, 0 is returned.
*/
int __cgroup_bpf_run_filter_sk(struct sock *sk,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sk, bpf_prog_run);
return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -696,6 +1121,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
* @uaddr: sockaddr struct provided by user
 * @type: The type of program to be executed
* @t_ctx: Pointer to attach type specific context
+ * @flags: Pointer to u32 which contains higher bits of BPF program
+ * return value (OR'ed together).
*
* socket is expected to be of type INET or INET6.
*
@@ -704,8 +1131,9 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
*/
int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
struct sockaddr *uaddr,
- enum bpf_attach_type type,
- void *t_ctx)
+ enum cgroup_bpf_attach_type atype,
+ void *t_ctx,
+ u32 *flags)
{
struct bpf_sock_addr_kern ctx = {
.sk = sk,
@@ -728,7 +1156,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
}
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG_FLAGS(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run, flags);
return ret == 1 ? 0 : -EPERM;
}
@@ -752,19 +1181,19 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
*/
int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
struct bpf_sock_ops_kern *sock_ops,
- enum bpf_attach_type type)
+ enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
int ret;
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
- BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], sock_ops,
+ bpf_prog_run);
return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
- short access, enum bpf_attach_type type)
+ short access, enum cgroup_bpf_attach_type atype)
{
struct cgroup *cgrp;
struct bpf_cgroup_dev_ctx ctx = {
@@ -772,46 +1201,31 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
.major = major,
.minor = minor,
};
- int allow = 1;
+ int allow;
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
- BPF_PROG_RUN);
+ allow = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx,
+ bpf_prog_run);
rcu_read_unlock();
return !allow;
}
-EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_map_push_elem:
- return &bpf_map_push_elem_proto;
- case BPF_FUNC_map_pop_elem:
- return &bpf_map_pop_elem_proto;
- case BPF_FUNC_map_peek_elem:
- return &bpf_map_peek_elem_proto;
case BPF_FUNC_get_current_uid_gid:
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_get_current_cgroup_id:
return &bpf_get_current_cgroup_id_proto;
- case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
- return bpf_get_trace_printk_proto();
- /* fall through */
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
default:
- return NULL;
+ return bpf_base_func_proto(func_id);
}
}
@@ -865,16 +1279,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
* @head: sysctl table header
* @table: sysctl table
* @write: sysctl is being read (= 0) or written (= 1)
- * @buf: pointer to buffer passed by user space
+ * @buf: pointer to buffer (in and out)
* @pcount: value-result argument: value is size of buffer pointed to by @buf,
* result is size of @new_buf if program set new value, initial value
* otherwise
* @ppos: value-result argument: value is position at which read from or write
* to sysctl is happening, result is new position if program overrode it,
* initial value otherwise
- * @new_buf: pointer to pointer to new buffer that will be allocated if program
- * overrides new value provided by user space on sysctl write
- * NOTE: it's caller responsibility to free *new_buf if it was set
* @type: type of program to be executed
*
* Program is run when sysctl is being accessed, either read or written, and
@@ -885,9 +1296,8 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = {
*/
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
struct ctl_table *table, int write,
- void __user *buf, size_t *pcount,
- loff_t *ppos, void **new_buf,
- enum bpf_attach_type type)
+ char **buf, size_t *pcount, loff_t *ppos,
+ enum cgroup_bpf_attach_type atype)
{
struct bpf_sysctl_kern ctx = {
.head = head,
@@ -901,47 +1311,40 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
.new_updated = 0,
};
struct cgroup *cgrp;
+ loff_t pos = 0;
int ret;
ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
- if (ctx.cur_val) {
- mm_segment_t old_fs;
- loff_t pos = 0;
-
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
- &ctx.cur_len, &pos)) {
- /* Let BPF program decide how to proceed. */
- ctx.cur_len = 0;
- }
- set_fs(old_fs);
- } else {
+ if (!ctx.cur_val ||
+ table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) {
/* Let BPF program decide how to proceed. */
ctx.cur_len = 0;
}
- if (write && buf && *pcount) {
+ if (write && *buf && *pcount) {
/* BPF program should be able to override new value with a
* buffer bigger than provided by user.
*/
ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
- if (!ctx.new_val ||
- copy_from_user(ctx.new_val, buf, ctx.new_len))
+ if (ctx.new_val) {
+ memcpy(ctx.new_val, *buf, ctx.new_len);
+ } else {
/* Let BPF program decide how to proceed. */
ctx.new_len = 0;
+ }
}
rcu_read_lock();
cgrp = task_dfl_cgroup(current);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[atype], &ctx, bpf_prog_run);
rcu_read_unlock();
kfree(ctx.cur_val);
if (ret == 1 && ctx.new_updated) {
- *new_buf = ctx.new_val;
+ kfree(*buf);
+ *buf = ctx.new_val;
*pcount = ctx.new_len;
} else {
kfree(ctx.new_val);
@@ -949,11 +1352,10 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
return ret == 1 ? 0 : -EPERM;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
#ifdef CONFIG_NET
static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
- enum bpf_attach_type attach_type)
+ enum cgroup_bpf_attach_type attach_type)
{
struct bpf_prog_array *prog_array;
bool empty;
@@ -966,30 +1368,57 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
return empty;
}
-static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen,
+ struct bpf_sockopt_buf *buf)
{
- if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+ if (unlikely(max_optlen < 0))
return -EINVAL;
+ if (unlikely(max_optlen > PAGE_SIZE)) {
+ /* We don't expose optvals that are greater than PAGE_SIZE
+ * to the BPF program.
+ */
+ max_optlen = PAGE_SIZE;
+ }
+
+ if (max_optlen <= sizeof(buf->data)) {
+ /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE
+ * bytes avoid the cost of kzalloc.
+ */
+ ctx->optval = buf->data;
+ ctx->optval_end = ctx->optval + max_optlen;
+ return max_optlen;
+ }
+
ctx->optval = kzalloc(max_optlen, GFP_USER);
if (!ctx->optval)
return -ENOMEM;
ctx->optval_end = ctx->optval + max_optlen;
- return 0;
+ return max_optlen;
}
-static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
{
+ if (ctx->optval == buf->data)
+ return;
kfree(ctx->optval);
}
+static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx,
+ struct bpf_sockopt_buf *buf)
+{
+ return ctx->optval != buf->data;
+}
+
int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
int *optname, char __user *optval,
int *optlen, char **kernel_optval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = *level,
@@ -1001,8 +1430,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
*/
- if (!cgroup_bpf_enabled ||
- __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+ if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_SETSOCKOPT))
return 0;
/* Allocate a bit more than the initial user buffer for
@@ -1011,20 +1439,20 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
*/
max_optlen = max_t(int, 16, *optlen);
- ret = sockopt_alloc_buf(&ctx, max_optlen);
- if (ret)
- return ret;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+ if (max_optlen < 0)
+ return max_optlen;
ctx.optlen = *optlen;
- if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+ if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_SETSOCKOPT],
+ &ctx, bpf_prog_run);
release_sock(sk);
if (!ret) {
@@ -1045,16 +1473,39 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
/* export any potential modifications */
*level = ctx.level;
*optname = ctx.optname;
- *optlen = ctx.optlen;
- *kernel_optval = ctx.optval;
+
+ /* optlen == 0 from BPF indicates that we should
+ * use original userspace data.
+ */
+ if (ctx.optlen != 0) {
+ *optlen = ctx.optlen;
+ /* We've used bpf_sockopt_kern->buf as an intermediary
+ * storage, but the BPF program indicates that we need
+ * to pass this data to the kernel setsockopt handler.
+ * No way to export on-stack buf, have to allocate a
+ * new buffer.
+ */
+ if (!sockopt_buf_allocated(&ctx, &buf)) {
+ void *p = kmalloc(ctx.optlen, GFP_USER);
+
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memcpy(p, ctx.optval, ctx.optlen);
+ *kernel_optval = p;
+ } else {
+ *kernel_optval = ctx.optval;
+ }
+ /* export and don't free sockopt buf */
+ return 0;
+ }
}
out:
- if (ret)
- sockopt_free_buf(&ctx);
+ sockopt_free_buf(&ctx, &buf);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
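A minimal BPF-side sketch (built separately with clang, not part of this patch) of the ctx.optlen == 0 convention handled above; the program and section names are illustrative.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Editor's example, not from the patch: allow setsockopt() untouched. */
SEC("cgroup/setsockopt")
int setsockopt_passthrough(struct bpf_sockopt *ctx)
{
	/* Leaving the data alone and zeroing optlen tells the kernel to
	 * keep using the original userspace buffer.
	 */
	ctx->optlen = 0;
	return 1;
}

char _license[] SEC("license") = "GPL";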
int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int optname, char __user *optval,
@@ -1062,6 +1513,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
int retval)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_buf buf = {};
struct bpf_sockopt_kern ctx = {
.sk = sk,
.level = level,
@@ -1074,16 +1526,15 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
* attached to the hook so we don't waste time allocating
* memory and locking the socket.
*/
- if (!cgroup_bpf_enabled ||
- __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+ if (__cgroup_bpf_prog_array_is_empty(cgrp, CGROUP_GETSOCKOPT))
return retval;
- ret = sockopt_alloc_buf(&ctx, max_optlen);
- if (ret)
- return ret;
-
ctx.optlen = max_optlen;
+ max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf);
+ if (max_optlen < 0)
+ return max_optlen;
+
if (!retval) {
/* If kernel getsockopt finished successfully,
* copy whatever was returned to the user back
@@ -1097,18 +1548,21 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (ctx.optlen > max_optlen)
- ctx.optlen = max_optlen;
+ if (ctx.optlen < 0) {
+ ret = -EFAULT;
+ goto out;
+ }
- if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+ if (copy_from_user(ctx.optval, optval,
+ min(ctx.optlen, max_optlen)) != 0) {
ret = -EFAULT;
goto out;
}
}
lock_sock(sk);
- ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
- &ctx, BPF_PROG_RUN);
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+ &ctx, bpf_prog_run);
release_sock(sk);
if (!ret) {
@@ -1116,7 +1570,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (ctx.optlen > max_optlen) {
+ if (ctx.optlen > max_optlen || ctx.optlen < 0) {
ret = -EFAULT;
goto out;
}
@@ -1129,19 +1583,66 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
goto out;
}
- if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
- put_user(ctx.optlen, optlen)) {
- ret = -EFAULT;
- goto out;
+ if (ctx.optlen != 0) {
+ if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+ put_user(ctx.optlen, optlen)) {
+ ret = -EFAULT;
+ goto out;
+ }
}
ret = ctx.retval;
out:
- sockopt_free_buf(&ctx);
+ sockopt_free_buf(&ctx, &buf);
return ret;
}
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+
+int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level,
+ int optname, void *optval,
+ int *optlen, int retval)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_sockopt_kern ctx = {
+ .sk = sk,
+ .level = level,
+ .optname = optname,
+ .retval = retval,
+ .optlen = *optlen,
+ .optval = optval,
+ .optval_end = optval + *optlen,
+ };
+ int ret;
+
+ /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy
+ * user data back into BPF buffer when retval != 0. This is
+ * done as an optimization to avoid extra copy, assuming
+ * kernel won't populate the data in case of an error.
+ * Here we always pass the data and memset() should
+ * be called if that data shouldn't be "exported".
+ */
+
+ ret = BPF_PROG_RUN_ARRAY_CG(cgrp->bpf.effective[CGROUP_GETSOCKOPT],
+ &ctx, bpf_prog_run);
+ if (!ret)
+ return -EPERM;
+
+ if (ctx.optlen > *optlen)
+ return -EFAULT;
+
+ /* BPF programs only allowed to set retval to 0, not some
+ * arbitrary value.
+ */
+ if (ctx.retval != 0 && ctx.retval != retval)
+ return -EFAULT;
+
+ /* BPF programs can shrink the buffer, export the modifications.
+ */
+ if (ctx.optlen != 0)
+ *optlen = ctx.optlen;
+
+ return ctx.retval;
+}
#endif
static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
@@ -1288,7 +1789,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -1308,6 +1809,8 @@ sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sysctl_get_new_value_proto;
case BPF_FUNC_sysctl_set_new_value:
return &bpf_sysctl_set_new_value_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
default:
return cgroup_base_func_proto(func_id, prog);
}
@@ -1411,15 +1914,41 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
const struct bpf_prog_ops cg_sysctl_prog_ops = {
};
+#ifdef CONFIG_NET
+BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx)
+{
+ const struct net *net = ctx ? sock_net(ctx->sk) : &init_net;
+
+ return net->net_cookie;
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = {
+ .func = bpf_get_netns_cookie_sockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+#endif
+
static const struct bpf_func_proto *
cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
#ifdef CONFIG_NET
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sockopt_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT)
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
#endif
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
@@ -1451,7 +1980,7 @@ static bool cg_sockopt_is_valid_access(int off, int size,
return prog->expected_attach_type ==
BPF_CGROUP_GETSOCKOPT;
case offsetof(struct bpf_sockopt, optname):
- /* fallthrough */
+ fallthrough;
case offsetof(struct bpf_sockopt, level):
if (size != size_default)
return false;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 973a20d49749..de3e5bc6781f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -25,13 +25,16 @@
#include <linux/moduleloader.h>
#include <linux/bpf.h>
#include <linux/btf.h>
-#include <linux/frame.h>
+#include <linux/objtool.h>
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
+#include <linux/bpf_verifier.h>
+
+#include <asm/barrier.h>
#include <asm/unaligned.h>
/* Registers */
@@ -77,34 +80,42 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog_aux *aux;
struct bpf_prog *fp;
size = round_up(size, PAGE_SIZE);
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ fp = __vmalloc(size, gfp_flags);
if (fp == NULL)
return NULL;
- aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
return NULL;
}
+ fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags);
+ if (!fp->active) {
+ vfree(fp);
+ kfree(aux);
+ return NULL;
+ }
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
- INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
+ INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
+ mutex_init(&fp->aux->used_maps_mutex);
+ mutex_init(&fp->aux->dst_mutex);
return fp;
}
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *prog;
int cpu;
@@ -112,8 +123,9 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (!prog)
return NULL;
- prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
- if (!prog->aux->stats) {
+ prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags);
+ if (!prog->stats) {
+ free_percpu(prog->active);
kfree(prog->aux);
vfree(prog);
return NULL;
@@ -122,7 +134,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
for_each_possible_cpu(cpu) {
struct bpf_prog_stats *pstats;
- pstats = per_cpu_ptr(prog->aux->stats, cpu);
+ pstats = per_cpu_ptr(prog->stats, cpu);
u64_stats_init(&pstats->syncp);
}
return prog;
@@ -134,25 +146,25 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog)
if (!prog->aux->nr_linfo || !prog->jit_requested)
return 0;
- prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo,
- sizeof(*prog->aux->jited_linfo),
- GFP_KERNEL | __GFP_NOWARN);
+ prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo,
+ sizeof(*prog->aux->jited_linfo),
+ GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (!prog->aux->jited_linfo)
return -ENOMEM;
return 0;
}
-void bpf_prog_free_jited_linfo(struct bpf_prog *prog)
+void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
{
- kfree(prog->aux->jited_linfo);
- prog->aux->jited_linfo = NULL;
-}
+ if (prog->aux->jited_linfo &&
+ (!prog->jited || !prog->aux->jited_linfo[0])) {
+ kvfree(prog->aux->jited_linfo);
+ prog->aux->jited_linfo = NULL;
+ }
-void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog)
-{
- if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0])
- bpf_prog_free_jited_linfo(prog);
+ kfree(prog->aux->kfunc_tab);
+ prog->aux->kfunc_tab = NULL;
}
/* The jit engine is responsible to provide an array
@@ -208,34 +220,20 @@ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
insn_to_jit_off[linfo[i].insn_off - insn_start - 1];
}
-void bpf_prog_free_linfo(struct bpf_prog *prog)
-{
- bpf_prog_free_jited_linfo(prog);
- kvfree(prog->aux->linfo);
-}
-
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
gfp_t gfp_extra_flags)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
+ gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *fp;
- u32 pages, delta;
- int ret;
+ u32 pages;
size = round_up(size, PAGE_SIZE);
pages = size / PAGE_SIZE;
if (pages <= fp_old->pages)
return fp_old;
- delta = pages - fp_old->pages;
- ret = __bpf_prog_charge(fp_old->aux->user, delta);
- if (ret)
- return NULL;
-
- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
- if (fp == NULL) {
- __bpf_prog_uncharge(fp_old->aux->user, delta);
- } else {
+ fp = __vmalloc(size, gfp_flags);
+ if (fp) {
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = pages;
fp->aux->prog = fp;
@@ -244,6 +242,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
* reallocated structure.
*/
fp_old->aux = NULL;
+ fp_old->stats = NULL;
+ fp_old->active = NULL;
__bpf_prog_free(fp_old);
}
@@ -253,19 +253,22 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
void __bpf_prog_free(struct bpf_prog *fp)
{
if (fp->aux) {
- free_percpu(fp->aux->stats);
+ mutex_destroy(&fp->aux->used_maps_mutex);
+ mutex_destroy(&fp->aux->dst_mutex);
kfree(fp->aux->poke_tab);
kfree(fp->aux);
}
+ free_percpu(fp->stats);
+ free_percpu(fp->active);
vfree(fp);
}
int bpf_prog_calc_tag(struct bpf_prog *fp)
{
- const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64);
+ const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64);
u32 raw_size = bpf_prog_tag_scratch_size(fp);
- u32 digest[SHA_DIGEST_WORDS];
- u32 ws[SHA_WORKSPACE_WORDS];
+ u32 digest[SHA1_DIGEST_WORDS];
+ u32 ws[SHA1_WORKSPACE_WORDS];
u32 i, bsize, psize, blocks;
struct bpf_insn *dst;
bool was_ld_map;
@@ -277,7 +280,7 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
if (!raw)
return -ENOMEM;
- sha_init(digest);
+ sha1_init(digest);
memset(ws, 0, sizeof(ws));
/* We need to take out the map fd for the digest calculation
@@ -308,8 +311,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
memset(&raw[psize], 0, raw_size - psize);
raw[psize++] = 0x80;
- bsize = round_up(psize, SHA_MESSAGE_BYTES);
- blocks = bsize / SHA_MESSAGE_BYTES;
+ bsize = round_up(psize, SHA1_BLOCK_SIZE);
+ blocks = bsize / SHA1_BLOCK_SIZE;
todo = raw;
if (bsize - psize >= sizeof(__be64)) {
bits = (__be64 *)(todo + bsize - sizeof(__be64));
@@ -320,12 +323,12 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
*bits = cpu_to_be64((psize - 1) << 3);
while (blocks--) {
- sha_transform(digest, todo, ws);
- todo += SHA_MESSAGE_BYTES;
+ sha1_transform(digest, todo, ws);
+ todo += SHA1_BLOCK_SIZE;
}
result = (__force __be32 *)digest;
- for (i = 0; i < SHA_DIGEST_WORDS; i++)
+ for (i = 0; i < SHA1_DIGEST_WORDS; i++)
result[i] = cpu_to_be32(digest[i]);
memcpy(fp->tag, result, sizeof(fp->tag));
@@ -387,6 +390,13 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
i = end_new;
insn = prog->insnsi + end_old;
}
+ if (bpf_pseudo_func(insn)) {
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
+ if (ret)
+ return ret;
+ continue;
+ }
code = insn->code;
if ((BPF_CLASS(code) != BPF_JMP &&
BPF_CLASS(code) != BPF_JMP32) ||
@@ -522,23 +532,24 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
long bpf_jit_limit __read_mostly;
+long bpf_jit_limit_max __read_mostly;
-static __always_inline void
-bpf_get_prog_addr_region(const struct bpf_prog *prog,
- unsigned long *symbol_start,
- unsigned long *symbol_end)
+static void
+bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
unsigned long addr = (unsigned long)hdr;
WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));
- *symbol_start = addr;
- *symbol_end = addr + hdr->pages * PAGE_SIZE;
+ prog->aux->ksym.start = (unsigned long) prog->bpf_func;
+ prog->aux->ksym.end = addr + hdr->pages * PAGE_SIZE;
}
-void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
+static void
+bpf_prog_ksym_set_name(struct bpf_prog *prog)
{
+ char *sym = prog->aux->ksym.name;
const char *end = sym + KSYM_NAME_LEN;
const struct btf_type *type;
const char *func_name;
@@ -572,36 +583,27 @@ void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
*sym = 0;
}
-static __always_inline unsigned long
-bpf_get_prog_addr_start(struct latch_tree_node *n)
+static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
{
- unsigned long symbol_start, symbol_end;
- const struct bpf_prog_aux *aux;
-
- aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
- bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
-
- return symbol_start;
+ return container_of(n, struct bpf_ksym, tnode)->start;
}
static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
struct latch_tree_node *b)
{
- return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b);
+ return bpf_get_ksym_start(a) < bpf_get_ksym_start(b);
}
static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
{
unsigned long val = (unsigned long)key;
- unsigned long symbol_start, symbol_end;
- const struct bpf_prog_aux *aux;
+ const struct bpf_ksym *ksym;
- aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
- bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
+ ksym = container_of(n, struct bpf_ksym, tnode);
- if (val < symbol_start)
+ if (val < ksym->start)
return -1;
- if (val >= symbol_end)
+ if (val >= ksym->end)
return 1;
return 0;
@@ -616,20 +618,29 @@ static DEFINE_SPINLOCK(bpf_lock);
static LIST_HEAD(bpf_kallsyms);
static struct latch_tree_root bpf_tree __cacheline_aligned;
-static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
+void bpf_ksym_add(struct bpf_ksym *ksym)
{
- WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
- list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms);
- latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
+ spin_lock_bh(&bpf_lock);
+ WARN_ON_ONCE(!list_empty(&ksym->lnode));
+ list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
+ latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
+ spin_unlock_bh(&bpf_lock);
}
-static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux)
+static void __bpf_ksym_del(struct bpf_ksym *ksym)
{
- if (list_empty(&aux->ksym_lnode))
+ if (list_empty(&ksym->lnode))
return;
- latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
- list_del_rcu(&aux->ksym_lnode);
+ latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
+ list_del_rcu(&ksym->lnode);
+}
+
+void bpf_ksym_del(struct bpf_ksym *ksym)
+{
+ spin_lock_bh(&bpf_lock);
+ __bpf_ksym_del(ksym);
+ spin_unlock_bh(&bpf_lock);
}
static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
@@ -639,19 +650,21 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
- return list_empty(&fp->aux->ksym_lnode) ||
- fp->aux->ksym_lnode.prev == LIST_POISON2;
+ return list_empty(&fp->aux->ksym.lnode) ||
+ fp->aux->ksym.lnode.prev == LIST_POISON2;
}
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
- !capable(CAP_SYS_ADMIN))
+ !bpf_capable())
return;
- spin_lock_bh(&bpf_lock);
- bpf_prog_ksym_node_add(fp->aux);
- spin_unlock_bh(&bpf_lock);
+ bpf_prog_ksym_set_addr(fp);
+ bpf_prog_ksym_set_name(fp);
+ fp->aux->ksym.prog = true;
+
+ bpf_ksym_add(&fp->aux->ksym);
}
void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -659,33 +672,30 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp)
if (!bpf_prog_kallsyms_candidate(fp))
return;
- spin_lock_bh(&bpf_lock);
- bpf_prog_ksym_node_del(fp->aux);
- spin_unlock_bh(&bpf_lock);
+ bpf_ksym_del(&fp->aux->ksym);
}
-static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
+static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
{
struct latch_tree_node *n;
n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
- return n ?
- container_of(n, struct bpf_prog_aux, ksym_tnode)->prog :
- NULL;
+ return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
}
const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
unsigned long *off, char *sym)
{
- unsigned long symbol_start, symbol_end;
- struct bpf_prog *prog;
+ struct bpf_ksym *ksym;
char *ret = NULL;
rcu_read_lock();
- prog = bpf_prog_kallsyms_find(addr);
- if (prog) {
- bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end);
- bpf_get_prog_name(prog, sym);
+ ksym = bpf_ksym_find(addr);
+ if (ksym) {
+ unsigned long symbol_start = ksym->start;
+ unsigned long symbol_end = ksym->end;
+
+ strncpy(sym, ksym->name, KSYM_NAME_LEN);
ret = sym;
if (size)
@@ -703,19 +713,28 @@ bool is_bpf_text_address(unsigned long addr)
bool ret;
rcu_read_lock();
- ret = bpf_prog_kallsyms_find(addr) != NULL;
+ ret = bpf_ksym_find(addr) != NULL;
rcu_read_unlock();
return ret;
}
+static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
+{
+ struct bpf_ksym *ksym = bpf_ksym_find(addr);
+
+ return ksym && ksym->prog ?
+ container_of(ksym, struct bpf_prog_aux, ksym)->prog :
+ NULL;
+}
+
const struct exception_table_entry *search_bpf_extables(unsigned long addr)
{
const struct exception_table_entry *e = NULL;
struct bpf_prog *prog;
rcu_read_lock();
- prog = bpf_prog_kallsyms_find(addr);
+ prog = bpf_prog_ksym_find(addr);
if (!prog)
goto out;
if (!prog->aux->num_exentries)
@@ -730,7 +749,7 @@ out:
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *sym)
{
- struct bpf_prog_aux *aux;
+ struct bpf_ksym *ksym;
unsigned int it = 0;
int ret = -ERANGE;
@@ -738,13 +757,13 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
return ret;
rcu_read_lock();
- list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) {
+ list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
if (it++ != symnum)
continue;
- bpf_get_prog_name(aux->prog, sym);
+ strncpy(sym, ksym->name, KSYM_NAME_LEN);
- *value = (unsigned long)aux->prog->bpf_func;
+ *value = ksym->start;
*type = BPF_SYM_ELF_TYPE;
ret = 0;
@@ -765,7 +784,8 @@ int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
if (size > poke_tab_max)
return -ENOSPC;
- if (poke->ip || poke->ip_stable || poke->adj_off)
+ if (poke->tailcall_target || poke->tailcall_target_stable ||
+ poke->tailcall_bypass || poke->adj_off || poke->bypass_addr)
return -EINVAL;
switch (poke->reason) {
@@ -806,17 +826,18 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+ bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
PAGE_SIZE), LONG_MAX);
return 0;
}
pure_initcall(bpf_jit_charge_init);
-static int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
atomic_long_sub(pages, &bpf_jit_current);
return -EPERM;
}
@@ -825,7 +846,7 @@ static int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-static void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 pages)
{
atomic_long_sub(pages, &bpf_jit_current);
}
@@ -1081,7 +1102,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags;
struct bpf_prog *fp;
- fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
+ fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags);
if (fp != NULL) {
/* aux->prog still points to the fp_other one, so
* when promoting the clone to the real program,
@@ -1103,6 +1124,8 @@ static void bpf_prog_clone_free(struct bpf_prog *fp)
* clone is guaranteed to not be locked.
*/
fp->aux = NULL;
+ fp->stats = NULL;
+ fp->active = NULL;
__bpf_prog_free(fp);
}
@@ -1304,8 +1327,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
INSN_3(STX, MEM, H), \
INSN_3(STX, MEM, W), \
INSN_3(STX, MEM, DW), \
- INSN_3(STX, XADD, W), \
- INSN_3(STX, XADD, DW), \
+ INSN_3(STX, ATOMIC, W), \
+ INSN_3(STX, ATOMIC, DW), \
/* Immediate based. */ \
INSN_3(ST, MEM, B), \
INSN_3(ST, MEM, H), \
@@ -1349,14 +1372,15 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
}
/**
- * __bpf_prog_run - run eBPF program on a given context
+ * ___bpf_prog_run - run eBPF program on a given context
* @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers
* @insn: is the array of eBPF instructions
- * @stack: is the eBPF storage stack
*
* Decode and execute eBPF instructions.
+ *
+ * Return: whatever value is in %BPF_R0 at program exit
*/
-static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
@@ -1367,6 +1391,7 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6
/* Non-UAPI available opcodes. */
[BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS,
[BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
+ [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC,
[BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B,
[BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H,
[BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W,
@@ -1382,29 +1407,54 @@ static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u6
select_insn:
goto *jumptable[insn->code];
- /* ALU */
-#define ALU(OPCODE, OP) \
- ALU64_##OPCODE##_X: \
- DST = DST OP SRC; \
- CONT; \
- ALU_##OPCODE##_X: \
- DST = (u32) DST OP (u32) SRC; \
- CONT; \
- ALU64_##OPCODE##_K: \
- DST = DST OP IMM; \
- CONT; \
- ALU_##OPCODE##_K: \
- DST = (u32) DST OP (u32) IMM; \
+ /* Explicitly mask the register-based shift amounts with 63 or 31
+ * to avoid undefined behavior. Normally this won't affect the
+ * generated code, for example, in case of native 64 bit archs such
+ * as x86-64 or arm64, the compiler is optimizing the AND away for
+ * the interpreter. In case of JITs, each of the JIT backends compiles
+ * the BPF shift operations to machine instructions which produce
+ * implementation-defined results in such a case; the resulting
+ * contents of the register may be arbitrary, but program behaviour
+ * as a whole remains defined. In other words, in case of JIT backends,
+ * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation.
+ */
+ /* ALU (shifts) */
+#define SHT(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP (SRC & 63); \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP ((u32) SRC & 31); \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+ /* ALU (rest) */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
CONT;
-
ALU(ADD, +)
ALU(SUB, -)
ALU(AND, &)
ALU(OR, |)
- ALU(LSH, <<)
- ALU(RSH, >>)
ALU(XOR, ^)
ALU(MUL, *)
+ SHT(LSH, <<)
+ SHT(RSH, >>)
+#undef SHT
#undef ALU
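A small standalone C analogy (editor's addition, not from the patch) of why the interpreter masks register-based shift amounts; the helper name is invented for the example.

#include <stdint.h>

/* dst << src is undefined for src >= 64; masking keeps it defined and,
 * on x86-64/arm64, matches what the hardware shift instruction does with
 * the amount anyway.
 */
static uint64_t alu64_lsh_x(uint64_t dst, uint64_t src)
{
	return dst << (src & 63);
}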
ALU_NEG:
DST = (u32) -DST;
@@ -1429,13 +1479,13 @@ select_insn:
insn++;
CONT;
ALU_ARSH_X:
- DST = (u64) (u32) (((s32) DST) >> SRC);
+ DST = (u64) (u32) (((s32) DST) >> (SRC & 31));
CONT;
ALU_ARSH_K:
DST = (u64) (u32) (((s32) DST) >> IMM);
CONT;
ALU64_ARSH_X:
- (*(s64 *) &DST) >>= SRC;
+ (*(s64 *) &DST) >>= (SRC & 63);
CONT;
ALU64_ARSH_K:
(*(s64 *) &DST) >>= IMM;
@@ -1524,7 +1574,8 @@ select_insn:
if (unlikely(index >= array->map.max_entries))
goto out;
- if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
+
+ if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
@@ -1535,7 +1586,7 @@ select_insn:
/* ARG1 at this point is guaranteed to point to CTX from
* the verifier side due to the fact that the tail call is
- * handeled like a helper, that is, bpf_tail_call_proto,
+ * handled like a helper, that is, bpf_tail_call_proto,
* where arg1_type is ARG_PTR_TO_CTX.
*/
insn = prog->insnsi;
@@ -1586,7 +1637,21 @@ out:
COND_JMP(s, JSGE, >=)
COND_JMP(s, JSLE, <=)
#undef COND_JMP
- /* STX and ST and LDX*/
+ /* ST, STX and LDX */
+ ST_NOSPEC:
+ /* Speculation barrier for mitigating Speculative Store Bypass.
+ * In case of arm64, we rely on the firmware mitigation as
+ * controlled via the ssbd kernel parameter. Whenever the
+ * mitigation is enabled, it works for all of the kernel code
+ * with no need to provide any additional instructions here.
+ * In case of x86, we use 'lfence' insn for mitigation. We
+ * reuse preexisting logic from Spectre v1 mitigation that
+ * happens to produce the required code on x86 for v4 as well.
+ */
+#ifdef CONFIG_X86
+ barrier_nospec();
+#endif
+ CONT;
#define LDST(SIZEOP, SIZE) \
STX_MEM_##SIZEOP: \
*(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
@@ -1613,13 +1678,59 @@ out:
LDX_PROBE(DW, 8)
#undef LDX_PROBE
- STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
- atomic_add((u32) SRC, (atomic_t *)(unsigned long)
- (DST + insn->off));
- CONT;
- STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
- atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
- (DST + insn->off));
+#define ATOMIC_ALU_OP(BOP, KOP) \
+ case BOP: \
+ if (BPF_SIZE(insn->code) == BPF_W) \
+ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \
+ (DST + insn->off)); \
+ else \
+ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \
+ (DST + insn->off)); \
+ break; \
+ case BOP | BPF_FETCH: \
+ if (BPF_SIZE(insn->code) == BPF_W) \
+ SRC = (u32) atomic_fetch_##KOP( \
+ (u32) SRC, \
+ (atomic_t *)(unsigned long) (DST + insn->off)); \
+ else \
+ SRC = (u64) atomic64_fetch_##KOP( \
+ (u64) SRC, \
+ (atomic64_t *)(unsigned long) (DST + insn->off)); \
+ break;
+
+ STX_ATOMIC_DW:
+ STX_ATOMIC_W:
+ switch (IMM) {
+ ATOMIC_ALU_OP(BPF_ADD, add)
+ ATOMIC_ALU_OP(BPF_AND, and)
+ ATOMIC_ALU_OP(BPF_OR, or)
+ ATOMIC_ALU_OP(BPF_XOR, xor)
+#undef ATOMIC_ALU_OP
+
+ case BPF_XCHG:
+ if (BPF_SIZE(insn->code) == BPF_W)
+ SRC = (u32) atomic_xchg(
+ (atomic_t *)(unsigned long) (DST + insn->off),
+ (u32) SRC);
+ else
+ SRC = (u64) atomic64_xchg(
+ (atomic64_t *)(unsigned long) (DST + insn->off),
+ (u64) SRC);
+ break;
+ case BPF_CMPXCHG:
+ if (BPF_SIZE(insn->code) == BPF_W)
+ BPF_R0 = (u32) atomic_cmpxchg(
+ (atomic_t *)(unsigned long) (DST + insn->off),
+ (u32) BPF_R0, (u32) SRC);
+ else
+ BPF_R0 = (u64) atomic64_cmpxchg(
+ (atomic64_t *)(unsigned long) (DST + insn->off),
+ (u64) BPF_R0, (u64) SRC);
+ break;
+
+ default:
+ goto default_label;
+ }
CONT;
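A hedged BPF C sketch of code that exercises the new STX_ATOMIC cases, assuming a clang/LLVM recent enough to emit BPF atomic instructions for the __sync builtins (e.g. with -mcpu=v3); 'counter' is a hypothetical global.

#include <linux/types.h>

__u64 counter;

/* Editor's example, not from the patch. */
static __u64 atomic_examples(void)
{
	/* BPF_ADD | BPF_FETCH: the old value comes back in the source register. */
	__u64 old = __sync_fetch_and_add(&counter, 1);

	/* BPF_CMPXCHG: R0 carries the expected value in and the old value out. */
	return __sync_val_compare_and_swap(&counter, old, old + 1);
}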
default_label:
@@ -1629,7 +1740,8 @@ out:
*
* Note, verifier whitelists all opcodes in bpf_opcode_in_insntable().
*/
- pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code);
+ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n",
+ insn->code, insn->imm);
BUG_ON(1);
return 0;
}
@@ -1643,7 +1755,7 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
- return ___bpf_prog_run(regs, insn, stack); \
+ return ___bpf_prog_run(regs, insn); \
}
#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size
@@ -1660,7 +1772,7 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
BPF_R3 = r3; \
BPF_R4 = r4; \
BPF_R5 = r5; \
- return ___bpf_prog_run(regs, insn, stack); \
+ return ___bpf_prog_run(regs, insn); \
}
#define EVAL1(FN, X) FN(X)
@@ -1720,27 +1832,34 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
+ bool ret;
+
if (fp->kprobe_override)
return false;
- if (!array->aux->type) {
+ spin_lock(&array->aux->owner.lock);
+
+ if (!array->aux->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->type = fp->type;
- array->aux->jited = fp->jited;
- return true;
+ array->aux->owner.type = fp->type;
+ array->aux->owner.jited = fp->jited;
+ ret = true;
+ } else {
+ ret = array->aux->owner.type == fp->type &&
+ array->aux->owner.jited == fp->jited;
}
-
- return array->aux->type == fp->type &&
- array->aux->jited == fp->jited;
+ spin_unlock(&array->aux->owner.lock);
+ return ret;
}
static int bpf_check_tail_call(const struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
- int i;
+ int i, ret = 0;
+ mutex_lock(&aux->used_maps_mutex);
for (i = 0; i < aux->used_map_cnt; i++) {
struct bpf_map *map = aux->used_maps[i];
struct bpf_array *array;
@@ -1749,11 +1868,15 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
continue;
array = container_of(map, struct bpf_array, map);
- if (!bpf_prog_array_compatible(array, fp))
- return -EINVAL;
+ if (!bpf_prog_array_compatible(array, fp)) {
+ ret = -EINVAL;
+ goto out;
+ }
}
- return 0;
+out:
+ mutex_unlock(&aux->used_maps_mutex);
+ return ret;
}
static void bpf_prog_select_func(struct bpf_prog *fp)
@@ -1769,20 +1892,29 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
- * @fp: bpf_prog populated with internal BPF program
+ * @fp: bpf_prog populated with BPF program
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
- * The BPF program will be executed via BPF_PROG_RUN() macro.
+ * The BPF program will be executed via bpf_prog_run() function.
+ *
+ * Return: the &fp argument along with &err set to 0 for success or
+ * a negative errno code on failure
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
/* In case of BPF to BPF calls, verifier did all the prep
* work with regards to JITing, etc.
*/
+ bool jit_needed = false;
+
if (fp->bpf_func)
goto finalize;
+ if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) ||
+ bpf_prog_has_kfunc_call(fp))
+ jit_needed = true;
+
bpf_prog_select_func(fp);
/* eBPF JITs can rewrite the program in case constant
@@ -1797,14 +1929,10 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
return fp;
fp = bpf_int_jit_compile(fp);
- if (!fp->jited) {
- bpf_prog_free_jited_linfo(fp);
-#ifdef CONFIG_BPF_JIT_ALWAYS_ON
+ bpf_prog_jit_attempt_done(fp);
+ if (!fp->jited && jit_needed) {
*err = -ENOTSUPP;
return fp;
-#endif
- } else {
- bpf_prog_free_unused_jited_linfo(fp);
}
} else {
*err = bpf_prog_offload_compile(fp);
@@ -1950,16 +2078,71 @@ void bpf_prog_array_delete_safe(struct bpf_prog_array *array,
}
}
+/**
+ * bpf_prog_array_delete_safe_at() - Replaces the program at the given
+ * index into the program array with
+ * a dummy no-op program.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to replace
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to replace.
+ *
+ * Return:
+ * * 0 - Success
+ * * -EINVAL - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT - Index out of range
+ */
+int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index)
+{
+ return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog);
+}
+
+/**
+ * bpf_prog_array_update_at() - Updates the program at the given index
+ * into the program array.
+ * @array: a bpf_prog_array
+ * @index: the index of the program to update
+ * @prog: the program to insert into the array
+ *
+ * Skips over dummy programs, by not counting them, when calculating
+ * the position of the program to update.
+ *
+ * Return:
+ * * 0 - Success
+ * * -EINVAL - Invalid index value. Must be a non-negative integer.
+ * * -ENOENT - Index out of range
+ */
+int bpf_prog_array_update_at(struct bpf_prog_array *array, int index,
+ struct bpf_prog *prog)
+{
+ struct bpf_prog_array_item *item;
+
+ if (unlikely(index < 0))
+ return -EINVAL;
+
+ for (item = array->items; item->prog; item++) {
+ if (item->prog == &dummy_bpf_prog.prog)
+ continue;
+ if (!index) {
+ WRITE_ONCE(item->prog, prog);
+ return 0;
+ }
+ index--;
+ }
+ return -ENOENT;
+}
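A hypothetical caller of the two helpers documented above (not from this patch); note that both skip dummy entries, so the index counts only real programs:

static int example_swap_or_retire(struct bpf_prog_array *array, int idx,
				  struct bpf_prog *new_prog)
{
	if (new_prog)
		return bpf_prog_array_update_at(array, idx, new_prog);

	/* no replacement: stub the slot with the dummy no-op program */
	return bpf_prog_array_delete_safe_at(array, idx);
}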
+
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
+ u64 bpf_cookie,
struct bpf_prog_array **new_array)
{
int new_prog_cnt, carry_prog_cnt = 0;
- struct bpf_prog_array_item *existing;
+ struct bpf_prog_array_item *existing, *new;
struct bpf_prog_array *array;
bool found_exclude = false;
- int new_prog_idx = 0;
/* Figure out how many existing progs we need to carry over to
* the new array.
@@ -1996,20 +2179,27 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
if (!array)
return -ENOMEM;
+ new = array->items;
/* Fill in the new prog array */
if (carry_prog_cnt) {
existing = old_array->items;
- for (; existing->prog; existing++)
- if (existing->prog != exclude_prog &&
- existing->prog != &dummy_bpf_prog.prog) {
- array->items[new_prog_idx++].prog =
- existing->prog;
- }
+ for (; existing->prog; existing++) {
+ if (existing->prog == exclude_prog ||
+ existing->prog == &dummy_bpf_prog.prog)
+ continue;
+
+ new->prog = existing->prog;
+ new->bpf_cookie = existing->bpf_cookie;
+ new++;
+ }
}
- if (include_prog)
- array->items[new_prog_idx++].prog = include_prog;
- array->items[new_prog_idx].prog = NULL;
+ if (include_prog) {
+ new->prog = include_prog;
+ new->bpf_cookie = bpf_cookie;
+ new++;
+ }
+ new->prog = NULL;
*new_array = array;
return 0;
}
@@ -2034,24 +2224,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array,
: 0;
}
-static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)
-{
- enum bpf_cgroup_storage_type stype;
-
- for_each_cgroup_storage_type(stype) {
- if (!aux->cgroup_storage[stype])
- continue;
- bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]);
- }
-}
-
void __bpf_free_used_maps(struct bpf_prog_aux *aux,
struct bpf_map **used_maps, u32 len)
{
struct bpf_map *map;
u32 i;
- bpf_free_cgroup_storage(aux);
for (i = 0; i < len; i++) {
map = used_maps[i];
if (map->ops->map_poke_untrack)
@@ -2066,22 +2244,55 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
+void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
+ struct btf_mod_pair *used_btfs, u32 len)
+{
+#ifdef CONFIG_BPF_SYSCALL
+ struct btf_mod_pair *btf_mod;
+ u32 i;
+
+ for (i = 0; i < len; i++) {
+ btf_mod = &used_btfs[i];
+ if (btf_mod->module)
+ module_put(btf_mod->module);
+ btf_put(btf_mod->btf);
+ }
+#endif
+}
+
+static void bpf_free_used_btfs(struct bpf_prog_aux *aux)
+{
+ __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt);
+ kfree(aux->used_btfs);
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
int i;
aux = container_of(work, struct bpf_prog_aux, work);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
+#endif
bpf_free_used_maps(aux);
+ bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
#ifdef CONFIG_PERF_EVENTS
if (aux->prog->has_callchain_buf)
put_callchain_buffers();
#endif
- bpf_trampoline_put(aux->trampoline);
- for (i = 0; i < aux->func_cnt; i++)
+ if (aux->dst_trampoline)
+ bpf_trampoline_put(aux->dst_trampoline);
+ for (i = 0; i < aux->func_cnt; i++) {
+ /* We can just unlink the subprog poke descriptor table as
+ * it was originally linked to the main program and is also
+ * released along with it.
+ */
+ aux->func[i]->aux->poke_tab = NULL;
bpf_jit_free(aux->func[i]);
+ }
if (aux->func_cnt) {
kfree(aux->func);
bpf_prog_unlock_free(aux->prog);
@@ -2090,13 +2301,12 @@ static void bpf_prog_free_deferred(struct work_struct *work)
}
}
-/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
- if (aux->linked_prog)
- bpf_prog_put(aux->linked_prog);
+ if (aux->dst_prog)
+ bpf_prog_put(aux->dst_prog);
INIT_WORK(&aux->work, bpf_prog_free_deferred);
schedule_work(&aux->work);
}
@@ -2128,6 +2338,11 @@ BPF_CALL_0(bpf_user_rnd_u32)
return res;
}
+BPF_CALL_0(bpf_get_raw_cpu_id)
+{
+ return raw_smp_processor_id();
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
@@ -2143,18 +2358,29 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
const struct bpf_func_proto bpf_get_numa_node_id_proto __weak;
const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak;
+const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak;
const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
const struct bpf_func_proto bpf_get_current_comm_proto __weak;
const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
+const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak;
const struct bpf_func_proto bpf_get_local_storage_proto __weak;
+const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
+const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
+const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
return NULL;
}
+const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
+{
+ return NULL;
+}
+
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
@@ -2197,12 +2423,21 @@ bool __weak bpf_helper_changes_pkt_data(void *func)
/* Return TRUE if the JIT backend wants verifier to enable sub-register usage
* analysis code and wants explicit zero extension inserted by verifier.
* Otherwise, return FALSE.
+ *
+ * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if
+ * you don't override this. JITs that don't want these extra insns can detect
+ * them using insn_is_zext.
*/
bool __weak bpf_jit_needs_zext(void)
{
return false;
}
+bool __weak bpf_jit_supports_kfunc_call(void)
+{
+ return false;
+}
+
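The weak default above is meant to be overridden by JITs that can emit direct calls into the kernel; as a sketch of the arch side (mirroring what e.g. the x86 JIT does, shown only for illustration):

bool bpf_jit_supports_kfunc_call(void)
{
	return true;	/* this JIT can call kernel functions directly */
}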
/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
* skb_copy_bits(), so provide a weak definition of it for NET-less config.
*/
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 70f71b154fa5..b3e6b9422238 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -16,6 +16,7 @@
* netstack, and assigning dedicated CPUs for this stage. This
* basically allows for 10G wirespeed pre-filtering via bpf.
*/
+#include <linux/bitops.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ptr_ring.h>
@@ -27,7 +28,7 @@
#include <linux/capability.h>
#include <trace/events/xdp.h>
-#include <linux/netdevice.h> /* netif_receive_skb_core */
+#include <linux/netdevice.h> /* netif_receive_skb_list */
#include <linux/etherdevice.h> /* eth_type_trans */
/* General idea: XDP packets getting XDP redirected to another CPU,
@@ -52,7 +53,6 @@ struct xdp_bulk_queue {
struct bpf_cpu_map_entry {
u32 cpu; /* kthread CPU and map index */
int map_id; /* Back reference to map */
- u32 qsize; /* Queue size placeholder for map lookup */
/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
struct xdp_bulk_queue __percpu *bulkq;
@@ -62,38 +62,41 @@ struct bpf_cpu_map_entry {
/* Queue with potential multi-producers, and single-consumer kthread */
struct ptr_ring *queue;
struct task_struct *kthread;
- struct work_struct kthread_stop_wq;
+
+ struct bpf_cpumap_val value;
+ struct bpf_prog *prog;
atomic_t refcnt; /* Control when this struct can be free'ed */
struct rcu_head rcu;
+
+ struct work_struct kthread_stop_wq;
};
struct bpf_cpu_map {
struct bpf_map map;
/* Below members specific for map type */
- struct bpf_cpu_map_entry **cpu_map;
+ struct bpf_cpu_map_entry __rcu **cpu_map;
};
static DEFINE_PER_CPU(struct list_head, cpu_map_flush_list);
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq);
-
static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
{
+ u32 value_size = attr->value_size;
struct bpf_cpu_map *cmap;
int err = -ENOMEM;
- u64 cost;
- int ret;
- if (!capable(CAP_SYS_ADMIN))
+ if (!bpf_capable())
return ERR_PTR(-EPERM);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+ (value_size != offsetofend(struct bpf_cpumap_val, qsize) &&
+ value_size != offsetofend(struct bpf_cpumap_val, bpf_prog.fd)) ||
+ attr->map_flags & ~BPF_F_NUMA_NODE)
return ERR_PTR(-EINVAL);
- cmap = kzalloc(sizeof(*cmap), GFP_USER);
+ cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);
if (!cmap)
return ERR_PTR(-ENOMEM);
@@ -105,26 +108,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
goto free_cmap;
}
- /* make sure page count doesn't overflow */
- cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
-
- /* Notice returns -EPERM on if map size is larger than memlock limit */
- ret = bpf_map_charge_init(&cmap->map.memory, cost);
- if (ret) {
- err = ret;
- goto free_cmap;
- }
-
/* Alloc array for possible remote "destination" CPUs */
cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
sizeof(struct bpf_cpu_map_entry *),
cmap->map.numa_node);
if (!cmap->cpu_map)
- goto free_charge;
+ goto free_cmap;
return &cmap->map;
-free_charge:
- bpf_map_charge_finish(&cmap->map.memory);
free_cmap:
kfree(cmap);
return ERR_PTR(err);
@@ -151,65 +142,6 @@ static void cpu_map_kthread_stop(struct work_struct *work)
kthread_stop(rcpu->kthread);
}
-static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
- struct xdp_frame *xdpf,
- struct sk_buff *skb)
-{
- unsigned int hard_start_headroom;
- unsigned int frame_size;
- void *pkt_data_start;
-
- /* Part of headroom was reserved to xdpf */
- hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom;
-
- /* build_skb need to place skb_shared_info after SKB end, and
- * also want to know the memory "truesize". Thus, need to
- * know the memory frame size backing xdp_buff.
- *
- * XDP was designed to have PAGE_SIZE frames, but this
- * assumption is not longer true with ixgbe and i40e. It
- * would be preferred to set frame_size to 2048 or 4096
- * depending on the driver.
- * frame_size = 2048;
- * frame_len = frame_size - sizeof(*xdp_frame);
- *
- * Instead, with info avail, skb_shared_info in placed after
- * packet len. This, unfortunately fakes the truesize.
- * Another disadvantage of this approach, the skb_shared_info
- * is not at a fixed memory location, with mixed length
- * packets, which is bad for cache-line hotness.
- */
- frame_size = SKB_DATA_ALIGN(xdpf->len + hard_start_headroom) +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-
- pkt_data_start = xdpf->data - hard_start_headroom;
- skb = build_skb_around(skb, pkt_data_start, frame_size);
- if (unlikely(!skb))
- return NULL;
-
- skb_reserve(skb, hard_start_headroom);
- __skb_put(skb, xdpf->len);
- if (xdpf->metasize)
- skb_metadata_set(skb, xdpf->metasize);
-
- /* Essential SKB info: protocol and skb->dev */
- skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
-
- /* Optional SKB info, currently missing:
- * - HW checksum info (skb->ip_summed)
- * - HW RX hash (skb_set_hash)
- * - RX ring dev queue index (skb_record_rx_queue)
- */
-
- /* Until page_pool get SKB return path, release DMA here */
- xdp_release_frame(xdpf);
-
- /* Allow SKB to reuse area used by xdp_frame */
- xdp_scrub_frame(xdpf);
-
- return skb;
-}
-
static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
{
/* The tear-down procedure should have made sure that queue is
@@ -227,6 +159,8 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
{
if (atomic_dec_and_test(&rcpu->refcnt)) {
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
/* The queue should be empty at this point */
__cpu_map_ring_cleanup(rcpu->queue);
ptr_ring_cleanup(rcpu->queue, NULL);
@@ -235,8 +169,132 @@ static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
}
}
+static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
+ struct list_head *listp,
+ struct xdp_cpumap_stats *stats)
+{
+ struct sk_buff *skb, *tmp;
+ struct xdp_buff xdp;
+ u32 act;
+ int err;
+
+ list_for_each_entry_safe(skb, tmp, listp, list) {
+ act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
+ switch (act) {
+ case XDP_PASS:
+ break;
+ case XDP_REDIRECT:
+ skb_list_del_init(skb);
+ err = xdp_do_generic_redirect(skb->dev, skb, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ return;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(skb->dev, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ skb_list_del_init(skb);
+ kfree_skb(skb);
+ stats->drop++;
+ return;
+ }
+ }
+}
+
+static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
+ void **frames, int n,
+ struct xdp_cpumap_stats *stats)
+{
+ struct xdp_rxq_info rxq;
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ xdp_set_return_frame_no_direct();
+ xdp.rxq = &rxq;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ rxq.dev = xdpf->dev_rx;
+ rxq.mem = xdpf->mem;
+ /* TODO: report queue_index to xdp_rxq_info */
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+
+ act = bpf_prog_run_xdp(rcpu->prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (err < 0) {
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ } else {
+ frames[nframes++] = xdpf;
+ stats->pass++;
+ }
+ break;
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(xdpf->dev_rx, &xdp,
+ rcpu->prog);
+ if (unlikely(err)) {
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ } else {
+ stats->redirect++;
+ }
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame(xdpf);
+ stats->drop++;
+ break;
+ }
+ }
+
+ xdp_clear_return_frame_no_direct();
+
+ return nframes;
+}
+
#define CPUMAP_BATCH 8
+static int cpu_map_bpf_prog_run(struct bpf_cpu_map_entry *rcpu, void **frames,
+ int xdp_n, struct xdp_cpumap_stats *stats,
+ struct list_head *list)
+{
+ int nframes;
+
+ if (!rcpu->prog)
+ return xdp_n;
+
+ rcu_read_lock_bh();
+
+ nframes = cpu_map_bpf_prog_run_xdp(rcpu, frames, xdp_n, stats);
+
+ if (stats->redirect)
+ xdp_do_flush();
+
+ if (unlikely(!list_empty(list)))
+ cpu_map_bpf_prog_run_skb(rcpu, list, stats);
+
+ rcu_read_unlock_bh(); /* resched point, may call do_softirq() */
+
+ return nframes;
+}
+
static int cpu_map_kthread_run(void *data)
{
struct bpf_cpu_map_entry *rcpu = data;
@@ -249,11 +307,13 @@ static int cpu_map_kthread_run(void *data)
* kthread_stop signal until queue is empty.
*/
while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
- unsigned int drops = 0, sched = 0;
+ struct xdp_cpumap_stats stats = {}; /* zero stats */
+ unsigned int kmem_alloc_drops = 0, sched = 0;
+ gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n, m, nframes, xdp_n;
void *frames[CPUMAP_BATCH];
void *skbs[CPUMAP_BATCH];
- gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
- int i, n, m;
+ LIST_HEAD(list);
/* Release CPU reschedule checks */
if (__ptr_ring_empty(rcpu->queue)) {
@@ -274,11 +334,22 @@ static int cpu_map_kthread_run(void *data)
* kthread CPU pinned. Lockless access to ptr_ring
* consume side valid as no-resize allowed of queue.
*/
- n = ptr_ring_consume_batched(rcpu->queue, frames, CPUMAP_BATCH);
-
- for (i = 0; i < n; i++) {
+ n = __ptr_ring_consume_batched(rcpu->queue, frames,
+ CPUMAP_BATCH);
+ for (i = 0, xdp_n = 0; i < n; i++) {
void *f = frames[i];
- struct page *page = virt_to_page(f);
+ struct page *page;
+
+ if (unlikely(__ptr_test_bit(0, &f))) {
+ struct sk_buff *skb = f;
+
+ __ptr_clear_bit(0, &skb);
+ list_add_tail(&skb->list, &list);
+ continue;
+ }
+
+ frames[xdp_n++] = f;
+ page = virt_to_page(f);
/* Bring struct page memory area to curr CPU. Read by
* build_skb_around via page_is_pfmemalloc(), and when
@@ -287,32 +358,36 @@ static int cpu_map_kthread_run(void *data)
prefetchw(page);
}
- m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, n, skbs);
- if (unlikely(m == 0)) {
- for (i = 0; i < n; i++)
- skbs[i] = NULL; /* effect: xdp_return_frame */
- drops = n;
+ /* Support running another XDP prog on this CPU */
+ nframes = cpu_map_bpf_prog_run(rcpu, frames, xdp_n, &stats, &list);
+ if (nframes) {
+ m = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, skbs);
+ if (unlikely(m == 0)) {
+ for (i = 0; i < nframes; i++)
+ skbs[i] = NULL; /* effect: xdp_return_frame */
+ kmem_alloc_drops += nframes;
+ }
}
local_bh_disable();
- for (i = 0; i < n; i++) {
+ for (i = 0; i < nframes; i++) {
struct xdp_frame *xdpf = frames[i];
struct sk_buff *skb = skbs[i];
- int ret;
- skb = cpu_map_build_skb(rcpu, xdpf, skb);
+ skb = __xdp_build_skb_from_frame(xdpf, skb,
+ xdpf->dev_rx);
if (!skb) {
xdp_return_frame(xdpf);
continue;
}
- /* Inject into network stack */
- ret = netif_receive_skb_core(skb);
- if (ret == NET_RX_DROP)
- drops++;
+ list_add_tail(&skb->list, &list);
}
+ netif_receive_skb_list(&list);
+
/* Feedback loop via tracepoint */
- trace_xdp_cpumap_kthread(rcpu->map_id, n, drops, sched);
+ trace_xdp_cpumap_kthread(rcpu->map_id, n, kmem_alloc_drops,
+ sched, &stats);
local_bh_enable(); /* resched point, may call do_softirq() */
}
@@ -322,24 +397,44 @@ static int cpu_map_kthread_run(void *data)
return 0;
}
-static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
- int map_id)
+static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd)
{
+ struct bpf_prog *prog;
+
+ prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->expected_attach_type != BPF_XDP_CPUMAP) {
+ bpf_prog_put(prog);
+ return -EINVAL;
+ }
+
+ rcpu->value.bpf_prog.id = prog->aux->id;
+ rcpu->prog = prog;
+
+ return 0;
+}
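Putting the cpumap changes together (the struct bpf_cpumap_val value layout and the BPF_XDP_CPUMAP attach-type check above), a userspace sketch of programming an entry might look like this; libbpf names are assumed from its standard API and the file descriptors are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Attach an XDP program (loaded with expected_attach_type ==
 * BPF_XDP_CPUMAP) to CPU 2 of a cpumap created with
 * value_size == sizeof(struct bpf_cpumap_val).
 */
static int program_cpumap_entry(int cpumap_fd, int xdp_prog_fd)
{
	struct bpf_cpumap_val val = {
		.qsize = 2048,			/* ptr_ring size, capped at 16384 */
		.bpf_prog.fd = xdp_prog_fd,	/* <= 0 means no per-CPU program */
	};
	__u32 cpu = 2;

	return bpf_map_update_elem(cpumap_fd, &cpu, &val, 0);
}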
+
+static struct bpf_cpu_map_entry *
+__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,
+ u32 cpu)
+{
+ int numa, err, i, fd = value->bpf_prog.fd;
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct bpf_cpu_map_entry *rcpu;
struct xdp_bulk_queue *bq;
- int numa, err, i;
/* Have map->numa_node, but choose node of redirect target CPU */
numa = cpu_to_node(cpu);
- rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+ rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa);
if (!rcpu)
return NULL;
/* Alloc percpu bulkq */
- rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
- sizeof(void *), gfp);
+ rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq),
+ sizeof(void *), gfp);
if (!rcpu->bulkq)
goto free_rcu;
@@ -349,23 +444,28 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
}
/* Alloc queue */
- rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+ rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp,
+ numa);
if (!rcpu->queue)
goto free_bulkq;
- err = ptr_ring_init(rcpu->queue, qsize, gfp);
+ err = ptr_ring_init(rcpu->queue, value->qsize, gfp);
if (err)
goto free_queue;
rcpu->cpu = cpu;
- rcpu->map_id = map_id;
- rcpu->qsize = qsize;
+ rcpu->map_id = map->id;
+ rcpu->value.qsize = value->qsize;
+
+ if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd))
+ goto free_ptr_ring;
/* Setup kthread */
rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
- "cpumap/%d/map:%d", cpu, map_id);
+ "cpumap/%d/map:%d", cpu,
+ map->id);
if (IS_ERR(rcpu->kthread))
- goto free_ptr_ring;
+ goto free_prog;
get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
@@ -376,6 +476,9 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
return rcpu;
+free_prog:
+ if (rcpu->prog)
+ bpf_prog_put(rcpu->prog);
free_ptr_ring:
ptr_ring_cleanup(rcpu->queue, NULL);
free_queue:
@@ -427,7 +530,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
{
struct bpf_cpu_map_entry *old_rcpu;
- old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+ old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));
if (old_rcpu) {
call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
@@ -452,12 +555,12 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+ struct bpf_cpumap_val cpumap_value = {};
struct bpf_cpu_map_entry *rcpu;
-
/* Array index key correspond to CPU number */
u32 key_cpu = *(u32 *)key;
- /* Value is the queue size */
- u32 qsize = *(u32 *)value;
+
+ memcpy(&cpumap_value, value, map->value_size);
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
@@ -465,18 +568,18 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
return -E2BIG;
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
- if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+ if (unlikely(cpumap_value.qsize > 16384)) /* sanity limit on qsize */
return -EOVERFLOW;
/* Make sure CPU is a valid possible cpu */
- if (!cpu_possible(key_cpu))
+ if (key_cpu >= nr_cpumask_bits || !cpu_possible(key_cpu))
return -ENODEV;
- if (qsize == 0) {
+ if (cpumap_value.qsize == 0) {
rcpu = NULL; /* Same as deleting */
} else {
/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
- rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+ rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu);
if (!rcpu)
return -ENOMEM;
rcpu->cmap = cmap;
@@ -501,7 +604,6 @@ static void cpu_map_free(struct bpf_map *map)
* complete.
*/
- bpf_clear_redirect_map(map);
synchronize_rcu();
/* For cpu_map the remote CPUs can still be using the entries
@@ -510,7 +612,7 @@ static void cpu_map_free(struct bpf_map *map)
for (i = 0; i < cmap->map.max_entries; i++) {
struct bpf_cpu_map_entry *rcpu;
- rcpu = READ_ONCE(cmap->cpu_map[i]);
+ rcpu = rcu_dereference_raw(cmap->cpu_map[i]);
if (!rcpu)
continue;
@@ -521,7 +623,11 @@ static void cpu_map_free(struct bpf_map *map)
kfree(cmap);
}
-struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
struct bpf_cpu_map_entry *rcpu;
@@ -529,7 +635,8 @@ struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
if (key >= map->max_entries)
return NULL;
- rcpu = READ_ONCE(cmap->cpu_map[key]);
+ rcpu = rcu_dereference_check(cmap->cpu_map[key],
+ rcu_read_lock_bh_held());
return rcpu;
}
@@ -538,7 +645,7 @@ static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
struct bpf_cpu_map_entry *rcpu =
__cpu_map_lookup_elem(map, *(u32 *)key);
- return rcpu ? &rcpu->qsize : NULL;
+ return rcpu ? &rcpu->value : NULL;
}
static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -558,7 +665,15 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags, 0,
+ __cpu_map_lookup_elem);
+}
+
+static int cpu_map_btf_id;
const struct bpf_map_ops cpu_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = cpu_map_alloc,
.map_free = cpu_map_free,
.map_delete_elem = cpu_map_delete_elem,
@@ -566,9 +681,12 @@ const struct bpf_map_ops cpu_map_ops = {
.map_lookup_elem = cpu_map_lookup_elem,
.map_get_next_key = cpu_map_get_next_key,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_cpu_map",
+ .map_btf_id = &cpu_map_btf_id,
+ .map_redirect = cpu_map_redirect,
};
-static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
+static void bq_flush_to_queue(struct xdp_bulk_queue *bq)
{
struct bpf_cpu_map_entry *rcpu = bq->obj;
unsigned int processed = 0, drops = 0;
@@ -577,7 +695,7 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
int i;
if (unlikely(!bq->count))
- return 0;
+ return;
q = rcpu->queue;
spin_lock(&q->producer_lock);
@@ -600,13 +718,12 @@ static int bq_flush_to_queue(struct xdp_bulk_queue *bq)
/* Feedback loop via tracepoints */
trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
- return 0;
}
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
+static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
@@ -627,19 +744,11 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
if (!bq->flush_node.prev)
list_add(&bq->flush_node, flush_list);
-
- return 0;
}
-int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- struct xdp_frame *xdpf;
-
- xdpf = convert_to_xdp_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
/* Info needed when constructing SKB on remote CPU */
xdpf->dev_rx = dev_rx;
@@ -647,6 +756,25 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
return 0;
}
+int cpu_map_generic_redirect(struct bpf_cpu_map_entry *rcpu,
+ struct sk_buff *skb)
+{
+ int ret;
+
+ __skb_pull(skb, skb->mac_len);
+ skb_set_redirected(skb, false);
+ __ptr_set_bit(0, &skb);
+
+ ret = ptr_ring_produce(rcpu->queue, skb);
+ if (ret < 0)
+ goto trace;
+
+ wake_up_process(rcpu->kthread);
+trace:
+ trace_xdp_cpumap_enqueue(rcpu->map_id, !ret, !!ret, rcpu->cpu);
+ return ret;
+}
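A note on the __ptr_set_bit() call above: skbs and xdp_frames share the same ptr_ring, and since both pointers are at least word aligned, bit 0 is always clear and can carry one bit of type information. The consumer in cpu_map_kthread_run() undoes it; a condensed sketch of the consumer end (illustrative restatement, not additional patch code):

void *entry = frames[i];

if (__ptr_test_bit(0, &entry)) {	/* bit 0 set: producer queued an sk_buff */
	struct sk_buff *skb = entry;

	__ptr_clear_bit(0, &skb);	/* recover the real pointer */
	list_add_tail(&skb->list, &list);
} else {				/* bit 0 clear: a regular xdp_frame */
	struct xdp_frame *xdpf = entry;

	frames[xdp_n++] = xdpf;
}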
+
void __cpu_map_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&cpu_map_flush_list);
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 58bdca5d978a..fe019dbdb3f0 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -52,12 +52,12 @@
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-#define DEV_MAP_BULK_SIZE 16
struct xdp_dev_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
struct list_head flush_node;
struct net_device *dev;
struct net_device *dev_rx;
+ struct bpf_prog *xdp_prog;
unsigned int count;
};
@@ -65,13 +65,15 @@ struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct hlist_node index_hlist;
struct bpf_dtab *dtab;
+ struct bpf_prog *xdp_prog;
struct rcu_head rcu;
unsigned int idx;
+ struct bpf_devmap_val val;
};
struct bpf_dtab {
struct bpf_map map;
- struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
+ struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
struct list_head list;
/* these are only used for DEVMAP_HASH type maps */
@@ -85,12 +87,13 @@ static DEFINE_PER_CPU(struct list_head, dev_flush_list);
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
-static struct hlist_head *dev_map_create_hash(unsigned int entries)
+static struct hlist_head *dev_map_create_hash(unsigned int entries,
+ int numa_node)
{
int i;
struct hlist_head *hash;
- hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
+ hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
if (hash != NULL)
for (i = 0; i < entries; i++)
INIT_HLIST_HEAD(&hash[i]);
@@ -106,12 +109,16 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
- u64 cost = 0;
- int err;
+ u32 valsize = attr->value_size;
- /* check sanity of attributes */
+ /* check sanity of attributes. 2 value sizes supported:
+ * 4 bytes: ifindex
+ * 8 bytes: ifindex + prog fd
+ */
if (attr->max_entries == 0 || attr->key_size != 4 ||
- attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
+ (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
+ valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
+ attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return -EINVAL;
/* Lookup returns a pointer straight to dev->ifindex, so make sure the
@@ -127,35 +134,24 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
if (!dtab->n_buckets) /* Overflow check */
return -EINVAL;
- cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets;
- } else {
- cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
}
- /* if map size is larger than memlock limit, reject it */
- err = bpf_map_charge_init(&dtab->map.memory, cost);
- if (err)
- return -EINVAL;
-
if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
- dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
+ dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
+ dtab->map.numa_node);
if (!dtab->dev_index_head)
- goto free_charge;
+ return -ENOMEM;
spin_lock_init(&dtab->index_lock);
} else {
- dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
- goto free_charge;
+ return -ENOMEM;
}
return 0;
-
-free_charge:
- bpf_map_charge_finish(&dtab->map.memory);
- return -ENOMEM;
}
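With the two value layouts accepted above, a userspace sketch of populating a devmap slot with a per-entry egress program (libbpf names assumed; the ifindex and fds are illustrative):

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Bind ifindex 4 to slot 0 and attach a program loaded with
 * expected_attach_type == BPF_XDP_DEVMAP, which runs right before
 * ndo_xdp_xmit on that device.  Requires the map to have been created
 * with value_size == sizeof(struct bpf_devmap_val).
 */
static int program_devmap_entry(int devmap_fd, int egress_prog_fd)
{
	struct bpf_devmap_val val = {
		.ifindex = 4,
		.bpf_prog.fd = egress_prog_fd,	/* <= 0 means no program */
	};
	__u32 slot = 0;

	return bpf_map_update_elem(devmap_fd, &slot, &val, 0);
}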
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -166,7 +162,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
- dtab = kzalloc(sizeof(*dtab), GFP_USER);
+ dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT);
if (!dtab)
return ERR_PTR(-ENOMEM);
@@ -218,20 +214,24 @@ static void dev_map_free(struct bpf_map *map)
hlist_for_each_entry_safe(dev, next, head, index_hlist) {
hlist_del_rcu(&dev->index_hlist);
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
}
- kfree(dtab->dev_index_head);
+ bpf_map_area_free(dtab->dev_index_head);
} else {
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
- dev = dtab->netdev_map[i];
+ dev = rcu_dereference_raw(dtab->netdev_map[i]);
if (!dev)
continue;
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
@@ -259,7 +259,11 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
-struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
+ */
+static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct hlist_head *head = dev_map_index_hash(dtab, key);
@@ -318,72 +322,110 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
return -ENOENT;
}
-static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
+static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
+ struct xdp_frame **frames, int n,
+ struct net_device *dev)
+{
+ struct xdp_txq_info txq = { .dev = dev };
+ struct xdp_buff xdp;
+ int i, nframes = 0;
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ u32 act;
+ int err;
+
+ xdp_convert_frame_to_buff(xdpf, &xdp);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_xdp(xdp_prog, &xdp);
+ switch (act) {
+ case XDP_PASS:
+ err = xdp_update_frame_from_buff(&xdp, xdpf);
+ if (unlikely(err < 0))
+ xdp_return_frame_rx_napi(xdpf);
+ else
+ frames[nframes++] = xdpf;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dev, xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_frame_rx_napi(xdpf);
+ break;
+ }
+ }
+ return nframes; /* sent frames count */
+}
+
+static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
struct net_device *dev = bq->dev;
- int sent = 0, drops = 0, err = 0;
+ unsigned int cnt = bq->count;
+ int sent = 0, err = 0;
+ int to_send = cnt;
int i;
- if (unlikely(!bq->count))
- return 0;
+ if (unlikely(!cnt))
+ return;
- for (i = 0; i < bq->count; i++) {
+ for (i = 0; i < cnt; i++) {
struct xdp_frame *xdpf = bq->q[i];
prefetch(xdpf);
}
- sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q, flags);
+ if (bq->xdp_prog) {
+ to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
+ if (!to_send)
+ goto out;
+ }
+
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
if (sent < 0) {
+ /* If ndo_xdp_xmit fails with an errno, no frames have
+ * been xmit'ed.
+ */
err = sent;
sent = 0;
- goto error;
}
- drops = bq->count - sent;
-out:
- bq->count = 0;
- trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, drops, err);
- bq->dev_rx = NULL;
- __list_del_clearprev(&bq->flush_node);
- return 0;
-error:
- /* If ndo_xdp_xmit fails with an errno, no frames have been
- * xmit'ed and it's our responsibility to them free all.
+ /* If not all frames have been transmitted, it is our
+ * responsibility to free them
*/
- for (i = 0; i < bq->count; i++) {
- struct xdp_frame *xdpf = bq->q[i];
+ for (i = sent; unlikely(i < to_send); i++)
+ xdp_return_frame_rx_napi(bq->q[i]);
- xdp_return_frame_rx_napi(xdpf);
- drops++;
- }
- goto out;
-}
-
-/* __dev_flush is called from xdp_do_flush() which _must_ be signaled
- * from the driver before returning from its napi->poll() routine. The poll()
- * routine is called either from busy_poll context or net_rx_action signaled
- * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
- * net device can be torn down. On devmap tear down we ensure the flush list
- * is empty before completing to ensure all flush operations have completed.
- * When drivers update the bpf program they may need to ensure any flush ops
- * are also complete. Using synchronize_rcu or call_rcu will suffice for this
- * because both wait for napi context to exit.
+out:
+ bq->count = 0;
+ trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
+}
+
+/* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
+ * driver before returning from its napi->poll() routine. See the comment above
+ * xdp_do_flush() in filter.c.
*/
void __dev_flush(void)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq, *tmp;
- list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
+ list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
bq_xmit_all(bq, XDP_XMIT_FLUSH);
+ bq->dev_rx = NULL;
+ bq->xdp_prog = NULL;
+ __list_del_clearprev(&bq->flush_node);
+ }
}
-/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
- * update happens in parallel here a dev_put wont happen until after reading the
- * ifindex.
+/* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
+ * by local_bh_disable() (from XDP calls inside NAPI). The
+ * rcu_read_lock_bh_held() below makes lockdep accept both.
*/
-struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *obj;
@@ -391,15 +433,17 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
if (key >= map->max_entries)
return NULL;
- obj = READ_ONCE(dtab->netdev_map[key]);
+ obj = rcu_dereference_check(dtab->netdev_map[key],
+ rcu_read_lock_bh_held());
return obj;
}
-/* Runs under RCU-read-side, plus in softirq under NAPI protection.
- * Thus, safe percpu variable access.
+/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
+ * variable access, and map elements stick around. See comment above
+ * xdp_do_flush() in filter.c.
*/
-static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
- struct net_device *dev_rx)
+static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+ struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
@@ -410,50 +454,205 @@ static int bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
/* Ingress dev_rx will be the same for all xdp_frame's in
* bulk_queue, because bq stored per-CPU and must be flushed
* from net_device drivers NAPI func end.
+ *
+ * Do the same with xdp_prog and flush_list since these fields
+ * are only ever modified together.
*/
- if (!bq->dev_rx)
+ if (!bq->dev_rx) {
bq->dev_rx = dev_rx;
-
- bq->q[bq->count++] = xdpf;
-
- if (!bq->flush_node.prev)
+ bq->xdp_prog = xdp_prog;
list_add(&bq->flush_node, flush_list);
+ }
- return 0;
+ bq->q[bq->count++] = xdpf;
}
-static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
- struct net_device *dev_rx)
+static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
+ struct net_device *dev_rx,
+ struct bpf_prog *xdp_prog)
{
- struct xdp_frame *xdpf;
int err;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ err = xdp_ok_fwd_dev(dev, xdpf->len);
if (unlikely(err))
return err;
- xdpf = convert_to_xdp_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
+ bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
+ return 0;
+}
+
+static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
+{
+ struct xdp_txq_info txq = { .dev = dst->dev };
+ struct xdp_buff xdp;
+ u32 act;
+
+ if (!dst->xdp_prog)
+ return XDP_PASS;
+
+ __skb_pull(skb, skb->mac_len);
+ xdp.txq = &txq;
+
+ act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
+ switch (act) {
+ case XDP_PASS:
+ __skb_push(skb, skb->mac_len);
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_ABORTED:
+ trace_xdp_exception(dst->dev, dst->xdp_prog, act);
+ fallthrough;
+ case XDP_DROP:
+ kfree_skb(skb);
+ break;
+ }
- return bq_enqueue(dev, xdpf, dev_rx);
+ return act;
}
-int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
+int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
- return __xdp_enqueue(dev, xdp, dev_rx);
+ return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
- return __xdp_enqueue(dev, xdp, dev_rx);
+ return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
+}
+
+static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+ if (!obj ||
+ !obj->dev->netdev_ops->ndo_xdp_xmit)
+ return false;
+
+ if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
+ return false;
+
+ return true;
+}
+
+static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
+ struct net_device *dev_rx,
+ struct xdp_frame *xdpf)
+{
+ struct xdp_frame *nxdpf;
+
+ nxdpf = xdpf_clone(xdpf);
+ if (!nxdpf)
+ return -ENOMEM;
+
+ bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
+
+ return 0;
+}
+
+static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
+{
+ while (num_excluded--) {
+ if (ifindex == excluded[num_excluded])
+ return true;
+ }
+ return false;
+}
+
+/* Get ifindex of each upper device. 'indexes' must be able to hold at
+ * least MAX_NEST_DEV elements.
+ * Returns the number of ifindexes added.
+ */
+static int get_upper_ifindexes(struct net_device *dev, int *indexes)
+{
+ struct net_device *upper;
+ struct list_head *iter;
+ int n = 0;
+
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ indexes[n++] = upper->ifindex;
+ }
+ return n;
+}
+
+int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
+ struct bpf_map *map, bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
+ struct hlist_head *head;
+ int num_excluded = 0;
+ unsigned int i;
+ int err;
+
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
+ excluded_devices[num_excluded++] = dev_rx->ifindex;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
+ if (!is_valid_dst(dst, xdpf))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_rcu(dst, head, index_hlist,
+ lockdep_is_held(&dtab->index_lock)) {
+ if (!is_valid_dst(dst, xdpf))
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the last copy of the frame */
+ if (last_dst)
+ bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
+ else
+ xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
+
+ return 0;
}
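dev_map_enqueue_multi() above is what backs the BPF_F_BROADCAST redirect flag; a minimal XDP program exercising it could look like the following sketch (map layout and section names follow the usual libbpf conventions and are assumptions, not part of this patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_devmap_val));
	__uint(max_entries, 32);
} forward_map SEC(".maps");

SEC("xdp")
int xdp_broadcast(struct xdp_md *ctx)
{
	/* clone the frame to every map entry except the ingress device */
	return bpf_redirect_map(&forward_map, 0,
				BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
}

char _license[] SEC("license") = "GPL";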
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
@@ -464,27 +663,128 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
err = xdp_ok_fwd_dev(dst->dev, skb->len);
if (unlikely(err))
return err;
+
+ /* Redirect has already succeeded semantically at this point, so we just
+ * return 0 even if packet is dropped. Helper below takes care of
+ * freeing skb.
+ */
+ if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
+ return 0;
+
skb->dev = dst->dev;
generic_xdp_tx(skb, xdp_prog);
return 0;
}
+static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
+ struct sk_buff *skb,
+ struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *nskb;
+ int err;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ err = dev_map_generic_redirect(dst, nskb, xdp_prog);
+ if (unlikely(err)) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ return 0;
+}
+
+int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
+ struct bpf_prog *xdp_prog, struct bpf_map *map,
+ bool exclude_ingress)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ struct bpf_dtab_netdev *dst, *last_dst = NULL;
+ int excluded_devices[1+MAX_NEST_DEV];
+ struct hlist_head *head;
+ struct hlist_node *next;
+ int num_excluded = 0;
+ unsigned int i;
+ int err;
+
+ if (exclude_ingress) {
+ num_excluded = get_upper_ifindexes(dev, excluded_devices);
+ excluded_devices[num_excluded++] = dev->ifindex;
+ }
+
+ if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
+ for (i = 0; i < map->max_entries; i++) {
+ dst = rcu_dereference_check(dtab->netdev_map[i],
+ rcu_read_lock_bh_held());
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+
+ }
+ } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
+ for (i = 0; i < dtab->n_buckets; i++) {
+ head = dev_map_index_hash(dtab, i);
+ hlist_for_each_entry_safe(dst, next, head, index_hlist) {
+ if (!dst)
+ continue;
+
+ if (is_ifindex_excluded(excluded_devices, num_excluded,
+ dst->dev->ifindex))
+ continue;
+
+ /* we only need n-1 clones; last_dst enqueued below */
+ if (!last_dst) {
+ last_dst = dst;
+ continue;
+ }
+
+ err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
+ if (err)
+ return err;
+
+ last_dst = dst;
+ }
+ }
+ }
+
+ /* consume the first skb and return */
+ if (last_dst)
+ return dev_map_generic_redirect(last_dst, skb, xdp_prog);
+
+ /* dtab is empty */
+ consume_skb(skb);
+ return 0;
+}
+
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
- struct net_device *dev = obj ? obj->dev : NULL;
- return dev ? &dev->ifindex : NULL;
+ return obj ? &obj->val : NULL;
}
static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
*(u32 *)key);
- struct net_device *dev = obj ? obj->dev : NULL;
-
- return dev ? &dev->ifindex : NULL;
+ return obj ? &obj->val : NULL;
}
static void __dev_map_entry_free(struct rcu_head *rcu)
@@ -492,6 +792,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ if (dev->xdp_prog)
+ bpf_prog_put(dev->xdp_prog);
dev_put(dev->dev);
kfree(dev);
}
@@ -505,14 +807,7 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
if (k >= map->max_entries)
return -EINVAL;
- /* Use call_rcu() here to ensure any rcu critical sections have
- * completed as well as any flush operations because call_rcu
- * will wait for preempt-disable region to complete, NAPI in this
- * context. And additionally, the driver tear down ensures all
- * soft irqs are complete before removing the net device in the
- * case of dev_put equals zero.
- */
- old_dev = xchg(&dtab->netdev_map[k], NULL);
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
return 0;
@@ -542,26 +837,50 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab *dtab,
- u32 ifindex,
+ struct bpf_devmap_val *val,
unsigned int idx)
{
+ struct bpf_prog *prog = NULL;
struct bpf_dtab_netdev *dev;
- dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
- dtab->map.numa_node);
+ dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
+ GFP_ATOMIC | __GFP_NOWARN,
+ dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
- dev->dev = dev_get_by_index(net, ifindex);
- if (!dev->dev) {
- kfree(dev);
- return ERR_PTR(-EINVAL);
+ dev->dev = dev_get_by_index(net, val->ifindex);
+ if (!dev->dev)
+ goto err_out;
+
+ if (val->bpf_prog.fd > 0) {
+ prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
+ BPF_PROG_TYPE_XDP, false);
+ if (IS_ERR(prog))
+ goto err_put_dev;
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+ goto err_put_prog;
}
dev->idx = idx;
dev->dtab = dtab;
+ if (prog) {
+ dev->xdp_prog = prog;
+ dev->val.bpf_prog.id = prog->aux->id;
+ } else {
+ dev->xdp_prog = NULL;
+ dev->val.bpf_prog.id = 0;
+ }
+ dev->val.ifindex = val->ifindex;
return dev;
+err_put_prog:
+ bpf_prog_put(prog);
+err_put_dev:
+ dev_put(dev->dev);
+err_out:
+ kfree(dev);
+ return ERR_PTR(-EINVAL);
}
static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
@@ -569,7 +888,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
- u32 ifindex = *(u32 *)value;
+ struct bpf_devmap_val val = {};
u32 i = *(u32 *)key;
if (unlikely(map_flags > BPF_EXIST))
@@ -579,10 +898,16 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
- if (!ifindex) {
+ /* already verified value_size <= sizeof val */
+ memcpy(&val, value, map->value_size);
+
+ if (!val.ifindex) {
dev = NULL;
+ /* can not specify fd if ifindex is 0 */
+ if (val.bpf_prog.fd > 0)
+ return -EINVAL;
} else {
- dev = __dev_map_alloc_node(net, dtab, ifindex, i);
+ dev = __dev_map_alloc_node(net, dtab, &val, i);
if (IS_ERR(dev))
return PTR_ERR(dev);
}
@@ -591,7 +916,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
* Remembering the driver side flush operation will happen before the
* net device is removed.
*/
- old_dev = xchg(&dtab->netdev_map[i], dev);
+ old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
@@ -610,12 +935,15 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev, *old_dev;
- u32 ifindex = *(u32 *)value;
+ struct bpf_devmap_val val = {};
u32 idx = *(u32 *)key;
unsigned long flags;
int err = -EEXIST;
- if (unlikely(map_flags > BPF_EXIST || !ifindex))
+ /* already verified value_size <= sizeof val */
+ memcpy(&val, value, map->value_size);
+
+ if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
return -EINVAL;
spin_lock_irqsave(&dtab->index_lock, flags);
@@ -624,7 +952,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
if (old_dev && (map_flags & BPF_NOEXIST))
goto out_err;
- dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
+ dev = __dev_map_alloc_node(net, dtab, &val, idx);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out_err;
@@ -662,7 +990,23 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
map, key, value, map_flags);
}
+static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_lookup_elem);
+}
+
+static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
+{
+ return __bpf_xdp_redirect_map(map, ifindex, flags,
+ BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
+ __dev_map_hash_lookup_elem);
+}
+
+static int dev_map_btf_id;
const struct bpf_map_ops dev_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
@@ -670,9 +1014,14 @@ const struct bpf_map_ops dev_map_ops = {
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_dtab",
+ .map_btf_id = &dev_map_btf_id,
+ .map_redirect = dev_map_redirect,
};
+static int dev_map_hash_map_btf_id;
const struct bpf_map_ops dev_map_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_hash_get_next_key,
@@ -680,6 +1029,9 @@ const struct bpf_map_ops dev_map_hash_ops = {
.map_update_elem = dev_map_hash_update_elem,
.map_delete_elem = dev_map_hash_delete_elem,
.map_check_btf = map_check_no_btf,
+ .map_btf_name = "bpf_dtab",
+ .map_btf_id = &dev_map_hash_map_btf_id,
+ .map_redirect = dev_hash_map_redirect,
};
static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
@@ -721,9 +1073,7 @@ static int dev_map_notification(struct notifier_block *notifier,
break;
/* will be freed in free_netdev() */
- netdev->xdp_bulkq =
- __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
- sizeof(void *), GFP_ATOMIC);
+ netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
if (!netdev->xdp_bulkq)
return NOTIFY_BAD;
@@ -746,10 +1096,10 @@ static int dev_map_notification(struct notifier_block *notifier,
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev, *odev;
- dev = READ_ONCE(dtab->netdev_map[i]);
+ dev = rcu_dereference(dtab->netdev_map[i]);
if (!dev || netdev != dev->dev)
continue;
- odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
+ odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
if (dev == odev)
call_rcu(&dev->rcu,
__dev_map_entry_free);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index b44d8c447afd..7b4afb7d96db 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-only
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
@@ -19,16 +19,23 @@ static const char *__func_get_name(const struct bpf_insn_cbs *cbs,
{
BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
- if (insn->src_reg != BPF_PSEUDO_CALL &&
+ if (!insn->src_reg &&
insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID &&
func_id_str[insn->imm])
return func_id_str[insn->imm];
- if (cbs && cbs->cb_call)
- return cbs->cb_call(cbs->private_data, insn);
+ if (cbs && cbs->cb_call) {
+ const char *res;
+
+ res = cbs->cb_call(cbs->private_data, insn);
+ if (res)
+ return res;
+ }
if (insn->src_reg == BPF_PSEUDO_CALL)
snprintf(buff, len, "%+d", insn->imm);
+ else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL)
+ snprintf(buff, len, "kernel-function");
return buff;
}
@@ -80,6 +87,13 @@ const char *const bpf_alu_string[16] = {
[BPF_END >> 4] = "endian",
};
+static const char *const bpf_atomic_alu_string[16] = {
+ [BPF_ADD >> 4] = "add",
+ [BPF_AND >> 4] = "and",
+ [BPF_OR >> 4] = "or",
+ [BPF_XOR >> 4] = "xor",
+};
+
static const char *const bpf_ldst_string[] = {
[BPF_W >> 3] = "u32",
[BPF_H >> 3] = "u16",
@@ -153,24 +167,56 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg,
insn->off, insn->src_reg);
- else if (BPF_MODE(insn->code) == BPF_XADD)
- verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm == BPF_ADD || insn->imm == BPF_AND ||
+ insn->imm == BPF_OR || insn->imm == BPF_XOR)) {
+ verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ bpf_alu_string[BPF_OP(insn->imm) >> 4],
+ insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ (insn->imm == (BPF_ADD | BPF_FETCH) ||
+ insn->imm == (BPF_AND | BPF_FETCH) ||
+ insn->imm == (BPF_OR | BPF_FETCH) ||
+ insn->imm == (BPF_XOR | BPF_FETCH))) {
+ verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n",
+ insn->code, insn->src_reg,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4],
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_CMPXCHG) {
+ verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n",
insn->code,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->dst_reg, insn->off,
insn->src_reg);
- else
+ } else if (BPF_MODE(insn->code) == BPF_ATOMIC &&
+ insn->imm == BPF_XCHG) {
+ verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n",
+ insn->code, insn->src_reg,
+ BPF_SIZE(insn->code) == BPF_DW ? "64" : "",
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off, insn->src_reg);
+ } else {
verbose(cbs->private_data, "BUG_%02x\n", insn->code);
+ }
} else if (class == BPF_ST) {
- if (BPF_MODE(insn->code) != BPF_MEM) {
+ if (BPF_MODE(insn->code) == BPF_MEM) {
+ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (BPF_MODE(insn->code) == 0xc0 /* BPF_NOSPEC, no UAPI */) {
+ verbose(cbs->private_data, "(%02x) nospec\n", insn->code);
+ } else {
verbose(cbs->private_data, "BUG_st_%02x\n", insn->code);
- return;
}
- verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n",
- insn->code,
- bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
- insn->dst_reg,
- insn->off, insn->imm);
} else if (class == BPF_LDX) {
if (BPF_MODE(insn->code) != BPF_MEM) {
verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code);
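
The BPF_ATOMIC branches added above decode the atomic instructions that replaced the old BPF_XADD-only encoding. As a rough illustration (encodings via the BPF_ATOMIC_OP() macro from include/linux/filter.h; the commented output is reconstructed from the format strings above and is indicative only):

#include <linux/filter.h>

static const struct bpf_insn atomic_disasm_examples[] = {
	/* lock *(u64 *)(r1 +0) += r2 */
	BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_2, 0),
	/* r2 = atomic64_fetch_add((u64 *)(r1 +0), r2) */
	BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_1, BPF_REG_2, 0),
	/* r0 = atomic_cmpxchg((u32 *)(r1 +0), r0, r2) */
	BPF_ATOMIC_OP(BPF_W, BPF_CMPXCHG, BPF_REG_1, BPF_REG_2, 0),
	/* r2 = atomic64_xchg((u64 *)(r1 +0), r2) */
	BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_1, BPF_REG_2, 0),
};
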
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e546b18d27da..a4b040793f44 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
diff --git a/kernel/bpf/dispatcher.c b/kernel/bpf/dispatcher.c
index b3e5b214fed8..2444bd15cc2d 100644
--- a/kernel/bpf/dispatcher.c
+++ b/kernel/bpf/dispatcher.c
@@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
noff = 0;
} else {
old = d->image + d->image_off;
- noff = d->image_off ^ (BPF_IMAGE_SIZE / 2);
+ noff = d->image_off ^ (PAGE_SIZE / 2);
}
new = d->num_progs ? d->image + noff : NULL;
@@ -140,9 +140,10 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
mutex_lock(&d->mutex);
if (!d->image) {
- d->image = bpf_image_alloc();
+ d->image = bpf_jit_alloc_exec_page();
if (!d->image)
goto out;
+ bpf_image_ksym_add(d->image, &d->ksym);
}
prev_num_progs = d->num_progs;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a1468e3f5af2..d29af9988f37 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -9,6 +9,7 @@
#include <linux/rculist_nulls.h>
#include <linux/random.h>
#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"
#include "map_in_map.h"
@@ -27,11 +28,67 @@
.map_delete_batch = \
generic_map_delete_batch
+/*
+ * The bucket lock has two protection scopes:
+ *
+ * 1) Serializing concurrent operations from BPF programs on different
+ * CPUs
+ *
+ * 2) Serializing concurrent operations from BPF programs and sys_bpf()
+ *
+ * BPF programs can execute in any context including perf, kprobes and
+ * tracing. As there are almost no limits on where perf, kprobes and tracing
+ * can be invoked from, the lock operations need to be protected against
+ * deadlocks. Deadlocks can be caused by recursion and by an invocation in
+ * a lock-held section when functions which acquire this lock are invoked
+ * from sys_bpf(). BPF recursion is prevented by incrementing the per-CPU
+ * variable bpf_prog_active, which prevents BPF programs attached to perf
+ * events, kprobes and tracing from being invoked before the prior invocation
+ * from one of these contexts has completed. sys_bpf() uses the same mechanism
+ * by pinning the task to the current CPU and incrementing the recursion
+ * protection across the map operation.
+ *
+ * This has subtle implications on PREEMPT_RT. PREEMPT_RT forbids certain
+ * operations like memory allocations (even with GFP_ATOMIC) from atomic
+ * contexts. This is required because even with GFP_ATOMIC the memory
+ * allocator calls into code paths which acquire locks with long lock-held
+ * sections. To ensure deterministic behaviour, these locks are regular
+ * spinlocks, which are converted to 'sleepable' spinlocks on RT. The only
+ * true atomic contexts on an RT kernel are the low level hardware
+ * handling, scheduling, low level interrupt handling, NMIs etc. None of
+ * these contexts should ever do memory allocations.
+ *
+ * As regular device interrupt handlers and soft interrupts are forced into
+ * thread context, the existing code which does
+ * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
+ * just works.
+ *
+ * In theory the BPF locks could be converted to regular spinlocks as well,
+ * but the bucket locks and percpu_freelist locks can be taken from
+ * arbitrary contexts (perf, kprobes, tracepoints) which are required to be
+ * atomic contexts even on RT. These mechanisms require preallocated maps,
+ * so there is no need to invoke memory allocations within the lock held
+ * sections.
+ *
+ * BPF maps which need dynamic allocation are only used from (forced)
+ * thread context on RT and can therefore use regular spinlocks, which in
+ * turn allow memory allocations from the lock-held section.
+ *
+ * On a non-RT kernel this distinction is neither possible nor required.
+ * spinlock maps to raw_spinlock and the extra code is optimized out by the
+ * compiler.
+ */
struct bucket {
struct hlist_nulls_head head;
- raw_spinlock_t lock;
+ union {
+ raw_spinlock_t raw_lock;
+ spinlock_t lock;
+ };
};
+#define HASHTAB_MAP_LOCK_COUNT 8
+#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1)
+
struct bpf_htab {
struct bpf_map map;
struct bucket *buckets;
@@ -45,6 +102,8 @@ struct bpf_htab {
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
u32 hashrnd;
+ struct lock_class_key lockdep_key;
+ int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT];
};
/* each htab element is struct htab_elem + key + value */
@@ -65,9 +124,75 @@ struct htab_elem {
struct bpf_lru_node lru_node;
};
u32 hash;
- char key[0] __aligned(8);
+ char key[] __aligned(8);
};
+static inline bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+ return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
+static inline bool htab_use_raw_lock(const struct bpf_htab *htab)
+{
+ return (!IS_ENABLED(CONFIG_PREEMPT_RT) || htab_is_prealloc(htab));
+}
+
+static void htab_init_buckets(struct bpf_htab *htab)
+{
+ unsigned i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
+ if (htab_use_raw_lock(htab)) {
+ raw_spin_lock_init(&htab->buckets[i].raw_lock);
+ lockdep_set_class(&htab->buckets[i].raw_lock,
+ &htab->lockdep_key);
+ } else {
+ spin_lock_init(&htab->buckets[i].lock);
+ lockdep_set_class(&htab->buckets[i].lock,
+ &htab->lockdep_key);
+ }
+ cond_resched();
+ }
+}
+
+static inline int htab_lock_bucket(const struct bpf_htab *htab,
+ struct bucket *b, u32 hash,
+ unsigned long *pflags)
+{
+ unsigned long flags;
+
+ hash = hash & HASHTAB_MAP_LOCK_MASK;
+
+ migrate_disable();
+ if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) {
+ __this_cpu_dec(*(htab->map_locked[hash]));
+ migrate_enable();
+ return -EBUSY;
+ }
+
+ if (htab_use_raw_lock(htab))
+ raw_spin_lock_irqsave(&b->raw_lock, flags);
+ else
+ spin_lock_irqsave(&b->lock, flags);
+ *pflags = flags;
+
+ return 0;
+}
+
+static inline void htab_unlock_bucket(const struct bpf_htab *htab,
+ struct bucket *b, u32 hash,
+ unsigned long flags)
+{
+ hash = hash & HASHTAB_MAP_LOCK_MASK;
+ if (htab_use_raw_lock(htab))
+ raw_spin_unlock_irqrestore(&b->raw_lock, flags);
+ else
+ spin_unlock_irqrestore(&b->lock, flags);
+ __this_cpu_dec(*(htab->map_locked[hash]));
+ migrate_enable();
+}
+
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
static bool htab_is_lru(const struct bpf_htab *htab)
@@ -82,11 +207,6 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
-static bool htab_is_prealloc(const struct bpf_htab *htab)
-{
- return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-}
-
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
@@ -105,7 +225,33 @@ static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l)
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
- return (struct htab_elem *) (htab->elems + i * htab->elem_size);
+ return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size);
+}
+
+static bool htab_has_extra_elems(struct bpf_htab *htab)
+{
+ return !htab_is_percpu(htab) && !htab_is_lru(htab);
+}
+
+static void htab_free_prealloced_timers(struct bpf_htab *htab)
+{
+ u32 num_entries = htab->map.max_entries;
+ int i;
+
+ if (likely(!map_value_has_timer(&htab->map)))
+ return;
+ if (htab_has_extra_elems(htab))
+ num_entries += num_possible_cpus();
+
+ for (i = 0; i < num_entries; i++) {
+ struct htab_elem *elem;
+
+ elem = get_htab_elem(htab, i);
+ bpf_timer_cancel_and_free(elem->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+ cond_resched();
+ }
}
static void htab_free_elems(struct bpf_htab *htab)
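
htab_free_prealloced_timers() above (and check_and_free_timer() further down) exist because a map value may embed a struct bpf_timer, which has to be cancelled and freed when an element or the whole map is released. A hedged sketch of a BPF program storing such a timer in a hash map value; the map name, attach point and callback body are illustrative, not from this patch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

#define CLOCK_MONOTONIC 1	/* clockid value, as in uapi linux/time.h */

struct elem {
	struct bpf_timer timer;	/* what map->timer_off ends up pointing at */
	__u64 hits;
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, struct elem);
} timers SEC(".maps");

static int timer_cb(void *map, __u32 *key, struct elem *val)
{
	val->hits++;
	return 0;
}

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(start_timer)
{
	struct elem init = {}, *val;
	__u32 key = 0;

	bpf_map_update_elem(&timers, &key, &init, BPF_ANY);
	val = bpf_map_lookup_elem(&timers, &key);
	if (!val)
		return 0;

	bpf_timer_init(&val->timer, &timers, CLOCK_MONOTONIC);
	bpf_timer_set_callback(&val->timer, timer_cb);
	bpf_timer_start(&val->timer, 1000000000 /* 1s */, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";
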
@@ -145,8 +291,12 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
struct htab_elem *l;
if (node) {
+ u32 key_size = htab->map.key_size;
+
l = container_of(node, struct htab_elem, lru_node);
- memcpy(l->key, key, htab->map.key_size);
+ memcpy(l->key, key, key_size);
+ check_and_init_map_value(&htab->map,
+ l->key + round_up(key_size, 8));
return l;
}
@@ -158,10 +308,10 @@ static int prealloc_init(struct bpf_htab *htab)
u32 num_entries = htab->map.max_entries;
int err = -ENOMEM, i;
- if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+ if (htab_has_extra_elems(htab))
num_entries += num_possible_cpus();
- htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
+ htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries,
htab->map.numa_node);
if (!htab->elems)
return -ENOMEM;
@@ -173,7 +323,8 @@ static int prealloc_init(struct bpf_htab *htab)
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
- pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+ pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
+ GFP_USER | __GFP_NOWARN);
if (!pptr)
goto free_elems;
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
@@ -227,8 +378,8 @@ static int alloc_extra_elems(struct bpf_htab *htab)
struct pcpu_freelist_node *l;
int cpu;
- pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
- GFP_USER | __GFP_NOWARN);
+ pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8,
+ GFP_USER | __GFP_NOWARN);
if (!pptr)
return -ENOMEM;
@@ -266,9 +417,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
offsetof(struct htab_elem, hash_node.pprev));
- if (lru && !capable(CAP_SYS_ADMIN))
+ if (lru && !bpf_capable())
/* LRU implementation is much more complicated than other
- * maps. Hence, limit to CAP_SYS_ADMIN for now.
+ * maps. Hence, limit to CAP_BPF.
*/
return -EPERM;
@@ -296,17 +447,11 @@ static int htab_map_alloc_check(union bpf_attr *attr)
attr->value_size == 0)
return -EINVAL;
- if (attr->key_size > MAX_BPF_STACK)
- /* eBPF programs initialize keys on stack, so they cannot be
- * larger than max stack size
- */
- return -E2BIG;
-
- if (attr->value_size >= KMALLOC_MAX_SIZE -
- MAX_BPF_STACK - sizeof(struct htab_elem))
- /* if value_size is bigger, the user space won't be able to
- * access the elements via bpf syscall. This check also makes
- * sure that the elem_size doesn't overflow and it's
+ if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE -
+ sizeof(struct htab_elem))
+ /* if key_size + value_size is bigger, user space won't be
+ * able to access the elements via the bpf syscall. This check
+ * also makes sure that elem_size doesn't overflow and that it's
* kmalloc-able later in htab_map_update_elem()
*/
return -E2BIG;
@@ -329,12 +474,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab;
int err, i;
- u64 cost;
- htab = kzalloc(sizeof(*htab), GFP_USER);
+ htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT);
if (!htab)
return ERR_PTR(-ENOMEM);
+ lockdep_register_key(&htab->lockdep_key);
+
bpf_map_init_from_attr(&htab->map, attr);
if (percpu_lru) {
@@ -365,41 +511,33 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->n_buckets > U32_MAX / sizeof(struct bucket))
goto free_htab;
- cost = (u64) htab->n_buckets * sizeof(struct bucket) +
- (u64) htab->elem_size * htab->map.max_entries;
-
- if (percpu)
- cost += (u64) round_up(htab->map.value_size, 8) *
- num_possible_cpus() * htab->map.max_entries;
- else
- cost += (u64) htab->elem_size * num_possible_cpus();
-
- /* if map size is larger than memlock limit, reject it */
- err = bpf_map_charge_init(&htab->map.memory, cost);
- if (err)
- goto free_htab;
-
err = -ENOMEM;
htab->buckets = bpf_map_area_alloc(htab->n_buckets *
sizeof(struct bucket),
htab->map.numa_node);
if (!htab->buckets)
- goto free_charge;
+ goto free_htab;
+
+ for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) {
+ htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map,
+ sizeof(int),
+ sizeof(int),
+ GFP_USER);
+ if (!htab->map_locked[i])
+ goto free_map_locked;
+ }
if (htab->map.map_flags & BPF_F_ZERO_SEED)
htab->hashrnd = 0;
else
htab->hashrnd = get_random_int();
- for (i = 0; i < htab->n_buckets; i++) {
- INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i);
- raw_spin_lock_init(&htab->buckets[i].lock);
- }
+ htab_init_buckets(htab);
if (prealloc) {
err = prealloc_init(htab);
if (err)
- goto free_buckets;
+ goto free_map_locked;
if (!percpu && !lru) {
/* lru itself can remove the least used element, so
@@ -415,11 +553,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_prealloc:
prealloc_destroy(htab);
-free_buckets:
+free_map_locked:
+ for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++)
+ free_percpu(htab->map_locked[i]);
bpf_map_area_free(htab->buckets);
-free_charge:
- bpf_map_charge_finish(&htab->map.memory);
free_htab:
+ lockdep_unregister_key(&htab->lockdep_key);
kfree(htab);
return ERR_PTR(err);
}
@@ -487,8 +626,8 @@ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
struct htab_elem *l;
u32 hash, key_size;
- /* Must be called with rcu_read_lock. */
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -522,14 +661,14 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
* bpf_prog
* __htab_map_lookup_elem
*/
-static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@@ -561,7 +700,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
return __htab_lru_map_lookup_elem(map, key, false);
}
-static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+static int htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
@@ -570,7 +709,7 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
- *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
+ *insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
offsetof(struct htab_elem, lru_node) +
@@ -586,6 +725,14 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
return insn - insn_buf;
}
+static void check_and_free_timer(struct bpf_htab *htab, struct htab_elem *elem)
+{
+ if (unlikely(map_value_has_timer(&htab->map)))
+ bpf_timer_cancel_and_free(elem->key +
+ round_up(htab->map.key_size, 8) +
+ htab->map.timer_off);
+}
+
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
@@ -597,20 +744,24 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
struct hlist_nulls_node *n;
unsigned long flags;
struct bucket *b;
+ int ret;
tgt_l = container_of(node, struct htab_elem, lru_node);
b = __select_bucket(htab, tgt_l->hash);
head = &b->head;
- raw_spin_lock_irqsave(&b->lock, flags);
+ ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags);
+ if (ret)
+ return false;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
+ check_and_free_timer(htab, l);
break;
}
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, tgt_l->hash, flags);
return l == tgt_l;
}
@@ -678,6 +829,7 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
+ check_and_free_timer(htab, l);
kfree(l);
}
@@ -686,28 +838,26 @@ static void htab_elem_free_rcu(struct rcu_head *head)
struct htab_elem *l = container_of(head, struct htab_elem, rcu);
struct bpf_htab *htab = l->htab;
- /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
- * we're calling kfree, otherwise deadlock is possible if kprobes
- * are placed somewhere inside of slub
- */
- preempt_disable();
- __this_cpu_inc(bpf_prog_active);
htab_elem_free(htab, l);
- __this_cpu_dec(bpf_prog_active);
- preempt_enable();
}
-static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
{
struct bpf_map *map = &htab->map;
+ void *ptr;
if (map->ops->map_fd_put_ptr) {
- void *ptr = fd_htab_map_get_ptr(map, l);
-
+ ptr = fd_htab_map_get_ptr(map, l);
map->ops->map_fd_put_ptr(ptr);
}
+}
+
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
+{
+ htab_put_fd_value(htab, l);
if (htab_is_prealloc(htab)) {
+ check_and_free_timer(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
@@ -734,6 +884,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
}
}
+static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ /* When using prealloc and not setting the initial value on all CPUs,
+ * zero-fill the element values for the other CPUs (just as happens when
+ * not using prealloc). Otherwise, the BPF program has no way to ensure
+ * known initial values for CPUs other than the current one
+ * (onallcpus is always false when coming from a BPF program).
+ */
+ if (htab_is_prealloc(htab) && !onallcpus) {
+ u32 size = round_up(htab->map.value_size, 8);
+ int current_cpu = raw_smp_processor_id();
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == current_cpu)
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
+ size);
+ else
+ memset(per_cpu_ptr(pptr, cpu), 0, size);
+ }
+ } else {
+ pcpu_copy_value(htab, pptr, value, onallcpus);
+ }
+}
+
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
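
The zero-fill rule described in pcpu_init_value() above is observable from user space: after an update performed by a BPF program on a preallocated per-CPU hash map (onallcpus == false), only the updating CPU's slot holds the written value. A rough libbpf-based sketch, assuming a map whose key is __u32 and whose per-CPU value is __u64:

#include <stdio.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static void dump_percpu_value(int map_fd, __u32 key)
{
	int ncpus = libbpf_num_possible_cpus();

	if (ncpus <= 0)
		return;

	__u64 vals[ncpus];

	if (bpf_map_lookup_elem(map_fd, &key, vals))
		return;

	/* Every slot except the CPU that ran the updating program is
	 * expected to read back as zero, matching pcpu_init_value().
	 */
	for (int cpu = 0; cpu < ncpus; cpu++)
		printf("cpu%d: %llu\n", cpu, (unsigned long long)vals[cpu]);
}
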
@@ -757,6 +933,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
+ htab_put_fd_value(htab, old_elem);
*pl_new = old_elem;
} else {
struct pcpu_freelist_node *l;
@@ -777,14 +954,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
l_new = ERR_PTR(-E2BIG);
goto dec_count;
}
- l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
- htab->map.numa_node);
+ l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
+ GFP_ATOMIC | __GFP_NOWARN,
+ htab->map.numa_node);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
goto dec_count;
}
- check_and_init_map_lock(&htab->map,
- l_new->key + round_up(key_size, 8));
+ check_and_init_map_value(&htab->map,
+ l_new->key + round_up(key_size, 8));
}
memcpy(l_new->key, key, key_size);
@@ -794,8 +972,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */
- pptr = __alloc_percpu_gfp(size, 8,
- GFP_ATOMIC | __GFP_NOWARN);
+ pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
+ GFP_ATOMIC | __GFP_NOWARN);
if (!pptr) {
kfree(l_new);
l_new = ERR_PTR(-ENOMEM);
@@ -803,7 +981,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
}
}
- pcpu_copy_value(htab, pptr, value, onallcpus);
+ pcpu_init_value(htab, pptr, value, onallcpus);
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
@@ -853,7 +1031,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
/* unknown flags */
return -EINVAL;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() &&
+ !rcu_read_lock_bh_held());
key_size = map->key_size;
@@ -884,8 +1063,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
*/
}
- /* bpf_map_update_elem() can be called in_irq() */
- raw_spin_lock_irqsave(&b->lock, flags);
+ ret = htab_lock_bucket(htab, b, hash, &flags);
+ if (ret)
+ return ret;
l_old = lookup_elem_raw(head, hash, key, key_size);
@@ -923,13 +1103,21 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_del_rcu(&l_old->hash_node);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
+ else
+ check_and_free_timer(htab, l_old);
}
ret = 0;
err:
- raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_unlock_bucket(htab, b, hash, flags);
return ret;
}
+static void htab_lru_push_free(struct bpf_htab *htab, struct htab_elem *elem)