aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/obsolete/sysfs-selinux-checkreqprot2
-rw-r--r--Documentation/admin-guide/kdump/gdbmacros.txt159
-rw-r--r--Documentation/admin-guide/kdump/vmcoreinfo.rst131
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/Kconfig30
-rw-r--r--arch/arm/Kconfig15
-rw-r--r--arch/arm64/Kconfig13
-rw-r--r--arch/csky/Kconfig13
-rw-r--r--arch/microblaze/Kconfig18
-rw-r--r--arch/mips/Kconfig17
-rw-r--r--arch/parisc/Kconfig16
-rw-r--r--arch/powerpc/Kconfig17
-rw-r--r--arch/riscv/Kconfig13
-rw-r--r--arch/s390/Kconfig17
-rw-r--r--arch/sh/Kconfig16
-rw-r--r--arch/sparc/Kconfig18
-rw-r--r--arch/um/Kconfig16
-rw-r--r--arch/x86/Kconfig16
-rw-r--r--arch/xtensa/Kconfig14
-rw-r--r--drivers/base/core.c46
-rw-r--r--include/linux/crash_core.h3
-rw-r--r--include/linux/debug_locks.h2
-rw-r--r--include/linux/dev_printk.h8
-rw-r--r--include/linux/kernel.h1
-rw-r--r--include/linux/overflow.h39
-rw-r--r--include/linux/printk.h8
-rw-r--r--include/trace/events/avc.h53
-rw-r--r--init/Kconfig3
-rw-r--r--kernel/audit.c9
-rw-r--r--kernel/audit.h4
-rw-r--r--kernel/printk/Makefile1
-rw-r--r--kernel/printk/internal.h4
-rw-r--r--kernel/printk/printk.c1153
-rw-r--r--kernel/printk/printk_ringbuffer.c2083
-rw-r--r--kernel/printk/printk_ringbuffer.h382
-rw-r--r--kernel/printk/printk_safe.c2
-rw-r--r--kernel/seccomp.c64
-rw-r--r--scripts/gdb/linux/dmesg.py147
-rw-r--r--scripts/gdb/linux/utils.py7
-rw-r--r--scripts/selinux/mdp/mdp.c7
-rw-r--r--security/selinux/avc.c29
-rw-r--r--security/selinux/hooks.c17
-rw-r--r--security/selinux/include/conditional.h2
-rw-r--r--security/selinux/include/policycap.h20
-rw-r--r--security/selinux/include/policycap_names.h18
-rw-r--r--security/selinux/include/security.h63
-rw-r--r--security/selinux/selinuxfs.c259
-rw-r--r--security/selinux/ss/avtab.c49
-rw-r--r--security/selinux/ss/avtab.h1
-rw-r--r--security/selinux/ss/conditional.c155
-rw-r--r--security/selinux/ss/conditional.h2
-rw-r--r--security/selinux/ss/hashtab.c53
-rw-r--r--security/selinux/ss/hashtab.h6
-rw-r--r--security/selinux/ss/services.c875
-rw-r--r--security/selinux/ss/services.h5
-rw-r--r--security/selinux/ss/sidtab.c10
-rw-r--r--security/selinux/ss/sidtab.h2
-rw-r--r--security/smack/smack.h19
-rw-r--r--security/smack/smack_access.c55
-rw-r--r--security/smack/smack_lsm.c252
-rw-r--r--security/smack/smackfs.c23
-rw-r--r--security/tomoyo/util.c29
-rw-r--r--tools/testing/selftests/clone3/clone3.c45
-rw-r--r--tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c4
-rw-r--r--tools/testing/selftests/clone3/clone3_clear_sighand.c2
-rw-r--r--tools/testing/selftests/clone3/clone3_selftests.h24
-rw-r--r--tools/testing/selftests/clone3/clone3_set_tid.c4
-rw-r--r--tools/testing/selftests/pidfd/pidfd_setns_test.c2
-rw-r--r--tools/testing/selftests/seccomp/seccomp_bpf.c440
69 files changed, 5188 insertions, 1846 deletions
diff --git a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot b/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
index 49ed9c8fd1e5..ed6b52ca210f 100644
--- a/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
+++ b/Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
@@ -15,7 +15,7 @@ Description:
actual protection), and Android and Linux distributions have been
explicitly writing a "0" to /sys/fs/selinux/checkreqprot during
initialization for some time. Support for setting checkreqprot to 1
- will be removed in a future kernel release, at which point the kernel
+ will be removed no sooner than June 2021, at which point the kernel
will always cease using checkreqprot internally and will always
check the actual protections being applied upon mmap/mprotect calls.
The checkreqprot selinuxfs node will remain for backward compatibility
diff --git a/Documentation/admin-guide/kdump/gdbmacros.txt b/Documentation/admin-guide/kdump/gdbmacros.txt
index 220d0a80ca2c..82aecdcae8a6 100644
--- a/Documentation/admin-guide/kdump/gdbmacros.txt
+++ b/Documentation/admin-guide/kdump/gdbmacros.txt
@@ -170,57 +170,103 @@ document trapinfo
address the kernel panicked.
end
-define dump_log_idx
- set $idx = $arg0
- if ($argc > 1)
- set $prev_flags = $arg1
+define dump_record
+ set var $desc = $arg0
+ set var $info = $arg1
+ if ($argc > 2)
+ set var $prev_flags = $arg2
else
- set $prev_flags = 0
+ set var $prev_flags = 0
end
- set $msg = ((struct printk_log *) (log_buf + $idx))
- set $prefix = 1
- set $newline = 1
- set $log = log_buf + $idx + sizeof(*$msg)
-
- # prev & LOG_CONT && !(msg->flags & LOG_PREIX)
- if (($prev_flags & 8) && !($msg->flags & 4))
- set $prefix = 0
+
+ set var $prefix = 1
+ set var $newline = 1
+
+ set var $begin = $desc->text_blk_lpos.begin % (1U << prb->text_data_ring.size_bits)
+ set var $next = $desc->text_blk_lpos.next % (1U << prb->text_data_ring.size_bits)
+
+ # handle data-less record
+ if ($begin & 1)
+ set var $text_len = 0
+ set var $log = ""
+ else
+ # handle wrapping data block
+ if ($begin > $next)
+ set var $begin = 0
+ end
+
+ # skip over descriptor id
+ set var $begin = $begin + sizeof(long)
+
+ # handle truncated message
+ if ($next - $begin < $info->text_len)
+ set var $text_len = $next - $begin
+ else
+ set var $text_len = $info->text_len
+ end
+
+ set var $log = &prb->text_data_ring.data[$begin]
+ end
+
+ # prev & LOG_CONT && !(info->flags & LOG_PREIX)
+ if (($prev_flags & 8) && !($info->flags & 4))
+ set var $prefix = 0
end
- # msg->flags & LOG_CONT
- if ($msg->flags & 8)
+ # info->flags & LOG_CONT
+ if ($info->flags & 8)
# (prev & LOG_CONT && !(prev & LOG_NEWLINE))
if (($prev_flags & 8) && !($prev_flags & 2))
- set $prefix = 0
+ set var $prefix = 0
end
- # (!(msg->flags & LOG_NEWLINE))
- if (!($msg->flags & 2))
- set $newline = 0
+ # (!(info->flags & LOG_NEWLINE))
+ if (!($info->flags & 2))
+ set var $newline = 0
end
end
if ($prefix)
- printf "[%5lu.%06lu] ", $msg->ts_nsec / 1000000000, $msg->ts_nsec % 1000000000
+ printf "[%5lu.%06lu] ", $info->ts_nsec / 1000000000, $info->ts_nsec % 1000000000
end
- if ($msg->text_len != 0)
- eval "printf \"%%%d.%ds\", $log", $msg->text_len, $msg->text_len
+ if ($text_len)
+ eval "printf \"%%%d.%ds\", $log", $text_len, $text_len
end
if ($newline)
printf "\n"
end
- if ($msg->dict_len > 0)
- set $dict = $log + $msg->text_len
- set $idx = 0
- set $line = 1
- while ($idx < $msg->dict_len)
- if ($line)
- printf " "
- set $line = 0
+
+ # handle dictionary data
+
+ set var $dict = &$info->dev_info.subsystem[0]
+ set var $dict_len = sizeof($info->dev_info.subsystem)
+ if ($dict[0] != '\0')
+ printf " SUBSYSTEM="
+ set var $idx = 0
+ while ($idx < $dict_len)
+ set var $c = $dict[$idx]
+ if ($c == '\0')
+ loop_break
+ else
+ if ($c < ' ' || $c >= 127 || $c == '\\')
+ printf "\\x%02x", $c
+ else
+ printf "%c", $c
+ end
end
- set $c = $dict[$idx]
+ set var $idx = $idx + 1
+ end
+ printf "\n"
+ end
+
+ set var $dict = &$info->dev_info.device[0]
+ set var $dict_len = sizeof($info->dev_info.device)
+ if ($dict[0] != '\0')
+ printf " DEVICE="
+ set var $idx = 0
+ while ($idx < $dict_len)
+ set var $c = $dict[$idx]
if ($c == '\0')
- printf "\n"
- set $line = 1
+ loop_break
else
if ($c < ' ' || $c >= 127 || $c == '\\')
printf "\\x%02x", $c
@@ -228,33 +274,46 @@ define dump_log_idx
printf "%c", $c
end
end
- set $idx = $idx + 1
+ set var $idx = $idx + 1
end
printf "\n"
end
end
-document dump_log_idx
- Dump a single log given its index in the log buffer. The first
- parameter is the index into log_buf, the second is optional and
- specified the previous log buffer's flags, used for properly
- formatting continued lines.
+document dump_record
+ Dump a single record. The first parameter is the descriptor,
+ the second parameter is the info, the third parameter is
+ optional and specifies the previous record's flags, used for
+ properly formatting continued lines.
end
define dmesg
- set $i = log_first_idx
- set $end_idx = log_first_idx
- set $prev_flags = 0
+ # definitions from kernel/printk/printk_ringbuffer.h
+ set var $desc_committed = 1
+ set var $desc_finalized = 2
+ set var $desc_sv_bits = sizeof(long) * 8
+ set var $desc_flags_shift = $desc_sv_bits - 2
+ set var $desc_flags_mask = 3 << $desc_flags_shift
+ set var $id_mask = ~$desc_flags_mask
+
+ set var $desc_count = 1U << prb->desc_ring.count_bits
+ set var $prev_flags = 0
+
+ set var $id = prb->desc_ring.tail_id.counter
+ set var $end_id = prb->desc_ring.head_id.counter
while (1)
- set $msg = ((struct printk_log *) (log_buf + $i))
- if ($msg->len == 0)
- set $i = 0
- else
- dump_log_idx $i $prev_flags
- set $i = $i + $msg->len
- set $prev_flags = $msg->flags
+ set var $desc = &prb->desc_ring.descs[$id % $desc_count]
+ set var $info = &prb->desc_ring.infos[$id % $desc_count]
+
+ # skip non-committed record
+ set var $state = 3 & ($desc->state_var.counter >> $desc_flags_shift)
+ if ($state == $desc_committed || $state == $desc_finalized)
+ dump_record $desc $info $prev_flags
+ set var $prev_flags = $info->flags
end
- if ($i == $end_idx)
+
+ set var $id = ($id + 1) & $id_mask
+ if ($id == $end_id)
loop_break
end
end
diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 2baad0bfb09d..e44a6c01f336 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -189,50 +189,123 @@ from this.
Free areas descriptor. User-space tools use this value to iterate the
free_area ranges. MAX_ORDER is used by the zone buddy allocator.
-log_first_idx
+prb
+---
+
+A pointer to the printk ringbuffer (struct printk_ringbuffer). This
+may be pointing to the static boot ringbuffer or the dynamically
+allocated ringbuffer, depending on when the the core dump occurred.
+Used by user-space tools to read the active kernel log buffer.
+
+printk_rb_static
+----------------
+
+A pointer to the static boot printk ringbuffer. If @prb has a
+different value, this is useful for viewing the initial boot messages,
+which may have been overwritten in the dynamically allocated
+ringbuffer.
+
+clear_seq
+---------
+
+The sequence number of the printk() record after the last clear
+command. It indicates the first record after the last
+SYSLOG_ACTION_CLEAR, like issued by 'dmesg -c'. Used by user-space
+tools to dump a subset of the dmesg log.
+
+printk_ringbuffer
+-----------------
+
+The size of a printk_ringbuffer structure. This structure contains all
+information required for accessing the various components of the
+kernel log buffer.
+
+(printk_ringbuffer, desc_ring|text_data_ring|dict_data_ring|fail)
+-----------------------------------------------------------------
+
+Offsets for the various components of the printk ringbuffer. Used by
+user-space tools to view the kernel log buffer without requiring the
+declaration of the structure.
+
+prb_desc_ring
-------------
-Index of the first record stored in the buffer log_buf. Used by
-user-space tools to read the strings in the log_buf.
+The size of the prb_desc_ring structure. This structure contains
+information about the set of record descriptors.
-log_buf
--------
+(prb_desc_ring, count_bits|descs|head_id|tail_id)
+-------------------------------------------------
+
+Offsets for the fields describing the set of record descriptors. Used
+by user-space tools to be able to traverse the descriptors without
+requiring the declaration of the structure.
+
+prb_desc
+--------
+
+The size of the prb_desc structure. This structure contains
+information about a single record descriptor.
+
+(prb_desc, info|state_var|text_blk_lpos|dict_blk_lpos)
+------------------------------------------------------
+
+Offsets for the fields describing a record descriptors. Used by
+user-space tools to be able to read descriptors without requiring
+the declaration of the structure.
+
+prb_data_blk_lpos
+-----------------
+
+The size of the prb_data_blk_lpos structure. This structure contains
+information about where the text or dictionary data (data block) is
+located within the respective data ring.
+
+(prb_data_blk_lpos, begin|next)
+-------------------------------
-Console output is written to the ring buffer log_buf at index
-log_first_idx. Used to get the kernel log.
+Offsets for the fields describing the location of a data block. Used
+by user-space tools to be able to locate data blocks without
+requiring the declaration of the structure.
-log_buf_len
+printk_info
-----------
-log_buf's length.
+The size of the printk_info structure. This structure contains all
+the meta-data for a record.
-clear_idx
----------
+(printk_info, seq|ts_nsec|text_len|dict_len|caller_id)
+------------------------------------------------------
-The index that the next printk() record to read after the last clear
-command. It indicates the first record after the last SYSLOG_ACTION
-_CLEAR, like issued by 'dmesg -c'. Used by user-space tools to dump
-the dmesg log.
+Offsets for the fields providing the meta-data for a record. Used by
+user-space tools to be able to read the information without requiring
+the declaration of the structure.
-log_next_idx
-------------
+prb_data_ring
+-------------
-The index of the next record to store in the buffer log_buf. Used to
-compute the index of the current buffer position.
+The size of the prb_data_ring structure. This structure contains
+information about a set of data blocks.
-printk_log
-----------
+(prb_data_ring, size_bits|data|head_lpos|tail_lpos)
+---------------------------------------------------
-The size of a structure printk_log. Used to compute the size of
-messages, and extract dmesg log. It encapsulates header information for
-log_buf, such as timestamp, syslog level, etc.
+Offsets for the fields describing a set of data blocks. Used by
+user-space tools to be able to access the data blocks without
+requiring the declaration of the structure.
-(printk_log, ts_nsec|len|text_len|dict_len)
--------------------------------------------
+atomic_long_t
+-------------
+
+The size of the atomic_long_t structure. Used by user-space tools to
+be able to copy the full structure, regardless of its
+architecture-specific implementation.
+
+(atomic_long_t, counter)
+------------------------
-It represents field offsets in struct printk_log. User space tools
-parse it and check whether the values of printk_log's members have been
-changed.
+Offset for the long value of an atomic_long_t variable. Used by
+user-space tools to access the long value without requiring the
+architecture-specific declaration.
(free_area.free_list, MIGRATE_TYPES)
------------------------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 42f1230d1add..7eecfb345806 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13970,6 +13970,7 @@ PRINTK
M: Petr Mladek <pmladek@suse.com>
M: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
R: Steven Rostedt <rostedt@goodmis.org>
+R: John Ogness <john.ogness@linutronix.de>
S: Maintained
F: include/linux/printk.h
F: kernel/printk/
@@ -15620,6 +15621,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git
F: Documentation/ABI/obsolete/sysfs-selinux-checkreqprot
F: Documentation/ABI/obsolete/sysfs-selinux-disable
F: Documentation/admin-guide/LSM/SELinux.rst
+F: include/trace/events/avc.h
F: include/uapi/linux/selinux_netlink.h
F: scripts/selinux/
F: security/selinux/
diff --git a/arch/Kconfig b/arch/Kconfig
index 76ec3395b843..8519d9f42e33 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -450,10 +450,23 @@ config ARCH_WANT_OLD_COMPAT_IPC
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION
bool
+config HAVE_ARCH_SECCOMP
+ bool
+ help
+ An arch should select this symbol to support seccomp mode 1 (the fixed
+ syscall policy), and must provide an overrides for __NR_seccomp_sigreturn,
+ and compat syscalls if the asm-generic/seccomp.h defaults need adjustment:
+ - __NR_seccomp_read_32
+ - __NR_seccomp_write_32
+ - __NR_seccomp_exit_32
+ - __NR_seccomp_sigreturn_32
+
config HAVE_ARCH_SECCOMP_FILTER
bool
+ select HAVE_ARCH_SECCOMP
help
An arch should select this symbol if it provides all of these things:
+ - all the requirements for HAVE_ARCH_SECCOMP
- syscall_get_arch()
- syscall_get_arguments()
- syscall_rollback()
@@ -464,6 +477,23 @@ config HAVE_ARCH_SECCOMP_FILTER
results in the system call being skipped immediately.
- seccomp syscall wired up
+config SECCOMP
+ prompt "Enable seccomp to safely execute untrusted bytecode"
+ def_bool y
+ depends on HAVE_ARCH_SECCOMP
+ help
+ This kernel feature is useful for number crunching applications
+ that may need to handle untrusted bytecode during their
+ execution. By using pipes or other transports made available
+ to the process as file descriptors supporting the read/write
+ syscalls, it's possible to isolate those applications in their
+ own address space using seccomp. Once seccomp is enabled via
+ prctl(PR_SET_SECCOMP) or the seccomp() syscall, it cannot be
+ disabled and the task is only allowed to execute a few safe
+ syscalls defined by each seccomp mode.
+
+ If unsure, say Y.
+
config SECCOMP_FILTER
def_bool y
depends on HAVE_ARCH_SECCOMP_FILTER && SECCOMP && NET
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1ed277f2e4fc..3996b6572c3a 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -68,6 +68,7 @@ config ARM
select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU
select HAVE_ARCH_MMAP_RND_BITS if MMU
+ select HAVE_ARCH_SECCOMP
select HAVE_ARCH_SECCOMP_FILTER if AEABI && !OABI_COMPAT
select HAVE_ARCH_THREAD_STRUCT_WHITELIST
select HAVE_ARCH_TRACEHOOK
@@ -1618,20 +1619,6 @@ config UACCESS_WITH_MEMCPY
However, if the CPU data cache is using a write-allocate mode,
this option is unlikely to provide any performance gain.
-config SECCOMP
- bool
- prompt "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
config PARAVIRT
bool "Enable paravirtualization code"
help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 583b6a60b094..893130ce1626 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1041,19 +1041,6 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
config CC_HAVE_SHADOW_CALL_STACK
def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18)
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
config PARAVIRT
bool "Enable paravirtualization code"
help
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 3d5afb5f5685..7f424c85772c 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -309,16 +309,3 @@ endmenu
source "arch/csky/Kconfig.platforms"
source "kernel/Kconfig.hz"
-
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index d262ac0c8714..37bd6a5f38fb 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -26,6 +26,7 @@ config MICROBLAZE
select GENERIC_SCHED_CLOCK
select HAVE_ARCH_HASH
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_SECCOMP
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
@@ -120,23 +121,6 @@ config CMDLINE_FORCE
Set this to have arguments from the default kernel command string
override those passed by the boot loader.
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
- default y
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y. Only embedded should say N here.
-
endmenu
menu "Kernel features"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index cff19225da3d..440614dc9de2 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -3006,23 +3006,6 @@ config PHYSICAL_START
specified in the "crashkernel=YM@XM" command line boot parameter
passed to the panic-ed kernel).
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
- default y
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y. Only embedded should say N here.
-
config MIPS_O32_FP64_SUPPORT
bool "Support for O32 binaries using 64-bit FP" if !CPU_MIPSR6
depends on 32BIT || MIPS32_O32
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 3b0f53dd70bc..cd4afe1e7a6c 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -378,19 +378,3 @@ endmenu
source "drivers/parisc/Kconfig"
-
-config SECCOMP
- def_bool y
- prompt "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y. Only embedded should say N here.
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 592036103493..1f0bd7e223f5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -946,23 +946,6 @@ config ARCH_WANTS_FREEZER_CONTROL
source "kernel/power/Kconfig"
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
- default y
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y. Only embedded should say N here.
-
config PPC_MEM_KEYS
prompt "PowerPC Memory Protection Keys"
def_bool y
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 7766e1289468..b7821ac36d28 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -334,19 +334,6 @@ menu "Kernel features"
source "kernel/Kconfig.hz"
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
config RISCV_SBI_V01
bool "SBI v0.1 support"
default y
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0a3899386a51..d509bf23ef78 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -792,23 +792,6 @@ config CRASH_DUMP
endmenu
-config SECCOMP
- def_bool y
- prompt "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y.
-
config CCW
def_bool y
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d20927128fce..18278152c91c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -600,22 +600,6 @@ config PHYSICAL_START
where the fail safe kernel needs to run at a different address
than the panic-ed kernel.
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl, it cannot be disabled and the task is only
- allowed to execute a few safe syscalls defined by each seccomp
- mode.
-
- If unsure, say N.
-
config SMP
bool "Symmetric multi-processing support"
depends on SYS_SUPPORTS_SMP
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 91ed1104b7f4..096530eac8e1 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -23,6 +23,7 @@ config SPARC
select HAVE_OPROFILE
select HAVE_ARCH_KGDB if !SMP || SPARC64
select HAVE_ARCH_TRACEHOOK
+ select HAVE_ARCH_SECCOMP if SPARC64
select HAVE_EXIT_THREAD
select HAVE_PCI
select SYSCTL_EXCEPTION_TRACE
@@ -227,23 +228,6 @@ config EARLYFB
help
Say Y here to enable a faster early framebuffer boot console.
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- depends on SPARC64 && PROC_FS
- default y
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y. Only embedded should say N here.
-
config HOTPLUG_CPU
bool "Support for hot-pluggable CPUs"
depends on SPARC64 && SMP
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index eb51fec75948..d49f471b02e3 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -173,22 +173,6 @@ config PGTABLE_LEVELS
default 3 if 3_LEVEL_PGTABLES
default 2
-config SECCOMP
- def_bool y
- prompt "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y.
-
config UML_TIME_TRAVEL_SUPPORT
bool
prompt "Support time-travel mode (e.g. for test execution)"
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 835d93006bd6..255084c65138 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1970,22 +1970,6 @@ config EFI_MIXED
If unsure, say N.
-config SECCOMP
- def_bool y
- prompt "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y. Only embedded should say N here.
-
source "kernel/Kconfig.hz"
config KEXEC
diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index e997e0119c02..d8a29dc5a284 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -217,20 +217,6 @@ config HOTPLUG_CPU
Say N if you want to disable CPU hotplug.
-config SECCOMP
- bool
- prompt "Enable seccomp to safely compute untrusted bytecode"
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
config FAST_SYSCALL_XTENSA
bool "Enable fast atomic syscalls"
default n
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 8dd753539c06..0bad9e6066c3 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -4061,22 +4061,21 @@ void device_shutdown(void)
*/
#ifdef CONFIG_PRINTK
-static int
-create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen)
+static void
+set_dev_info(const struct device *dev, struct dev_printk_info *dev_info)
{
const char *subsys;
- size_t pos = 0;
+
+ memset(dev_info, 0, sizeof(*dev_info));
if (dev->class)
subsys = dev->class->name;
else if (dev->bus)
subsys = dev->bus->name;
else
- return 0;
+ return;
- pos += snprintf(hdr + pos, hdrlen - pos, "SUBSYSTEM=%s", subsys);
- if (pos >= hdrlen)
- goto overflow;
+ strscpy(dev_info->subsystem, subsys, sizeof(dev_info->subsystem));
/*
* Add device identifier DEVICE=:
@@ -4092,41 +4091,28 @@ create_syslog_header(const struct device *dev, char *hdr, size_t hdrlen)
c = 'b';
else
c = 'c';
- pos++;
- pos += snprintf(hdr + pos, hdrlen - pos,
- "DEVICE=%c%u:%u",
- c, MAJOR(dev->devt), MINOR(dev->devt));
+
+ snprintf(dev_info->device, sizeof(dev_info->device),
+ "%c%u:%u", c, MAJOR(dev->devt), MINOR(dev->devt));
} else if (strcmp(subsys, "net") == 0) {
struct net_device *net = to_net_dev(dev);
- pos++;
- pos += snprintf(hdr + pos, hdrlen - pos,
- "DEVICE=n%u", net->ifindex);
+ snprintf(dev_info->device, sizeof(dev_info->device),
+ "n%u", net->ifindex);
} else {
- pos++;
- pos += snprintf(hdr + pos, hdrlen - pos,
- "DEVICE=+%s:%s", subsys, dev_name(dev));
+ snprintf(dev_info->device, sizeof(dev_info->device),
+ "+%s:%s", subsys, dev_name(dev));
}
-
- if (pos >= hdrlen)
- goto overflow;
-
- return pos;
-
-overflow:
- dev_WARN(dev, "device/subsystem name too long");
- return 0;
}
int dev_vprintk_emit(int level, const struct device *dev,
const char *fmt, va_list args)
{
- char hdr[128];
- size_t hdrlen;
+ struct dev_printk_info dev_info;
- hdrlen = create_syslog_header(dev, hdr, sizeof(hdr));
+ set_dev_info(dev, &dev_info);
- return vprintk_emit(0, level, hdrlen ? hdr : NULL, hdrlen, fmt, args);
+ return vprintk_emit(0, level, &dev_info, fmt, args);
}
EXPORT_SYMBOL(dev_vprintk_emit);
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index 6594dbc34a37..206bde8308b2 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -55,6 +55,9 @@ phys_addr_t paddr_vmcoreinfo_note(void);
#define VMCOREINFO_OFFSET(name, field) \
vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
(unsigned long)offsetof(struct name, field))
+#define VMCOREINFO_TYPE_OFFSET(name, field) \
+ vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \
+ (unsigned long)offsetof(name, field))
#define VMCOREINFO_LENGTH(name, value) \
vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value)
#define VMCOREINFO_NUMBER(name) \
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index e7e45f0cc7da..2915f56ad421 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -2,9 +2,9 @@
#ifndef __LINUX_DEBUG_LOCKING_H
#define __LINUX_DEBUG_LOCKING_H
-#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/bug.h>
+#include <linux/printk.h>
struct task_struct;
diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h
index 3028b644b4fb..6f009559ee54 100644
--- a/include/linux/dev_printk.h
+++ b/include/linux/dev_printk.h
@@ -21,6 +21,14 @@
struct device;
+#define PRINTK_INFO_SUBSYSTEM_LEN 16
+#define PRINTK_INFO_DEVICE_LEN 48
+
+struct dev_printk_info {
+ char subsystem[PRINTK_INFO_SUBSYSTEM_LEN];
+ char device[PRINTK_INFO_DEVICE_LEN];
+};
+
#ifdef CONFIG_PRINTK
__printf(3, 0) __cold
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index c25b8e41c0ea..e4aa29b1ad62 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -526,7 +526,6 @@ extern unsigned int sysctl_oops_all_cpu_backtrace;
#endif /* CONFIG_SMP */
extern void bust_spinlocks(int yes);
-extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
extern int panic_timeout;
extern unsigned long panic_print;
extern int panic_on_oops;
diff --git a/include/linux/overflow.h b/include/linux/overflow.h
index 93fcef105061..f1c4e7b56bd9 100644
--- a/include/linux/overflow.h
+++ b/include/linux/overflow.h
@@ -43,6 +43,16 @@
#define is_non_negative(a) ((a) > 0 || (a) == 0)
#define is_negative(a) (!(is_non_negative(a)))
+/*
+ * Allows for effectively applying __must_check to a macro so we can have
+ * both the type-agnostic benefits of the macros while also being able to
+ * enforce that the return value is, in fact, checked.
+ */
+static inline bool __must_check __must_check_overflow(bool overflow)
+{
+ return unlikely(overflow);
+}
+
#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
/*
* For simplicity and code hygiene, the fallback code below insists on
@@ -52,32 +62,32 @@
* alias for __builtin_add_overflow, but add type checks similar to
* below.
*/
-#define check_add_overflow(a, b, d) ({ \
+#define check_add_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_add_overflow(__a, __b, __d); \
-})
+}))
-#define check_sub_overflow(a, b, d) ({ \
+#define check_sub_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_sub_overflow(__a, __b, __d); \
-})
+}))
-#define check_mul_overflow(a, b, d) ({ \
+#define check_mul_overflow(a, b, d) __must_check_overflow(({ \
typeof(a) __a = (a); \
typeof(b) __b = (b); \
typeof(d) __d = (d); \
(void) (&__a == &__b); \
(void) (&__a == __d); \
__builtin_mul_overflow(__a, __b, __d); \
-})
+}))
#else
@@ -190,21 +200,20 @@
})
-#define check_add_overflow(a, b, d) \
+#define check_add_overflow(a, b, d) __must_check_overflow( \
__builtin_choose_expr(is_signed_type(typeof(a)), \
__signed_add_overflow(a, b, d), \
- __unsigned_add_overflow(a, b, d))
+ __unsigned_add_overflow(a, b, d)))
-#define check_sub_overflow(a, b, d) \
+#define check_sub_overflow(a, b, d) __must_check_overflow( \
__builtin_choose_expr(is_signed_type(typeof(a)), \
__signed_sub_overflow(a, b, d), \
- __unsigned_sub_overflow(a, b, d))
+ __unsigned_sub_overflow(a, b, d)))
-#define check_mul_overflow(a, b, d) \
+#define check_mul_overflow(a, b, d) __must_check_overflow( \
__builtin_choose_expr(is_signed_type(typeof(a)), \
__signed_mul_overflow(a, b, d), \
- __unsigned_mul_overflow(a, b, d))
-
+ __unsigned_mul_overflow(a, b, d)))
#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */
@@ -227,7 +236,7 @@
* '*d' will hold the results of the attempted shift, but is not
* considered "safe for use" if false is returned.
*/
-#define check_shl_overflow(a, s, d) ({ \
+#define check_shl_overflow(a, s, d) __must_check_overflow(({ \
typeof(a) _a = a; \
typeof(s) _s = s; \
typeof(d) _d = d; \
@@ -237,7 +246,7 @@
*_d = (_a_full << _to_shift); \
(_to_shift != _s || is_negative(*_d) || is_negative(_a) || \
(*_d >> _to_shift) != _a); \
-})
+}))
/**
* array_size() - Calculate size of 2-dimensional array.
diff --git a/include/linux/printk.h b/include/linux/printk.h
index 34c1a7be3e01..78479633ccfc 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -12,6 +12,8 @@
extern const char linux_banner[];
extern const char linux_proc_banner[];
+extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */
+
#define PRINTK_MAX_SINGLE_HEADER_LEN 2
static inline int printk_get_level(const char *buffer)
@@ -159,10 +161,12 @@ static inline void printk_nmi_direct_enter(void) { }
static inline void printk_nmi_direct_exit(void) { }
#endif /* PRINTK_NMI */
+struct dev_printk_info;
+
#ifdef CONFIG_PRINTK
-asmlinkage __printf(5, 0)
+asmlinkage __printf(4, 0)
int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args);
asmlinkage __printf(1, 0)
diff --git a/include/trace/events/avc.h b/include/trace/events/avc.h
new file mode 100644
index 000000000000..b55fda2e0773
--- /dev/null
+++ b/include/trace/events/avc.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Authors: Thiébaud Weksteen <tweek@google.com>
+ * Peter Enderborg <Peter.Enderborg@sony.com>
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM avc
+
+#if !defined(_TRACE_SELINUX_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SELINUX_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(selinux_audited,
+
+ TP_PROTO(struct selinux_audit_data *sad,
+ char *scontext,
+ char *tcontext,
+ const char *tclass
+ ),
+
+ TP_ARGS(sad, scontext, tcontext, tclass),
+
+ TP_STRUCT__entry(
+ __field(u32, requested)
+ __field(u32, denied)
+ __field(u32, audited)
+ __field(int, result)
+ __string(scontext, scontext)
+ __string(tcontext, tcontext)
+ __string(tclass, tclass)
+ ),
+
+ TP_fast_assign(
+ __entry->requested = sad->requested;
+ __entry->denied = sad->denied;
+ __entry->audited = sad->audited;
+ __entry->result = sad->result;
+ __assign_str(tcontext, tcontext);
+ __assign_str(scontext, scontext);
+ __assign_str(tclass, tclass);
+ ),
+
+ TP_printk("requested=0x%x denied=0x%x audited=0x%x result=%d scontext=%s tcontext=%s tclass=%s",
+ __entry->requested, __entry->denied, __entry->audited, __entry->result,
+ __get_str(scontext), __get_str(tcontext), __get_str(tclass)
+ )
+);
+
+#endif
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/init/Kconfig b/init/Kconfig
index c0f56e455dce..2a583ef2ff66 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -682,7 +682,8 @@ config IKHEADERS
config LOG_BUF_SHIFT
int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
- range 12 25
+ range 12 25 if !H8300
+ range 12 19 if H8300
default 17
depends on PRINTK
help
diff --git a/kernel/audit.c b/kernel/audit.c
index 7efaece534a9..68cee3bc8cfe 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -123,9 +123,9 @@ static u32 audit_backlog_limit = 64;
static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
/* The identity of the user shutting down the audit system. */
-kuid_t audit_sig_uid = INVALID_UID;
-pid_t audit_sig_pid = -1;
-u32 audit_sig_sid = 0;
+static kuid_t audit_sig_uid = INVALID_UID;
+static pid_t audit_sig_pid = -1;
+static u32 audit_sig_sid;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -934,8 +934,7 @@ static void audit_free_reply(struct audit_reply *reply)
if (!reply)
return;
- if (reply->skb)
- kfree_skb(reply->skb);
+ kfree_skb(reply->skb);
if (reply->net)
put_net(reply->net);
kfree(reply);
diff --git a/kernel/audit.h b/kernel/audit.h
index ddc22878433d..3b9c0945225a 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -327,10 +327,6 @@ static inline int audit_signal_info_syscall(struct task_struct *t)
extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
-extern pid_t audit_sig_pid;
-extern kuid_t audit_sig_uid;
-extern u32 audit_sig_sid;
-
extern int audit_filter(int msgtype, unsigned int listtype);
extern void audit_ctl_lock(void);
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4d052fc6bcde..eee3dc9b60a9 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -2,3 +2,4 @@
obj-y = printk.o
obj-$(CONFIG_PRINTK) += printk_safe.o
obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
+obj-$(CONFIG_PRINTK) += printk_ringbuffer.o
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 660f9a6bf73a..3a8fd491758c 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -14,9 +14,9 @@
extern raw_spinlock_t logbuf_lock;
-__printf(5, 0)
+__printf(4, 0)
int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args);
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 9b75f6bfc333..fe64a49344bf 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -55,6 +55,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
+#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"
@@ -294,30 +295,22 @@ enum con_msg_format_flags {
static int console_msg_format = MSG_FORMAT_DEFAULT;
/*
- * The printk log buffer consists of a chain of concatenated variable
- * length records. Every record starts with a record header, containing
- * the overall length of the record.
+ * The printk log buffer consists of a sequenced collection of records, each
+ * containing variable length message text. Every record also contains its
+ * own meta-data (@info).
*
- * The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these entries are maintained when messages are
- * stored.
+ * Every record meta-data carries the timestamp in microseconds, as well as
+ * the standard userspace syslog level and syslog facility. The usual kernel
+ * messages use LOG_KERN; userspace-injected messages always carry a matching
+ * syslog facility, by default LOG_USER. The origin of every message can be
+ * reliably determined that way.
*
- * If the heads indicate available messages, the length in the header
- * tells the start next message. A length == 0 for the next message
- * indicates a wrap-around to the beginning of the buffer.
+ * The human readable log message of a record is available in @text, the
+ * length of the message text in @text_len. The stored message is not
+ * terminated.
*
- * Every record carries the monotonic timestamp in microseconds, as well as
- * the standard userspace syslog level and syslog facility. The usual
- * kernel messages use LOG_KERN; userspace-injected messages always carry
- * a matching syslog facility, by default LOG_USER. The origin of every
- * message can be reliably determined that way.
- *
- * The human readable log message directly follows the message header. The
- * length of the message text is stored in the header, the stored message
- * is not terminated.
- *
- * Optionally, a message can carry a dictionary of properties (key/value pairs),
- * to provide userspace with a machine-readable message context.
+ * Optionally, a record can carry a dictionary of properties (key/value
+ * pairs), to provide userspace with a machine-readable message context.
*
* Examples for well-defined, commonly used property names are:
* DEVICE=b12:8 device identifier
@@ -327,25 +320,22 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
* +sound:card0 subsystem:devname
* SUBSYSTEM=pci driver-core subsystem name
*
- * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value
- * follows directly after a '=' character. Every property is terminated by
- * a '\0' character. The last property is not terminated.
- *
- * Example of a message structure:
- * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec
- * 0008 34 00 record is 52 bytes long
- * 000a 0b 00 text is 11 bytes long
- * 000c 1f 00 dictionary is 23 bytes long
- * 000e 03 00 LOG_KERN (facility) LOG_ERR (level)
- * 0010 69 74 27 73 20 61 20 6c "it's a l"
- * 69 6e 65 "ine"
- * 001b 44 45 56 49 43 "DEVIC"
- * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D"
- * 52 49 56 45 52 3d 62 75 "RIVER=bu"
- * 67 "g"
- * 0032 00 00 00 padding to next message header
- *
- * The 'struct printk_log' buffer header must never be directly exported to
+ * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
+ * and values are terminated by a '\0' character.
+ *
+ * Example of record values:
+ * record.text_buf = "it's a line" (unterminated)
+ * record.info.seq = 56
+ * record.info.ts_nsec = 36863
+ * record.info.text_len = 11
+ * record.info.facility = 0 (LOG_KERN)
+ * record.info.flags = 0
+ * record.info.level = 3 (LOG_ERR)
+ * record.info.caller_id = 299 (task 299)
+ * record.info.dev_info.subsystem = "pci" (terminated)
+ * record.info.dev_info.device = "+pci:0000:00:01.0" (terminated)
+ *
+ * The 'struct printk_info' buffer must never be directly exported to
* userspace, it is a kernel-private implementation detail that might
* need to be changed in the future, when the requirements change.
*
@@ -365,23 +355,6 @@ enum log_flags {
LOG_CONT = 8, /* text is a fragment of a continuation line */
};
-struct printk_log {
- u64 ts_nsec; /* timestamp in nanoseconds */
- u16 len; /* length of entire record */
- u16 text_len; /* length of text buffer */
- u16 dict_len; /* length of dictionary buffer */
- u8 facility; /* syslog facility */
- u8 flags:5; /* internal record flags */
- u8 level:3; /* syslog level */
-#ifdef CONFIG_PRINTK_CALLER
- u32 caller_id; /* thread id or processor id */
-#endif
-}
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
-__packed __aligned(4)
-#endif
-;
-
/*
* The logbuf_lock protects kmsg buffer, indices, counters. This can be taken
* within the scheduler's rq lock. It must be released before calling
@@ -421,26 +394,16 @@ DEFINE_RAW_SPINLOCK(logbuf_lock);
DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
-static u32 syslog_idx;
static size_t syslog_partial;
static bool syslog_time;
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
-
/* the next printk record to write to the console */
static u64 console_seq;
-static u32 console_idx;
static u64 exclusive_console_stop_seq;
+static unsigned long console_dropped;
/* the next printk record to read after the last 'clear' command */
static u64 clear_seq;
-static u32 clear_idx;
#ifdef CONFIG_PRINTK_CALLER
#define PREFIX_MAX 48
@@ -453,7 +416,7 @@ static u32 clear_idx;
#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
/* record buffer */
-#define LOG_ALIGN __alignof__(struct printk_log)
+#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
#define LOG_BUF_LEN_MAX (u32)(1 << 31)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
@@ -461,6 +424,23 @@ static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
/*
+ * Define the average message size. This only affects the number of
+ * descriptors that will be available. Underestimating is better than
+ * overestimating (too many available descriptors is better than not enough).
+ */
+#define PRB_AVGBITS 5 /* 32 character average length */
+
+#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
+#error CONFIG_LOG_BUF_SHIFT value too small.
+#endif
+_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
+ PRB_AVGBITS, &__log_buf[0]);
+
+static struct printk_ringbuffer printk_rb_dynamic;
+
+static struct printk_ringbuffer *prb = &printk_rb_static;
+
+/*
* We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
* per_cpu_areas are initialised. This variable is set to true when
* it's safe to access per-CPU data.
@@ -484,108 +464,6 @@ u32 log_buf_len_get(void)
return log_buf_len;
}
-/* human readable text of the record */
-static char *log_text(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log);
-}
-
-/* optional key/value pair dictionary attached to the record */
-static char *log_dict(const struct printk_log *msg)
-{
- return (char *)msg + sizeof(struct printk_log) + msg->text_len;
-}
-
-/* get record by index; idx must point to valid msg */
-static struct printk_log *log_from_idx(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer.
- */
- if (!msg->len)
- return (struct printk_log *)log_buf;
- return msg;
-}
-
-/* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
-{
- struct printk_log *msg = (struct printk_log *)(log_buf + idx);
-
- /* length == 0 indicates the end of the buffer; wrap */
- /*
- * A length == 0 record is the end of buffer marker. Wrap around and
- * read the message at the start of the buffer as *this* one, and
- * return the one after that.
- */
- if (!msg->len) {
- msg = (struct printk_log *)log_buf;
- return msg->len;
- }
- return idx + msg->len;
-}
-
-/*
- * Check whether there is enough free space for the given message.
- *
- * The same values of first_idx and next_idx mean that the buffer
- * is either empty or full.
- *
- * If the buffer is empty, we must respect the position of the indexes.
- * They cannot be reset to the beginning of the buffer.
- */
-static int logbuf_has_space(u32 msg_size, bool empty)
-{
- u32 free;
-
- if (log_next_idx > log_first_idx || empty)
- free = max(log_buf_len - log_next_idx, log_first_idx);
- else
- free = log_first_idx - log_next_idx;
-
- /*
- * We need space also for an empty header that signalizes wrapping
- * of the buffer.
- */
- return free >= msg_size + sizeof(struct printk_log);
-}
-
-static int log_make_free_space(u32 msg_size)
-{
- while (log_first_seq < log_next_seq &&
- !logbuf_has_space(msg_size, false)) {
- /* drop old messages until we have enough contiguous space */
- log_first_idx = log_next(log_first_idx);
- log_first_seq++;
- }
-
- if (clear_seq < log_first_seq) {
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
- }
-
- /* sequence numbers are equal, so the log buffer is empty */
- if (logbuf_has_space(msg_size, log_first_seq == log_next_seq))
- return 0;
-
- return -ENOMEM;
-}
-
-/* compute the message size including the padding bytes */
-static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
-{
- u32 size;
-
- size = sizeof(struct printk_log) + text_len + dict_len;
- *pad_len = (-size) & (LOG_ALIGN - 1);
- size += *pad_len;
-
- return size;
-}
-
/*
* Define how much of the log buffer we could take at maximum. The value
* must be greater than two. Note that only half of the buffer is available
@@ -594,84 +472,69 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len)
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";
-static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len,
- u16 *dict_len, u32 *pad_len)
+static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
/*
* The message should not take the whole buffer. Otherwise, it might
* get removed too soon.
*/
u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART;
+
if (*text_len > max_text_len)
*text_len = max_text_len;
- /* enable the warning message */
+
+ /* enable the warning message (if there is room) */
*trunc_msg_len = strlen(trunc_msg);
- /* disable the "dict" completely */
- *dict_len = 0;
- /* compute the size again, count also the warning message */
- return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len);
+ if (*text_len >= *trunc_msg_len)
+ *text_len -= *trunc_msg_len;
+ else
+ *trunc_msg_len = 0;
}
/* insert record into the buffer, discard old ones, update heads */
static int log_store(u32 caller_id, int facility, int level,
enum log_flags flags, u64 ts_nsec,
- const char *dict, u16 dict_len,
+ const struct dev_printk_info *dev_info,
const char *text, u16 text_len)
{
- struct printk_log *msg;
- u32 size, pad_len;
+ struct prb_reserved_entry e;
+ struct printk_record r;
u16 trunc_msg_len = 0;
- /* number of '\0' padding bytes to next message */
- size = msg_used_size(text_len, dict_len, &pad_len);
+ prb_rec_init_wr(&r, text_len);
- if (log_make_free_space(size)) {
+ if (!prb_reserve(&e, prb, &r)) {
/* truncate the message if it is too long for empty buffer */
- size = truncate_msg(&text_len, &trunc_msg_len,
- &dict_len, &pad_len);
+ truncate_msg(&text_len, &trunc_msg_len);
+ prb_rec_init_wr(&r, text_len + trunc_msg_len);
/* survive when the log buffer is too small for trunc_msg */
- if (log_make_free_space(size))
+ if (!prb_reserve(&e, prb, &r))
return 0;
}
- if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) {
- /*
- * This message + an additional empty header does not fit
- * at the end of the buffer. Add an empty header with len == 0
- * to signify a wrap around.
- */
- memset(log_buf + log_next_idx, 0, sizeof(struct printk_log));
- log_next_idx = 0;
- }
-
/* fill message */
- msg = (struct printk_log *)(log_buf + log_next_idx);
- memcpy(log_text(msg), text, text_len);
- msg->text_len = text_len;
- if (trunc_msg_len) {
- memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len);
- msg->text_len += trunc_msg_len;
- }
- memcpy(log_dict(msg), dict, dict_len);
- msg->dict_len = dict_len;
- msg->facility = facility;
- msg->level = level & 7;
- msg->flags = flags & 0x1f;
+ memcpy(&r.text_buf[0], text, text_len);
+ if (trunc_msg_len)
+ memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
+ r.info->text_len = text_len + trunc_msg_len;
+ r.info->facility = facility;
+ r.info->level = level & 7;
+ r.info->flags = flags & 0x1f;
if (ts_nsec > 0)
- msg->ts_nsec = ts_nsec;
+ r.info->ts_nsec = ts_nsec;
else
- msg->ts_nsec = local_clock();
-#ifdef CONFIG_PRINTK_CALLER
- msg->caller_id = caller_id;
-#endif
- memset(log_dict(msg) + dict_len, 0, pad_len);
- msg->len = size;
+ r.info->ts_nsec = local_clock();
+ r.info->caller_id = caller_id;
+ if (dev_info)
+ memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));
/* insert message */
- log_next_idx += msg->len;
- log_next_seq++;
+ if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE))
+ prb_commit(&e);
+ else
+ prb_final_commit(&e);
- return msg->text_len;
+ return (text_len + trunc_msg_len);
}
int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
@@ -723,13 +586,13 @@ static void append_char(char **pp, char *e, char c)
*(*pp)++ = c;
}
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg, u64 seq)
+static ssize_t info_print_ext_header(char *buf, size_t size,
+ struct printk_info *info)
{
- u64 ts_usec = msg->ts_nsec;
+ u64 ts_usec = info->ts_nsec;
char caller[20];
#ifdef CONFIG_PRINTK_CALLER
- u32 id = msg->caller_id;
+ u32 id = info->caller_id;
snprintf(caller, sizeof(caller), ",caller=%c%u",
id & 0x80000000 ? 'C' : 'T', id & ~0x80000000);
@@ -740,13 +603,13 @@ static ssize_t msg_print_ext_header(char *buf, size_t size,
do_div(ts_usec, 1000);
return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
- (msg->facility << 3) | msg->level, seq, ts_usec,
- msg->flags & LOG_CONT ? 'c' : '-', caller);
+ (info->facility << 3) | info->level, info->seq,
+ ts_usec, info->flags & LOG_CONT ? 'c' : '-', caller);
}
-static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len)
+static ssize_t msg_add_ext_text(char *buf, size_t size,
+ const char *text, size_t text_len,
+ unsigned char endc)
{
char *p = buf, *e = buf + size;
size_t i;
@@ -760,45 +623,56 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
else
append_char(&p, e, c);
}
- append_char(&p, e, '\n');
+ append_char(&p, e, endc);
- if (dict_len) {
- bool line = true;
+ return p - buf;
+}
- for (i = 0; i < dict_len; i++) {
- unsigned char c = dict[i];
+static ssize_t msg_add_dict_text(char *buf, size_t size,
+ const char *key, const char *val)
+{
+ size_t val_len = strlen(val);
+ ssize_t len;
- if (line) {
- append_char(&p, e, ' ');
- line = false;
- }
+ if (!val_len)
+ return 0;
- if (c == '\0') {
- append_char(&p, e, '\n');
- line = true;
- continue;
- }
+ len = msg_add_ext_text(buf, size, "", 0, ' '); /* dict prefix */
+ len += msg_add_ext_text(buf + len, size - len, key, strlen(key), '=');
+ len += msg_add_ext_text(buf + len, size - len, val, val_len, '\n');
- if (c < ' ' || c >= 127 || c == '\\') {
- p += scnprintf(p, e - p, "\\x%02x", c);
- continue;
- }
+ return len;
+}
- append_char(&p, e, c);
- }
- append_char(&p, e, '\n');
- }
+static ssize_t msg_print_ext_body(char *buf, size_t size,
+ char *text, size_t text_len,
+ struct dev_printk_info *dev_info)
+{
+ ssize_t len;
- return p - buf;
+ len = msg_add_ext_text(buf, size, text, text_len, '\n');
+
+ if (!dev_info)
+ goto out;
+
+ len += msg_add_dict_text(buf + len, size - len, "SUBSYSTEM",
+ dev_info->subsystem);
+ len += msg_add_dict_text(buf + len, size - len, "DEVICE",
+ dev_info->device);
+out:
+ return len;
}
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
u64 seq;
- u32 idx;
struct ratelimit_state rs;
struct mutex lock;
char buf[CONSOLE_EXT_LOG_MAX];
+
+ struct printk_info info;
+ char text_buf[CONSOLE_EXT_LOG_MAX];
+ struct printk_record record;
};
static __printf(3, 4) __cold
@@ -808,7 +682,7 @@ int devkmsg_emit(int facility, int level, const char *fmt, ...)
int r;
va_start(args, fmt);
- r = vprintk_emit(facility, level, NULL, 0, fmt, args);
+ r = vprintk_emit(facility, level, NULL, fmt, args);
va_end(args);
return r;
@@ -881,7 +755,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct devkmsg_user *user = file->private_data;
- struct printk_log *msg;
+ struct printk_record *r = &user->record;
size_t len;
ssize_t ret;
@@ -893,7 +767,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
return ret;
logbuf_lock_irq();
- while (user->seq == log_next_seq) {
+ if (!prb_read_valid(prb, user->seq, r)) {
if (file->f_flags & O_NONBLOCK) {
ret = -EAGAIN;
logbuf_unlock_irq();
@@ -902,30 +776,26 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
logbuf_unlock_irq();
ret = wait_event_interruptible(log_wait,
- user->seq != log_next_seq);
+ prb_read_valid(prb, user->seq, r));
if (ret)
goto out;
logbuf_lock_irq();
}
- if (user->seq < log_first_seq) {
+ if (user->seq < prb_first_valid_seq(prb)) {
/* our last seen message is gone, return error and reset */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->seq = prb_first_valid_seq(prb);
ret = -EPIPE;
logbuf_unlock_irq();
goto out;
}
- msg = log_from_idx(user->idx);
- len = msg_print_ext_header(user->buf, sizeof(user->buf),
- msg, user->seq);
+ len = info_print_ext_header(user->buf, sizeof(user->buf), r->info);
len += msg_print_ext_body(user->buf + len, sizeof(user->buf) - len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
+ &r->text_buf[0], r->info->text_len,
+ &r->info->dev_info);
- user->idx = log_next(user->idx);
- user->seq++;
+ user->seq = r->info->seq + 1;
logbuf_unlock_irq();
if (len > count) {
@@ -965,8 +835,7 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case SEEK_SET:
/* the first record */
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->seq = prb_first_valid_seq(prb);
break;
case SEEK_DATA:
/*
@@ -974,13 +843,11 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
* like issued by 'dmesg -c'. Reading /dev/kmsg itself
* changes no global state, and does not clear anything.
*/
- user->idx = clear_idx;
user->seq = clear_seq;
break;
case SEEK_END:
/* after the last record */
- user->idx = log_next_idx;
- user->seq = log_next_seq;
+ user->seq = prb_next_seq(prb);
break;
default:
ret = -EINVAL;
@@ -1000,9 +867,9 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
poll_wait(file, &log_wait, wait);
logbuf_lock_irq();
- if (user->seq < log_next_seq) {
+ if (prb_read_valid(prb, user->seq, NULL)) {
/* return error when data has vanished underneath us */
- if (user->seq < log_first_seq)
+ if (user->seq < prb_first_valid_seq(prb))
ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
else
ret = EPOLLIN|EPOLLRDNORM;
@@ -1037,9 +904,11 @@ static int devkmsg_open(struct inode *inode, struct file *file)
mutex_init(&user->lock);
+ prb_rec_init_rd(&user->record, &user->info,
+ &user->text_buf[0], sizeof(user->text_buf));
+
logbuf_lock_irq();
- user->idx = log_first_idx;
- user->seq = log_first_seq;
+ user->seq = prb_first_valid_seq(prb);
logbuf_unlock_irq();
file->private_data = user;
@@ -1080,23 +949,58 @@ const struct file_operations kmsg_fops = {
*/
void log_buf_vmcoreinfo_setup(void)
{
- VMCOREINFO_SYMBOL(log_buf);
- VMCOREINFO_SYMBOL(log_buf_len);
- VMCOREINFO_SYMBOL(log_first_idx);
- VMCOREINFO_SYMBOL(clear_idx);
- VMCOREINFO_SYMBOL(log_next_idx);
+ struct dev_printk_info *dev_info = NULL;
+
+ VMCOREINFO_SYMBOL(prb);
+ VMCOREINFO_SYMBOL(printk_rb_static);
+ VMCOREINFO_SYMBOL(clear_seq);
+
/*
- * Export struct printk_log size and field offsets. User space tools can
+ * Export struct size and field offsets. User space tools can
* parse it and detect any changes to structure down the line.
*/
- VMCOREINFO_STRUCT_SIZE(printk_log);
- VMCOREINFO_OFFSET(printk_log, ts_nsec);
- VMCOREINFO_OFFSET(printk_log, len);
- VMCOREINFO_OFFSET(printk_log, text_len);
- VMCOREINFO_OFFSET(printk_log, dict_len);
-#ifdef CONFIG_PRINTK_CALLER
- VMCOREINFO_OFFSET(printk_log, caller_id);
-#endif
+
+ VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
+ VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
+ VMCOREINFO_OFFSET(printk_ringbuffer, fail);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
+ VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
+ VMCOREINFO_OFFSET(prb_desc_ring, descs);
+ VMCOREINFO_OFFSET(prb_desc_ring, infos);
+ VMCOREINFO_OFFSET(prb_desc_ring, head_id);
+ VMCOREINFO_OFFSET(prb_desc_ring, tail_id);
+
+ VMCOREINFO_STRUCT_SIZE(prb_desc);
+ VMCOREINFO_OFFSET(prb_desc, state_var);
+ VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
+ VMCOREINFO_OFFSET(prb_data_blk_lpos, next);
+
+ VMCOREINFO_STRUCT_SIZE(printk_info);
+ VMCOREINFO_OFFSET(printk_info, seq);
+ VMCOREINFO_OFFSET(printk_info, ts_nsec);
+ VMCOREINFO_OFFSET(printk_info, text_len);
+ VMCOREINFO_OFFSET(printk_info, caller_id);
+ VMCOREINFO_OFFSET(printk_info, dev_info);
+
+ VMCOREINFO_STRUCT_SIZE(dev_printk_info);
+ VMCOREINFO_OFFSET(dev_printk_info, subsystem);
+ VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
+ VMCOREINFO_OFFSET(dev_printk_info, device);
+ VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));
+
+ VMCOREINFO_STRUCT_SIZE(prb_data_ring);
+ VMCOREINFO_OFFSET(prb_data_ring, size_bits);
+ VMCOREINFO_OFFSET(prb_data_ring, data);
+ VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
+ VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);
+
+ VMCOREINFO_SIZE(atomic_long_t);
+ VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);
}
#endif
@@ -1174,11 +1078,46 @@ static void __init set_percpu_data_ready(void)
__printk_percpu_data_ready = true;
}
+static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_reserved_entry e;
+ struct printk_record dest_r;
+
+ prb_rec_init_wr(&dest_r, r->info->text_len);
+
+ if (!prb_reserve(&e, rb, &dest_r))
+ return 0;
+
+ memcpy(&dest_r.text_buf[0], &r->text_buf[0], r->info->text_len);
+ dest_r.info->text_len = r->info->text_len;
+ dest_r.info->facility = r->info->facility;
+ dest_r.info->level = r->info->level;
+ dest_r.info->flags = r->info->flags;
+ dest_r.info->ts_nsec = r->info->ts_nsec;
+ dest_r.info->caller_id = r->info->caller_id;
+ memcpy(&dest_r.info->dev_info, &r->info->dev_info, sizeof(dest_r.info->dev_info));
+
+ prb_final_commit(&e);
+
+ return prb_record_text_space(&e);
+}
+
+static char setup_text_buf[LOG_LINE_MAX] __initdata;
+
void __init setup_log_buf(int early)
{
+ struct printk_info *new_infos;
+ unsigned int new_descs_count;
+ struct prb_desc *new_descs;
+ struct printk_info info;
+ struct printk_record r;
+ size_t new_descs_size;
+ size_t new_infos_size;
unsigned long flags;
char *new_log_buf;
unsigned int free;
+ u64 seq;
/*
* Some archs call setup_log_buf() multiple times - first is very
@@ -1197,24 +1136,75 @@ void __init setup_log_buf(int early)
if (!new_log_buf_len)
return;
+ new_descs_count = new_log_buf_len >> PRB_AVGBITS;
+ if (new_descs_count == 0) {
+ pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
+ return;
+ }
+
new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
if (unlikely(!new_log_buf)) {
- pr_err("log_buf_len: %lu bytes not available\n",
- new_log_buf_len);
+ pr_err("log_buf_len: %lu text bytes not available\n",
+ new_log_buf_len);
return;
}
+ new_descs_size = new_descs_count * sizeof(struct prb_desc);
+ new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
+ if (unlikely(!new_descs)) {
+ pr_err("log_buf_len: %zu desc bytes not available\n",
+ new_descs_size);
+ goto err_free_log_buf;
+ }
+
+ new_infos_size = new_descs_count * sizeof(struct printk_info);
+ new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
+ if (unlikely(!new_infos)) {
+ pr_err("log_buf_len: %zu info bytes not available\n",
+ new_infos_size);
+ goto err_free_descs;
+ }
+
+ prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));
+
+ prb_init(&printk_rb_dynamic,
+ new_log_buf, ilog2(new_log_buf_len),
+ new_descs, ilog2(new_descs_count),
+ new_infos);
+
logbuf_lock_irqsave(flags);
+
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
- free = __LOG_BUF_LEN - log_next_idx;
- memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
+
+ free = __LOG_BUF_LEN;
+ prb_for_each_record(0, &printk_rb_static, seq, &r)
+ free -= add_to_rb(&printk_rb_dynamic, &r);
+
+ /*
+ * This is early enough that everything is still running on the
+ * boot CPU and interrupts are disabled. So no new messages will
+ * appear during the transition to the dynamic buffer.
+ */
+ prb = &printk_rb_dynamic;
+
logbuf_unlock_irqrestore(flags);
+ if (seq != prb_next_seq(&printk_rb_static)) {
+ pr_err("dropped %llu messages\n",
+ prb_next_seq(&printk_rb_static) - seq);
+ }
+
pr_info("log_buf_len: %u bytes\n", log_buf_len);
pr_info("early log buf free: %u(%u%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
+ return;
+
+err_free_descs:
+ memblock_free(__pa(new_descs), new_descs_size);
+err_free_log_buf:
+ memblock_free(__pa(new_log_buf), new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
@@ -1321,18 +1311,18 @@ static size_t print_caller(u32 id, char *buf)
#define print_caller(id, buf) 0
#endif
-static size_t print_prefix(const struct printk_log *msg, bool syslog,
- bool time, char *buf)
+static size_t info_print_prefix(const struct printk_info *info, bool syslog,
+ bool time, char *buf)
{
size_t len = 0;
if (syslog)
- len = print_syslog((msg->facility << 3) | msg->level, buf);
+ len = print_syslog((info->facility << 3) | info->level, buf);
if (time)
- len += print_time(msg->ts_nsec, buf + len);
+ len += print_time(info->ts_nsec, buf + len);
- len += print_caller(msg->caller_id, buf + len);
+ len += print_caller(info->caller_id, buf + len);
if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
buf[len++] = ' ';
@@ -1342,72 +1332,150 @@ static size_t print_prefix(const struct printk_log *msg, bool syslog,
return len;
}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size)
+/*
+ * Prepare the record for printing. The text is shifted within the given
+ * buffer to avoid a need for another one. The following operations are
+ * done:
+ *
+ * - Add prefix for each line.
+ * - Add the trailing newline that has been removed in vprintk_store().
+ * - Drop truncated lines that do not longer fit into the buffer.
+ *
+ * Return: The length of the updated/prepared text, including the added
+ * prefixes and the newline. The dropped line(s) are not counted.
+ */
+static size_t record_print_text(struct printk_record *r, bool syslog,
+ bool time)
{
- const char *text = log_text(msg);
- size_t text_size = msg->text_len;
- size_t len = 0;
+ size_t text_len = r->info->text_len;
+ size_t buf_size = r->text_buf_size;
+ char *text = r->text_buf;
char prefix[PREFIX_MAX];
- const size_t prefix_len = print_prefix(msg, syslog, time, prefix);
+ bool truncated = false;
+ size_t prefix_len;
+ size_t line_len;
+ size_t len = 0;
+ char *next;
+
+ /*
+ * If the message was truncated because the buffer was not large
+ * enough, treat the available text as if it were the full text.
+ */
+ if (text_len > buf_size)
+ text_len = buf_size;
- do {
- const char *next = memchr(text, '\n', text_size);
- size_t text_len;
+ prefix_len = info_print_prefix(r->info, syslog, time, prefix);
+ /*
+ * @text_len: bytes of unprocessed text
+ * @line_len: bytes of current line _without_ newline
+ * @text: pointer to beginning of current line
+ * @len: number of bytes prepared in r->text_buf
+ */
+ for (;;) {
+ next = memchr(text, '\n', text_len);
if (next) {
- text_len = next - text;
- next++;
- text_size -= next - text;
+ line_len = next - text;
} else {
- text_len = text_size;
+ /* Drop truncated line(s). */
+ if (truncated)
+ break;
+ line_len = text_len;
}
- if (buf) {
- if (prefix_len + text_len + 1 >= size - len)
+ /*
+ * Truncate the text if there is not enough space to add the
+ * prefix and a trailing newline.
+ */
+ if (len + prefix_len + text_len + 1 > buf_size) {
+ /* Drop even the current line if no space. */
+ if (len + prefix_len + line_len + 1 > buf_size)
break;
- memcpy(buf + len, prefix, prefix_len);
- len += prefix_len;
- memcpy(buf + len, text, text_len);
- len += text_len;
- buf[len++] = '\n';
- } else {
- /* SYSLOG_ACTION_* buffer size only calculation */
- len += prefix_len + text_len + 1;
+ text_len = buf_size - len - prefix_len - 1;
+ truncated = true;
}
- text = next;
- } while (text);
+ memmove(text + prefix_len, text, text_len);
+ memcpy(text, prefix, prefix_len);
+
+ len += prefix_len + line_len + 1;
+
+ if (text_len == line_len) {
+ /*
+ * Add the trailing newline removed in
+ * vprintk_store().
+ */
+ text[prefix_len + line_len] = '\n';
+ break;
+ }
+
+ /*
+ * Advance beyond the added prefix and the related line with
+ * its newline.
+ */
+ text += prefix_len + line_len + 1;
+
+ /*
+ * The remaining text has only decreased by the line with its
+ * newline.
+ *
+ * Note that @text_len can become zero. It happens when @text
+ * ended with a newline (either due to truncation or the
+ * original string ending with "\n\n"). The loop is correctly
+ * repeated and (if not truncated) an empty line with a prefix
+ * will be prepared.
+ */
+ text_len -= line_len + 1;
+ }
return len;
}
+static size_t get_record_print_text_size(struct printk_info *info,
+ unsigned int line_count,
+ bool syslog, bool time)
+{
+ char prefix[PREFIX_MAX];
+ size_t prefix_len;
+
+ prefix_len = info_print_prefix(info, syslog, time, prefix);
+
+ /*
+ * Each line will be preceded with a prefix. The intermediate
+ * newlines are already within the text, but a final trailing
+ * newline will be added.
+ */
+ return ((prefix_len * line_count) + info->text_len + 1);
+}
+
static int syslog_print(char __user *buf, int size)
{
+ struct printk_info info;
+ struct printk_record r;
char *text;
- struct printk_log *msg;
int len = 0;
text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
if (!text)
return -ENOMEM;
+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
+
while (size > 0) {
size_t n;
size_t skip;
logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
- /* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
- syslog_partial = 0;
- }
- if (syslog_seq == log_next_seq) {
+ if (!prb_read_valid(prb, syslog_seq, &r)) {
logbuf_unlock_irq();
break;
}
+ if (r.info->seq != syslog_seq) {
+ /* message is gone, move to next valid one */
+ syslog_seq = r.info->seq;
+ syslog_partial = 0;
+ }
/*
* To keep reading/counting partial line consistent,
@@ -1417,13 +1485,10 @@ static int syslog_print(char __user *buf, int size)
syslog_time = printk_time;
skip = syslog_partial;
- msg = log_from_idx(syslog_idx);
- n = msg_print_text(msg, true, syslog_time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ n = record_print_text(&r, true, syslog_time);
if (n - syslog_partial <= size) {
/* message fits into buffer, move forward */
- syslog_idx = log_next(syslog_idx);
- syslog_seq++;
+ syslog_seq = r.info->seq + 1;
n -= syslog_partial;
syslog_partial = 0;
} else if (!len){
@@ -1454,11 +1519,12 @@ static int syslog_print(char __user *buf, int size)
static int syslog_print_all(char __user *buf, int size, bool clear)
{
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
char *text;
int len = 0;
- u64 next_seq;
u64 seq;
- u32 idx;
bool time;
text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL);
@@ -1471,38 +1537,28 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- seq = clear_seq;
- idx = clear_idx;
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
- }
+ prb_for_each_info(clear_seq, prb, seq, &info, &line_count)
+ len += get_record_print_text_size(&info, line_count, true, time);
/* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
- while (len > size && seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- len -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ prb_for_each_info(clear_seq, prb, seq, &info, &line_count) {
+ if (len <= size)
+ break;
+ len -= get_record_print_text_size(&info, line_count, true, time);
}
- /* last message fitting into this dump */
- next_seq = log_next_seq;
+ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX);
len = 0;
- while (len >= 0 && seq < next_seq) {
- struct printk_log *msg = log_from_idx(idx);
- int textlen = msg_print_text(msg, true, time, text,
- LOG_LINE_MAX + PREFIX_MAX);
+ prb_for_each_record(seq, prb, seq, &r) {
+ int textlen;
- idx = log_next(idx);
- seq++;
+ textlen = record_print_text(&r, true, time);
+
+ if (len + textlen > size) {
+ seq--;
+ break;
+ }
logbuf_unlock_irq();
if (copy_to_user(buf + len, text, textlen))
@@ -1511,17 +1567,12 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
len += textlen;
logbuf_lock_irq();
- if (seq < log_first_seq) {
- /* messages are gone, move to next one */
- seq = log_first_seq;
- idx = log_first_idx;
- }
+ if (len < 0)
+ break;
}
- if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
- }
+ if (clear)
+ clear_seq = seq;
logbuf_unlock_irq();
kfree(text);
@@ -1531,8 +1582,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
static void syslog_clear(void)
{
logbuf_lock_irq();
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
+ clear_seq = prb_next_seq(prb);
logbuf_unlock_irq();
}
@@ -1559,7 +1609,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
if (!access_ok(buf, len))
return -EFAULT;
error = wait_event_interruptible(log_wait,
- syslog_seq != log_next_seq);
+ prb_read_valid(prb, syslog_seq, NULL));
if (error)
return error;
error = syslog_print(buf, len);
@@ -1567,7 +1617,7 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* Read/clear last kernel messages */
case SYSLOG_ACTION_READ_CLEAR:
clear = true;
- /* FALL THRU */
+ fallthrough;
/* Read last kernel messages */
case SYSLOG_ACTION_READ_ALL:
if (!buf || len < 0)
@@ -1608,10 +1658,9 @@ int do_syslog(int type, char __user *buf, int len, int source)
/* Number of chars in the log buffer */
case SYSLOG_ACTION_SIZE_UNREAD:
logbuf_lock_irq();
- if (syslog_seq < log_first_seq) {
+ if (syslog_seq < prb_first_valid_seq(prb)) {
/* messages are gone, move to first one */
- syslog_seq = log_first_seq;
- syslog_idx = log_first_idx;
+ syslog_seq = prb_first_valid_seq(prb);
syslog_partial = 0;
}
if (source == SYSLOG_FROM_PROC) {
@@ -1620,20 +1669,18 @@ int do_syslog(int type, char __user *buf, int len, int source)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_seq - syslog_seq;
+ error = prb_next_seq(prb) - syslog_seq;
} else {
- u64 seq = syslog_seq;
- u32 idx = syslog_idx;
bool time = syslog_partial ? syslog_time : printk_time;
-
- while (seq < log_next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- error += msg_print_text(msg, true, time, NULL,
- 0);
+ struct printk_info info;
+ unsigned int line_count;
+ u64 seq;
+
+ prb_for_each_info(syslog_seq, prb, seq, &info,
+ &line_count) {
+ error += get_record_print_text_size(&info, line_count,
+ true, time);
time = printk_time;
- idx = log_next(idx);
- seq++;
}
error -= syslog_partial;
}
@@ -1804,10 +1851,22 @@ static int console_trylock_spinning(void)
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len)
{
+ static char dropped_text[64];
+ size_t dropped_len = 0;
struct console *con;
trace_console_rcuidle(text, len);
+ if (!console_drivers)
+ return;
+
+ if (console_dropped) {
+ dropped_len = snprintf(dropped_text, sizeof(dropped_text),
+ "** %lu printk messages dropped **\n",
+ console_dropped);
+ console_dropped = 0;
+ }
+
for_each_console(con) {
if (exclusive_console && con != exclusive_console)
continue;
@@ -1820,8 +1879,11 @@ static void call_console_drivers(const char *ext_text, size_t ext_len,
continue;
if (con->flags & CON_EXTENDED)
con->write(con, ext_text, ext_len);
- else
+ else {
+ if (dropped_len)
+ con->write(con, dropped_text, dropped_len);
con->write(con, text, len);
+ }
}
}
@@ -1845,97 +1907,38 @@ static inline u32 printk_caller_id(void)
0x80000000 + raw_smp_processor_id();
}
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
-static struct cont {
- char buf[LOG_LINE_MAX];
- size_t len; /* length == 0 means unused buffer */
- u32 caller_id; /* printk_caller_id() of first print */
- u64 ts_nsec; /* time of first print */
- u8 level; /* log level of first message */
- u8 facility; /* log facility of first message */
- enum log_flags flags; /* prefix, newline flags */
-} cont;
-
-static void cont_flush(void)
-{
- if (cont.len == 0)
- return;
-
- log_store(cont.caller_id, cont.facility, cont.level, cont.flags,
- cont.ts_nsec, NULL, 0, cont.buf, cont.len);
- cont.len = 0;
-}
-
-static bool cont_add(u32 caller_id, int facility, int level,
- enum log_flags flags, const char *text, size_t len)
-{
- /* If the line gets too long, split it up in separate records. */
- if (cont.len + len > sizeof(cont.buf)) {
- cont_flush();
- return false;
- }
-
- if (!cont.len) {
- cont.facility = facility;
- cont.level = level;
- cont.caller_id = caller_id;
- cont.ts_nsec = local_clock();
- cont.flags = flags;
- }
-
- memcpy(cont.buf + cont.len, text, len);
- cont.len += len;
-
- // The original flags come from the first line,
- // but later continuations can add a newline.
- if (flags & LOG_NEWLINE) {
- cont.flags |= LOG_NEWLINE;
- cont_flush();
- }
-
- return true;
-}
-
-static size_t log_output(int facility, int level, enum log_flags lflags, const char *dict, size_t dictlen, char *text, size_t text_len)
+static size_t log_output(int facility, int level, enum log_flags lflags,
+ const struct dev_printk_info *dev_info,
+ char *text, size_t text_len)
{
const u32 caller_id = printk_caller_id();
- /*
- * If an earlier line was buffered, and we're a continuation
- * write from the same context, try to add it to the buffer.
- */
- if (cont.len) {
- if (cont.caller_id == caller_id && (lflags & LOG_CONT)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
- return text_len;
- }
- /* Otherwise, make sure it's flushed */
- cont_flush();
- }
-
- /* Skip empty continuation lines that couldn't be added - they just flush */
- if (!text_len && (lflags & LOG_CONT))
- return 0;
-
- /* If it doesn't end in a newline, try to buffer the current line */
- if (!(lflags & LOG_NEWLINE)) {
- if (cont_add(caller_id, facility, level, lflags, text, text_len))
+ if (lflags & LOG_CONT) {
+ struct prb_reserved_entry e;
+ struct printk_record r;
+
+ prb_rec_init_wr(&r, text_len);
+ if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) {
+ memcpy(&r.text_buf[r.info->text_len], text, text_len);
+ r.info->text_len += text_len;
+ if (lflags & LOG_NEWLINE) {
+ r.info->flags |= LOG_NEWLINE;
+ prb_final_commit(&e);
+ } else {
+ prb_commit(&e);
+ }
return text_len;
+ }
}
/* Store it in the record log */
return log_store(caller_id, facility, level, lflags, 0,
- dict, dictlen, text, text_len);
+ dev_info, text, text_len);
}
/* Must be called under logbuf_lock. */
int vprintk_store(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
static char textbuf[LOG_LINE_MAX];
@@ -1977,21 +1980,19 @@ int vprintk_store(int facility, int level,
if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
- if (dict)
+ if (dev_info)
lflags |= LOG_NEWLINE;
- return log_output(facility, level, lflags,
- dict, dictlen, text, text_len);
+ return log_output(facility, level, lflags, dev_info, text, text_len);
}
asmlinkage int vprintk_emit(int facility, int level,
- const char *dict, size_t dictlen,
+ const struct dev_printk_info *dev_info,
const char *fmt, va_list args)
{
int printed_len;
- bool in_sched = false, pending_output;
+ bool in_sched = false;
unsigned long flags;
- u64 curr_log_seq;
/* Suppress unimportant messages after panic happens */
if (unlikely(suppress_printk))
@@ -2007,13 +2008,11 @@ asmlinkage int vprintk_emit(int facility, int level,
/* This stops the holder of console_sem just where we want him */
logbuf_lock_irqsave(flags);
- curr_log_seq = log_next_seq;
- printed_len = vprintk_store(facility, level, dict, dictlen, fmt, args);
- pending_output = (curr_log_seq != log_next_seq);
+ printed_len = vprintk_store(facility, level, dev_info, fmt, args);
logbuf_unlock_irqrestore(flags);
/* If called from the scheduler, we can not call up(). */
- if (!in_sched && pending_output) {
+ if (!in_sched) {
/*
* Disable preemption to avoid being preempted while holding
* console_sem which would prevent anyone from printing to
@@ -2030,8 +2029,7 @@ asmlinkage int vprintk_emit(int facility, int level,
preempt_enable();
}
- if (pending_output)
- wake_up_klogd();
+ wake_up_klogd();
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -2044,7 +2042,7 @@ EXPORT_SYMBOL(vprintk);
int vprintk_default(const char *fmt, va_list args)
{
- return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+ return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);
@@ -2088,30 +2086,31 @@ EXPORT_SYMBOL(printk);
#define PREFIX_MAX 0
#define printk_time false
+#define prb_read_valid(rb, seq, r) false
+#define prb_first_valid_seq(rb) 0
+
static u64 syslog_seq;
-static u32 syslog_idx;
static u64 console_seq;
-static u32 console_idx;
static u64 exclusive_console_stop_seq;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
-static char *log_text(const struct printk_log *msg) { return NULL; }
-static char *log_dict(const struct printk_log *msg) { return NULL; }
-static struct printk_log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
-static ssize_t msg_print_ext_header(char *buf, size_t size,
- struct printk_log *msg,
- u64 seq) { return 0; }
+static unsigned long console_dropped;
+
+static size_t record_print_text(const struct printk_record *r,
+ bool syslog, bool time)
+{
+ return 0;
+}
+static ssize_t info_print_ext_header(char *buf, size_t size,
+ struct printk_info *info)
+{
+ return 0;
+}
static ssize_t msg_print_ext_body(char *buf, size_t size,
- char *dict, size_t dict_len,
- char *text, size_t text_len) { return 0; }
+ char *text, size_t text_len,
+ struct dev_printk_info *dev_info) { return 0; }
static void console_lock_spinning_enable(void) { }
static int console_lock_spinning_disable_and_check(void) { return 0; }
static void call_console_drivers(const char *ext_text, size_t ext_len,
const char *text, size_t len) {}
-static size_t msg_print_text(const struct printk_log *msg, bool syslog,
- bool time, char *buf, size_t size) { return 0; }
static bool suppress_message_printing(int level) { return false; }
#endif /* CONFIG_PRINTK */
@@ -2398,12 +2397,16 @@ void console_unlock(void)
static char text[LOG_LINE_MAX + PREFIX_MAX];
unsigned long flags;
bool do_cond_resched, retry;
+ struct printk_info info;
+ struct printk_record r;
if (console_suspended) {
up_console_sem();
return;
}
+ prb_rec_init_rd(&r, &info, text, sizeof(text));
+
/*
* Console drivers are called with interrupts disabled, so
* @console_may_schedule should be cleared before; however, we may
@@ -2416,7 +2419,7 @@ void console_unlock(void)
*
* console_trylock() is not able to detect the preemptive
* context reliably. Therefore the value must be stored before
- * and cleared after the the "again" goto label.
+ * and cleared after the "again" goto label.
*/
do_cond_resched = console_may_schedule;
again:
@@ -2434,35 +2437,26 @@ again:
}
for (;;) {
- struct printk_log *msg;
size_t ext_len = 0;
size_t len;
printk_safe_enter_irqsave(flags);
raw_spin_lock(&logbuf_lock);
- if (console_seq < log_first_seq) {
- len = snprintf(text, sizeof(text),
- "** %llu printk messages dropped **\n",
- log_first_seq - console_seq);
-
- /* messages are gone, move to first one */
- console_seq = log_first_seq;
- console_idx = log_first_idx;
- } else {
- len = 0;
- }
skip:
- if (console_seq == log_next_seq)
+ if (!prb_read_valid(prb, console_seq, &r))
break;
- msg = log_from_idx(console_idx);
- if (suppress_message_printing(msg->level)) {
+ if (console_seq != r.info->seq) {
+ console_dropped += r.info->seq - console_seq;
+ console_seq = r.info->seq;
+ }
+
+ if (suppress_message_printing(r.info->level)) {
/*
* Skip record we have buffered and already printed
* directly to the console when we received it, and
* record that has level above the console loglevel.
*/
- console_idx = log_next(console_idx);
console_seq++;
goto skip;
}
@@ -2473,19 +2467,23 @@ skip:
exclusive_console = NULL;
}
- len += msg_print_text(msg,
- console_msg_format & MSG_FORMAT_SYSLOG,
- printk_time, text + len, sizeof(text) - len);
+ /*
+ * Handle extended console text first because later
+ * record_print_text() will modify the record buffer in-place.
+ */
if (nr_ext_console_drivers) {
- ext_len = msg_print_ext_header(ext_text,
+ ext_len = info_print_ext_header(ext_text,
sizeof(ext_text),
- msg, console_seq);
+ r.info);
ext_len += msg_print_ext_body(ext_text + ext_len,
sizeof(ext_text) - ext_len,
- log_dict(msg), msg->dict_len,
- log_text(msg), msg->text_len);
+ &r.text_buf[0],
+ r.info->text_len,
+ &r.info->dev_info);
}
- console_idx = log_next(console_idx);
+ len = record_print_text(&r,
+ console_msg_format & MSG_FORMAT_SYSLOG,
+ printk_time);
console_seq++;
raw_spin_unlock(&logbuf_lock);
@@ -2525,7 +2523,7 @@ skip:
* flush, no worries.
*/
raw_spin_lock(&logbuf_lock);
- retry = console_seq != log_next_seq;
+ retry = prb_read_valid(prb, console_seq, NULL);
raw_spin_unlock(&logbuf_lock);
printk_safe_exit_irqrestore(flags);
@@ -2594,8 +2592,7 @@ void console_flush_on_panic(enum con_flush_mode mode)
unsigned long flags;
logbuf_lock_irqsave(flags);
- console_seq = log_first_seq;
- console_idx = log_first_idx;
+ console_seq = prb_first_valid_seq(prb);
logbuf_unlock_irqrestore(flags);
}
console_unlock();
@@ -2838,7 +2835,6 @@ void register_console(struct console *newcon)
exclusive_console = newcon;
exclusive_console_stop_seq = console_seq;
console_seq = syslog_seq;
- console_idx = syslog_idx;
logbuf_unlock_irqrestore(flags);
}
console_unlock();
@@ -3062,7 +3058,7 @@ int vprintk_deferred(const char *fmt, va_list args)
{
int r;
- r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
+ r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
defer_console_output();
return r;
@@ -3227,9 +3223,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
logbuf_lock_irqsave(flags);
dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ dumper->next_seq = prb_next_seq(prb);
logbuf_unlock_irqrestore(flags);
/* invoke dumper which will iterate over records */
@@ -3263,28 +3257,33 @@ void kmsg_dump(enum kmsg_dump_reason reason)
bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
char *line, size_t size, size_t *len)
{
- struct printk_log *msg;
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
size_t l = 0;
bool ret = false;
+ prb_rec_init_rd(&r, &info, line, size);
+
if (!dumper->active)
goto out;
- if (dumper->cur_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
- }
-
- /* last entry */
- if (dumper->cur_seq >= log_next_seq)
- goto out;
+ /* Read text or count text lines? */
+ if (line) {
+ if (!prb_read_valid(prb, dumper->cur_seq, &r))
+ goto out;
+ l = record_print_text(&r, syslog, printk_time);
+ } else {
+ if (!prb_read_valid_info(prb, dumper->cur_seq,
+ &info, &line_count)) {
+ goto out;
+ }
+ l = get_record_print_text_size(&info, line_count, syslog,
+ printk_time);
- msg = log_from_idx(dumper->cur_idx);
- l = msg_print_text(msg, syslog, printk_time, line, size);
+ }
- dumper->cur_idx = log_next(dumper->cur_idx);
- dumper->cur_seq++;
+ dumper->cur_seq = r.info->seq + 1;
ret = true;
out:
if (len)
@@ -3332,7 +3331,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
* @len: length of line placed into buffer
*
* Start at the end of the kmsg buffer and fill the provided buffer
- * with as many of the the *youngest* kmsg records that fit into it.
+ * with as many of the *youngest* kmsg records that fit into it.
* If the buffer is large enough, all available kmsg records will be
* copied with a single call.
*
@@ -3345,23 +3344,25 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
char *buf, size_t size, size_t *len)
{
+ struct printk_info info;
+ unsigned int line_count;
+ struct printk_record r;
unsigned long flags;
u64 seq;
- u32 idx;
u64 next_seq;
- u32 next_idx;
size_t l = 0;
bool ret = false;
bool time = printk_time;
- if (!dumper->active)
+ prb_rec_init_rd(&r, &info, buf, size);
+
+ if (!dumper->active || !buf || !size)
goto out;
logbuf_lock_irqsave(flags);
- if (dumper->cur_seq < log_first_seq) {
+ if (dumper->cur_seq < prb_first_valid_seq(prb)) {
/* messages are gone, move to first available one */
- dumper->cur_seq = log_first_seq;
- dumper->cur_idx = log_first_idx;
+ dumper->cur_seq = prb_first_valid_seq(prb);
}
/* last entry */
@@ -3372,41 +3373,41 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
/* calculate length of entire buffer */
seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- l += msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ while (prb_read_valid_info(prb, seq, &info, &line_count)) {
+ if (r.info->seq >= dumper->next_seq)
+ break;
+ l += get_record_print_text_size(&info, line_count, true, time);
+ seq = r.info->seq + 1;
}
/* move first record forward until length fits into the buffer */
seq = dumper->cur_seq;
- idx = dumper->cur_idx;
- while (l >= size && seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
-
- l -= msg_print_text(msg, true, time, NULL, 0);
- idx = log_next(idx);
- seq++;
+ while (l >= size && prb_read_valid_info(prb, seq,
+ &info, &line_count)) {
+ if (r.info->seq >= dumper->next_seq)
+ break;
+ l -= get_record_print_text_size(&info, line_count, true, time);
+ seq = r.info->seq + 1;
}
/* last message in next interation */
next_seq = seq;
- next_idx = idx;
+ /* actually read text into the buffer now */
l = 0;
- while (seq < dumper->next_seq) {
- struct printk_log *msg = log_from_idx(idx);
+ while (prb_read_valid(prb, seq, &r)) {
+ if (r.info->seq >= dumper->next_seq)
+ break;
+
+ l += record_print_text(&r, syslog, time);
+
+ /* adjust record to store to remaining buffer space */
+ prb_rec_init_rd(&r, &info, buf + l, size - l);
- l += msg_print_text(msg, syslog, time, buf + l, size - l);
- idx = log_next(idx);
- seq++;
+ seq = r.info->seq + 1;
}
dumper->next_seq = next_seq;
- dumper->next_idx = next_idx;
ret = true;
logbuf_unlock_irqrestore(flags);
out:
@@ -3429,9 +3430,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
{
dumper->cur_seq = clear_seq;
- dumper->cur_idx = clear_idx;
- dumper->next_seq = log_next_seq;
- dumper->next_idx = log_next_idx;
+ dumper->next_seq = prb_next_seq(prb);
}
/**
diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c
new file mode 100644
index 000000000000..2493348a1631
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.c
@@ -0,0 +1,2083 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/kernel.h>
+#include <linux/irqflags.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/bug.h>
+#include "printk_ringbuffer.h"
+
+/**
+ * DOC: printk_ringbuffer overview
+ *
+ * Data Structure
+ * --------------
+ * The printk_ringbuffer is made up of 3 internal ringbuffers:
+ *
+ * desc_ring
+ * A ring of descriptors and their meta data (such as sequence number,
+ * timestamp, loglevel, etc.) as well as internal state information about
+ * the record and logical positions specifying where in the other
+ * ringbuffer the text strings are located.
+ *
+ * text_data_ring
+ * A ring of data blocks. A data block consists of an unsigned long
+ * integer (ID) that maps to a desc_ring index followed by the text
+ * string of the record.
+ *
+ * The internal state information of a descriptor is the key element to allow
+ * readers and writers to locklessly synchronize access to the data.
+ *
+ * Implementation
+ * --------------
+ *
+ * Descriptor Ring
+ * ~~~~~~~~~~~~~~~
+ * The descriptor ring is an array of descriptors. A descriptor contains
+ * essential meta data to track the data of a printk record using
+ * blk_lpos structs pointing to associated text data blocks (see
+ * "Data Rings" below). Each descriptor is assigned an ID that maps
+ * directly to index values of the descriptor array and has a state. The ID
+ * and the state are bitwise combined into a single descriptor field named
+ * @state_var, allowing ID and state to be synchronously and atomically
+ * updated.
+ *
+ * Descriptors have four states:
+ *
+ * reserved
+ * A writer is modifying the record.
+ *
+ * committed
+ * The record and all its data are written. A writer can reopen the
+ * descriptor (transitioning it back to reserved), but in the committed
+ * state the data is consistent.
+ *
+ * finalized
+ * The record and all its data are complete and available for reading. A
+ * writer cannot reopen the descriptor.
+ *
+ * reusable
+ * The record exists, but its text and/or meta data may no longer be
+ * available.
+ *
+ * Querying the @state_var of a record requires providing the ID of the
+ * descriptor to query. This can yield a possible fifth (pseudo) state:
+ *
+ * miss
+ * The descriptor being queried has an unexpected ID.
+ *
+ * The descriptor ring has a @tail_id that contains the ID of the oldest
+ * descriptor and @head_id that contains the ID of the newest descriptor.
+ *
+ * When a new descriptor should be created (and the ring is full), the tail
+ * descriptor is invalidated by first transitioning to the reusable state and
+ * then invalidating all tail data blocks up to and including the data blocks
+ * associated with the tail descriptor (for the text ring). Then
+ * @tail_id is advanced, followed by advancing @head_id. And finally the
+ * @state_var of the new descriptor is initialized to the new ID and reserved
+ * state.
+ *
+ * The @tail_id can only be advanced if the new @tail_id would be in the
+ * committed or reusable queried state. This makes it possible that a valid
+ * sequence number of the tail is always available.
+ *
+ * Descriptor Finalization
+ * ~~~~~~~~~~~~~~~~~~~~~~~
+ * When a writer calls the commit function prb_commit(), record data is
+ * fully stored and is consistent within the ringbuffer. However, a writer can
+ * reopen that record, claiming exclusive access (as with prb_reserve()), and
+ * modify that record. When finished, the writer must again commit the record.
+ *
+ * In order for a record to be made available to readers (and also become
+ * recyclable for writers), it must be finalized. A finalized record cannot be
+ * reopened and can never become "unfinalized". Record finalization can occur
+ * in three different scenarios:
+ *
+ * 1) A writer can simultaneously commit and finalize its record by calling
+ * prb_final_commit() instead of prb_commit().
+ *
+ * 2) When a new record is reserved and the previous record has been
+ * committed via prb_commit(), that previous record is automatically
+ * finalized.
+ *
+ * 3) When a record is committed via prb_commit() and a newer record
+ * already exists, the record being committed is automatically finalized.
+ *
+ * Data Ring
+ * ~~~~~~~~~
+ * The text data ring is a byte array composed of data blocks. Data blocks are
+ * referenced by blk_lpos structs that point to the logical position of the
+ * beginning of a data block and the beginning of the next adjacent data
+ * block. Logical positions are mapped directly to index values of the byte
+ * array ringbuffer.
+ *
+ * Each data block consists of an ID followed by the writer data. The ID is
+ * the identifier of a descriptor that is associated with the data block. A
+ * given data block is considered valid if all of the following conditions
+ * are met:
+ *
+ * 1) The descriptor associated with the data block is in the committed
+ * or finalized queried state.
+ *
+ * 2) The blk_lpos struct within the descriptor associated with the data
+ * block references back to the same data block.
+ *
+ * 3) The data block is within the head/tail logical position range.
+ *
+ * If the writer data of a data block would extend beyond the end of the
+ * byte array, only the ID of the data block is stored at the logical
+ * position and the full data block (ID and writer data) is stored at the
+ * beginning of the byte array. The referencing blk_lpos will point to the
+ * ID before the wrap and the next data block will be at the logical
+ * position adjacent the full data block after the wrap.
+ *
+ * Data rings have a @tail_lpos that points to the beginning of the oldest
+ * data block and a @head_lpos that points to the logical position of the
+ * next (not yet existing) data block.
+ *
+ * When a new data block should be created (and the ring is full), tail data
+ * blocks will first be invalidated by putting their associated descriptors
+ * into the reusable state and then pushing the @tail_lpos forward beyond
+ * them. Then the @head_lpos is pushed forward and is associated with a new
+ * descriptor. If a data block is not valid, the @tail_lpos cannot be
+ * advanced beyond it.
+ *
+ * Info Array
+ * ~~~~~~~~~~
+ * The general meta data of printk records are stored in printk_info structs,
+ * stored in an array with the same number of elements as the descriptor ring.
+ * Each info corresponds to the descriptor of the same index in the
+ * descriptor ring. Info validity is confirmed by evaluating the corresponding
+ * descriptor before and after loading the info.
+ *
+ * Usage
+ * -----
+ * Here are some simple examples demonstrating writers and readers. For the
+ * examples a global ringbuffer (test_rb) is available (which is not the
+ * actual ringbuffer used by printk)::
+ *
+ * DEFINE_PRINTKRB(test_rb, 15, 5);
+ *
+ * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
+ * 1 MiB (2 ^ (15 + 5)) for text data.
+ *
+ * Sample writer code::
+ *
+ * const char *textstr = "message text";
+ * struct prb_reserved_entry e;
+ * struct printk_record r;
+ *
+ * // specify how much to allocate
+ * prb_rec_init_wr(&r, strlen(textstr) + 1);
+ *
+ * if (prb_reserve(&e, &test_rb, &r)) {
+ * snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
+ *
+ * r.info->text_len = strlen(textstr);
+ * r.info->ts_nsec = local_clock();
+ * r.info->caller_id = printk_caller_id();
+ *
+ * // commit and finalize the record
+ * prb_final_commit(&e);
+ * }
+ *
+ * Note that additional writer functions are available to extend a record
+ * after it has been committed but not yet finalized. This can be done as
+ * long as no new records have been reserved and the caller is the same.
+ *
+ * Sample writer code (record extending)::
+ *
+ * // alternate rest of previous example
+ *
+ * r.info->text_len = strlen(textstr);
+ * r.info->ts_nsec = local_clock();
+ * r.info->caller_id = printk_caller_id();
+ *
+ * // commit the record (but do not finalize yet)
+ * prb_commit(&e);
+ * }
+ *
+ * ...
+ *
+ * // specify additional 5 bytes text space to extend
+ * prb_rec_init_wr(&r, 5);
+ *
+ * // try to extend, but only if it does not exceed 32 bytes
+ * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id()), 32) {
+ * snprintf(&r.text_buf[r.info->text_len],
+ * r.text_buf_size - r.info->text_len, "hello");
+ *
+ * r.info->text_len += 5;
+ *
+ * // commit and finalize the record
+ * prb_final_commit(&e);
+ * }
+ *
+ * Sample reader code::
+ *
+ * struct printk_info info;
+ * struct printk_record r;
+ * char text_buf[32];
+ * u64 seq;
+ *
+ * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
+ *
+ * prb_for_each_record(0, &test_rb, &seq, &r) {
+ * if (info.seq != seq)
+ * pr_warn("lost %llu records\n", info.seq - seq);
+ *
+ * if (info.text_len > r.text_buf_size) {
+ * pr_warn("record %llu text truncated\n", info.seq);
+ * text_buf[r.text_buf_size - 1] = 0;
+ * }
+ *
+ * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
+ * &text_buf[0]);
+ * }
+ *
+ * Note that additional less convenient reader functions are available to
+ * allow complex record access.
+ *
+ * ABA Issues
+ * ~~~~~~~~~~
+ * To help avoid ABA issues, descriptors are referenced by IDs (array index
+ * values combined with tagged bits counting array wraps) and data blocks are
+ * referenced by logical positions (array index values combined with tagged
+ * bits counting array wraps). However, on 32-bit systems the number of
+ * tagged bits is relatively small such that an ABA incident is (at least
+ * theoretically) possible. For example, if 4 million maximally sized (1KiB)
+ * printk messages were to occur in NMI context on a 32-bit system, the
+ * interrupted context would not be able to recognize that the 32-bit integer
+ * completely wrapped and thus represents a different data block than the one
+ * the interrupted context expects.
+ *
+ * To help combat this possibility, additional state checking is performed
+ * (such as using cmpxchg() even though set() would suffice). These extra
+ * checks are commented as such and will hopefully catch any ABA issue that
+ * a 32-bit system might experience.
+ *
+ * Memory Barriers
+ * ~~~~~~~~~~~~~~~
+ * Multiple memory barriers are used. To simplify proving correctness and
+ * generating litmus tests, lines of code related to memory barriers
+ * (loads, stores, and the associated memory barriers) are labeled::
+ *
+ * LMM(function:letter)
+ *
+ * Comments reference the labels using only the "function:letter" part.
+ *
+ * The memory barrier pairs and their ordering are:
+ *
+ * desc_reserve:D / desc_reserve:B
+ * push descriptor tail (id), then push descriptor head (id)
+ *
+ * desc_reserve:D / data_push_tail:B
+ * push data tail (lpos), then set new descriptor reserved (state)
+ *
+ * desc_reserve:D / desc_push_tail:C
+ * push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ * desc_reserve:D / prb_first_seq:C
+ * push descriptor tail (id), then set new descriptor reserved (state)
+ *
+ * desc_reserve:F / desc_read:D
+ * set new descriptor id and reserved (state), then allow writer changes
+ *
+ * data_alloc:A (or data_realloc:A) / desc_read:D
+ * set old descriptor reusable (state), then modify new data block area
+ *
+ * data_alloc:A (or data_realloc:A) / data_push_tail:B
+ * push data tail (lpos), then modify new data block area
+ *
+ * _prb_commit:B / desc_read:B
+ * store writer changes, then set new descriptor committed (state)
+ *
+ * desc_reopen_last:A / _prb_commit:B
+ * set descriptor reserved (state), then read descriptor data
+ *
+ * _prb_commit:B / desc_reserve:D
+ * set new descriptor committed (state), then check descriptor head (id)
+ *
+ * data_push_tail:D / data_push_tail:A
+ * set descriptor reusable (state), then push data tail (lpos)
+ *
+ * desc_push_tail:B / desc_reserve:D
+ * set descriptor reusable (state), then push descriptor tail (id)
+ */
+
+#define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits)
+#define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1)
+
+#define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits)
+#define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1)
+
+/* Determine the data array index from a logical position. */
+#define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring))
+
+/* Determine the desc array index from an ID or sequence number. */
+#define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring))
+
+/* Determine how many times the data array has wrapped. */
+#define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits)
+
+/* Determine if a logical position refers to a data-less block. */
+#define LPOS_DATALESS(lpos) ((lpos) & 1UL)
+#define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \
+ LPOS_DATALESS((blk)->next))
+
+/* Get the logical position at index 0 of the current wrap. */
+#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
+((lpos) & ~DATA_SIZE_MASK(data_ring))
+
+/* Get the ID for the same index of the previous wrap as the given ID. */
+#define DESC_ID_PREV_WRAP(desc_ring, id) \
+DESC_ID((id) - DESCS_COUNT(desc_ring))
+
+/*
+ * A data block: mapped directly to the beginning of the data block area
+ * specified as a logical position within the data ring.
+ *
+ * @id: the ID of the associated descriptor
+ * @data: the writer data
+ *
+ * Note that the size of a data block is only known by its associated
+ * descriptor.
+ */
+struct prb_data_block {
+ unsigned long id;
+ char data[0];
+};
+
+/*
+ * Return the descriptor associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
+{
+ return &desc_ring->descs[DESC_INDEX(desc_ring, n)];
+}
+
+/*
+ * Return the printk_info associated with @n. @n can be either a
+ * descriptor ID or a sequence number.
+ */
+static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
+{
+ return &desc_ring->infos[DESC_INDEX(desc_ring, n)];
+}
+
+static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
+ unsigned long begin_lpos)
+{
+ return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)];
+}
+
+/*
+ * Increase the data size to account for data block meta data plus any
+ * padding so that the adjacent data block is aligned on the ID size.
+ */
+static unsigned int to_blk_size(unsigned int size)
+{
+ struct prb_data_block *db = NULL;
+
+ size += sizeof(*db);
+ size = ALIGN(size, sizeof(db->id));
+ return size;
+}
+
+/*
+ * Sanity checker for reserve size. The ringbuffer code assumes that a data
+ * block does not exceed the maximum possible size that could fit within the
+ * ringbuffer. This function provides that basic size check so that the
+ * assumption is safe.
+ */
+static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
+{
+ struct prb_data_block *db = NULL;
+
+ if (size == 0)
+ return true;
+
+ /*
+ * Ensure the alignment padded size could possibly fit in the data
+ * array. The largest possible data block must still leave room for
+ * at least the ID of the next block.
+ */
+ size = to_blk_size(size);
+ if (size > DATA_SIZE(data_ring) - sizeof(db->id))
+ return false;
+
+ return true;
+}
+
+/* Query the state of a descriptor. */
+static enum desc_state get_desc_state(unsigned long id,
+ unsigned long state_val)
+{
+ if (id != DESC_ID(state_val))
+ return desc_miss;
+
+ return DESC_STATE(state_val);
+}
+
+/*
+ * Get a copy of a specified descriptor and return its queried state. If the
+ * descriptor is in an inconsistent state (miss or reserved), the caller can
+ * only expect the descriptor's @state_var field to be valid.
+ *
+ * The sequence number and caller_id can be optionally retrieved. Like all
+ * non-state_var data, they are only valid if the descriptor is in a
+ * consistent state.
+ */
+static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
+ unsigned long id, struct prb_desc *desc_out,
+ u64 *seq_out, u32 *caller_id_out)
+{
+ struct printk_info *info = to_info(desc_ring, id);
+ struct prb_desc *desc = to_desc(desc_ring, id);
+ atomic_long_t *state_var = &desc->state_var;
+ enum desc_state d_state;
+ unsigned long state_val;
+
+ /* Check the descriptor state. */
+ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
+ d_state = get_desc_state(id, state_val);
+ if (d_state == desc_miss || d_state == desc_reserved) {
+ /*
+ * The descriptor is in an inconsistent state. Set at least
+ * @state_var so that the caller can see the details of
+ * the inconsistent state.
+ */
+ goto out;
+ }
+
+ /*
+ * Guarantee the state is loaded before copying the descriptor
+ * content. This avoids copying obsolete descriptor content that might
+ * not apply to the descriptor state. This pairs with _prb_commit:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_read:A reads from _prb_commit:B, then desc_read:C reads
+ * from _prb_commit:A.
+ *
+ * Relies on:
+ *
+ * WMB from _prb_commit:A to _prb_commit:B
+ * matching
+ * RMB from desc_read:A to desc_read:C
+ */
+ smp_rmb(); /* LMM(desc_read:B) */
+
+ /*
+ * Copy the descriptor data. The data is not valid until the
+ * state has been re-checked. A memcpy() for all of @desc
+ * cannot be used because of the atomic_t @state_var field.
+ */
+ memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
+ sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
+ if (seq_out)
+ *seq_out = info->seq; /* also part of desc_read:C */
+ if (caller_id_out)
+ *caller_id_out = info->caller_id; /* also part of desc_read:C */
+
+ /*
+ * 1. Guarantee the descriptor content is loaded before re-checking
+ * the state. This avoids reading an obsolete descriptor state
+ * that may not apply to the copied content. This pairs with
+ * desc_reserve:F.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_read:C reads from desc_reserve:G, then desc_read:E
+ * reads from desc_reserve:F.
+ *
+ * Relies on:
+ *
+ * WMB from desc_reserve:F to desc_reserve:G
+ * matching
+ * RMB from desc_read:C to desc_read:E
+ *
+ * 2. Guarantee the record data is loaded before re-checking the
+ * state. This avoids reading an obsolete descriptor state that may
+ * not apply to the copied data. This pairs with data_alloc:A and
+ * data_realloc:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If copy_data:A reads from data_alloc:B, then desc_read:E
+ * reads from desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to data_alloc:B
+ * matching
+ * RMB from desc_read:C to desc_read:E
+ *
+ * Note: desc_make_reusable:A and data_alloc:B can be different
+ * CPUs. However, the data_alloc:B CPU (which performs the
+ * full memory barrier) must have previously seen
+ * desc_make_reusable:A.
+ */
+ smp_rmb(); /* LMM(desc_read:D) */
+
+ /*
+ * The data has been copied. Return the current descriptor state,
+ * which may have changed since the load above.
+ */
+ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
+ d_state = get_desc_state(id, state_val);
+out:
+ atomic_long_set(&desc_out->state_var, state_val);
+ return d_state;
+}
+
+/*
+ * Take a specified descriptor out of the finalized state by attempting
+ * the transition from finalized to reusable. Either this context or some
+ * other context will have been successful.
+ */
+static void desc_make_reusable(struct prb_desc_ring *desc_ring,
+ unsigned long id)
+{
+ unsigned long val_finalized = DESC_SV(id, desc_finalized);
+ unsigned long val_reusable = DESC_SV(id, desc_reusable);
+ struct prb_desc *desc = to_desc(desc_ring, id);
+ atomic_long_t *state_var = &desc->state_var;
+
+ atomic_long_cmpxchg_relaxed(state_var, val_finalized,
+ val_reusable); /* LMM(desc_make_reusable:A) */
+}
+
+/*
+ * Given the text data ring, put the associated descriptor of each
+ * data block from @lpos_begin until @lpos_end into the reusable state.
+ *
+ * If there is any problem making the associated descriptor reusable, either
+ * the descriptor has not yet been finalized or another writer context has
+ * already pushed the tail lpos past the problematic data block. Regardless,
+ * on error the caller can re-load the tail lpos to determine the situation.
+ */
+static bool data_make_reusable(struct printk_ringbuffer *rb,
+ struct prb_data_ring *data_ring,
+ unsigned long lpos_begin,
+ unsigned long lpos_end,
+ unsigned long *lpos_out)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct prb_data_block *blk;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
+ unsigned long id;
+
+ /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
+ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) {
+ blk = to_block(data_ring, lpos_begin);
+
+ /*
+ * Load the block ID from the data block. This is a data race
+ * against a writer that may have newly reserved this data
+ * area. If the loaded value matches a valid descriptor ID,
+ * the blk_lpos of that descriptor will be checked to make
+ * sure it points back to this data block. If the check fails,
+ * the data area has been recycled by another writer.
+ */
+ id = blk->id; /* LMM(data_make_reusable:A) */
+
+ d_state = desc_read(desc_ring, id, &desc,
+ NULL, NULL); /* LMM(data_make_reusable:B) */
+
+ switch (d_state) {
+ case desc_miss:
+ case desc_reserved:
+ case desc_committed:
+ return false;
+ case desc_finalized:
+ /*
+ * This data block is invalid if the descriptor
+ * does not point back to it.
+ */
+ if (blk_lpos->begin != lpos_begin)
+ return false;
+ desc_make_reusable(desc_ring, id);
+ break;
+ case desc_reusable:
+ /*
+ * This data block is invalid if the descriptor
+ * does not point back to it.
+ */
+ if (blk_lpos->begin != lpos_begin)
+ return false;
+ break;
+ }
+
+ /* Advance @lpos_begin to the next data block. */
+ lpos_begin = blk_lpos->next;
+ }
+
+ *lpos_out = lpos_begin;
+ return true;
+}
+
+/*
+ * Advance the data ring tail to at least @lpos. This function puts
+ * descriptors into the reusable state if the tail is pushed beyond
+ * their associated data block.
+ */
+static bool data_push_tail(struct printk_ringbuffer *rb,
+ struct prb_data_ring *data_ring,
+ unsigned long lpos)
+{
+ unsigned long tail_lpos_new;
+ unsigned long tail_lpos;
+ unsigned long next_lpos;
+
+ /* If @lpos is from a data-less block, there is nothing to do. */
+ if (LPOS_DATALESS(lpos))
+ return true;
+
+ /*
+ * Any descriptor states that have transitioned to reusable due to the
+ * data tail being pushed to this loaded value will be visible to this
+ * CPU. This pairs with data_push_tail:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_push_tail:A reads from data_push_tail:D, then this CPU can
+ * see desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to data_push_tail:D
+ * matches
+ * READFROM from data_push_tail:D to data_push_tail:A
+ * thus
+ * READFROM from desc_make_reusable:A to this CPU
+ */
+ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */
+
+ /*
+ * Loop until the tail lpos is at or beyond @lpos. This condition
+ * may already be satisfied, resulting in no full memory barrier
+ * from data_push_tail:D being performed. However, since this CPU
+ * sees the new tail lpos, any descriptor states that transitioned to
+ * the reusable state must already be visible.
+ */
+ while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
+ /*
+ * Make all descriptors reusable that are associated with
+ * data blocks before @lpos.
+ */
+ if (!data_make_reusable(rb, data_ring, tail_lpos, lpos,
+ &next_lpos)) {
+ /*
+ * 1. Guarantee the block ID loaded in
+ * data_make_reusable() is performed before
+ * reloading the tail lpos. The failed
+ * data_make_reusable() may be due to a newly
+ * recycled data area causing the tail lpos to
+ * have been previously pushed. This pairs with
+ * data_alloc:A and data_realloc:A.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_make_reusable:A reads from data_alloc:B,
+ * then data_push_tail:C reads from
+ * data_push_tail:D.
+ *
+ * Relies on:
+ *
+ * MB from data_push_tail:D to data_alloc:B
+ * matching
+ * RMB from data_make_reusable:A to
+ * data_push_tail:C
+ *
+ * Note: data_push_tail:D and data_alloc:B can be
+ * different CPUs. However, the data_alloc:B
+ * CPU (which performs the full memory
+ * barrier) must have previously seen
+ * data_push_tail:D.
+ *
+ * 2. Guarantee the descriptor state loaded in
+ * data_make_reusable() is performed before
+ * reloading the tail lpos. The failed
+ * data_make_reusable() may be due to a newly
+ * recycled descriptor causing the tail lpos to
+ * have been previously pushed. This pairs with
+ * desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If data_make_reusable:B reads from
+ * desc_reserve:F, then data_push_tail:C reads
+ * from data_push_tail:D.
+ *
+ * Relies on:
+ *
+ * MB from data_push_tail:D to desc_reserve:F
+ * matching
+ * RMB from data_make_reusable:B to
+ * data_push_tail:C
+ *
+ * Note: data_push_tail:D and desc_reserve:F can
+ * be different CPUs. However, the
+ * desc_reserve:F CPU (which performs the
+ * full memory barrier) must have previously
+ * seen data_push_tail:D.
+ */
+ smp_rmb(); /* LMM(data_push_tail:B) */
+
+ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
+ ); /* LMM(data_push_tail:C) */
+ if (tail_lpos_new == tail_lpos)
+ return false;
+
+ /* Another CPU pushed the tail. Try again. */
+ tail_lpos = tail_lpos_new;
+ continue;
+ }
+
+ /*
+ * Guarantee any descriptor states that have transitioned to
+ * reusable are stored before pushing the tail lpos. A full
+ * memory barrier is needed since other CPUs may have made
+ * the descriptor states reusable. This pairs with
+ * data_push_tail:A.
+ */
+ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
+ next_lpos)) { /* LMM(data_push_tail:D) */
+ break;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Advance the desc ring tail. This function advances the tail by one
+ * descriptor, thus invalidating the oldest descriptor. Before advancing
+ * the tail, the tail descriptor is made reusable and all data blocks up to
+ * and including the descriptor's data block are invalidated (i.e. the data
+ * ring tail is pushed past the data block of the descriptor being made
+ * reusable).
+ */
+static bool desc_push_tail(struct printk_ringbuffer *rb,
+ unsigned long tail_id)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ struct prb_desc desc;
+
+ d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);
+
+ switch (d_state) {
+ case desc_miss:
+ /*
+ * If the ID is exactly 1 wrap behind the expected, it is
+ * in the process of being reserved by another writer and
+ * must be considered reserved.
+ */
+ if (DESC_ID(atomic_long_read(&desc.state_var)) ==
+ DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
+ return false;
+ }
+
+ /*
+ * The ID has changed. Another writer must have pushed the
+ * tail and recycled the descriptor already. Success is
+ * returned because the caller is only interested in the
+ * specified tail being pushed, which it was.
+ */
+ return true;
+ case desc_reserved:
+ case desc_committed:
+ return false;
+ case desc_finalized:
+ desc_make_reusable(desc_ring, tail_id);
+ break;
+ case desc_reusable:
+ break;
+ }
+
+ /*
+ * Data blocks must be invalidated before their associated
+ * descriptor can be made available for recycling. Invalidating
+ * them later is not possible because there is no way to trust
+ * data blocks once their associated descriptor is gone.
+ */
+
+ if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next))
+ return false;
+
+ /*
+ * Check the next descriptor after @tail_id before pushing the tail
+ * to it because the tail must always be in a finalized or reusable
+ * state. The implementation of prb_first_seq() relies on this.
+ *
+ * A successful read implies that the next descriptor is less than or
+ * equal to @head_id so there is no risk of pushing the tail past the
+ * head.
+ */
+ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
+ NULL, NULL); /* LMM(desc_push_tail:A) */
+
+ if (d_state == desc_finalized || d_state == desc_reusable) {
+ /*
+ * Guarantee any descriptor states that have transitioned to
+ * reusable are stored before pushing the tail ID. This allows
+ * verifying the recycled descriptor state. A full memory
+ * barrier is needed since other CPUs may have made the
+ * descriptor states reusable. This pairs with desc_reserve:D.
+ */
+ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
+ DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
+ } else {
+ /*
+ * Guarantee the last state load from desc_read() is before
+ * reloading @tail_id in order to see a new tail ID in the
+ * case that the descriptor has been recycled. This pairs
+ * with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_push_tail:A reads from desc_reserve:F, then
+ * desc_push_tail:D reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:F
+ * matching
+ * RMB from desc_push_tail:A to desc_push_tail:D
+ *
+ * Note: desc_push_tail:B and desc_reserve:F can be different
+ * CPUs. However, the desc_reserve:F CPU (which performs
+ * the full memory barrier) must have previously seen
+ * desc_push_tail:B.
+ */
+ smp_rmb(); /* LMM(desc_push_tail:C) */
+
+ /*
+ * Re-check the tail ID. The descriptor following @tail_id is
+ * not in an allowed tail state. But if the tail has since
+ * been moved by another CPU, then it does not matter.
+ */
+ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
+ return false;
+ }
+
+ return true;
+}
+
+/* Reserve a new descriptor, invalidating the oldest if necessary. */
+static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ unsigned long prev_state_val;
+ unsigned long id_prev_wrap;
+ struct prb_desc *desc;
+ unsigned long head_id;
+ unsigned long id;
+
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */
+
+ do {
+ desc = to_desc(desc_ring, head_id);
+
+ id = DESC_ID(head_id + 1);
+ id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);
+
+ /*
+ * Guarantee the head ID is read before reading the tail ID.
+ * Since the tail ID is updated before the head ID, this
+ * guarantees that @id_prev_wrap is never ahead of the tail
+ * ID. This pairs with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reserve:A reads from desc_reserve:D, then
+ * desc_reserve:C reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:D
+ * matching
+ * RMB from desc_reserve:A to desc_reserve:C
+ *
+ * Note: desc_push_tail:B and desc_reserve:D can be different
+ * CPUs. However, the desc_reserve:D CPU (which performs
+ * the full memory barrier) must have previously seen
+ * desc_push_tail:B.
+ */
+ smp_rmb(); /* LMM(desc_reserve:B) */
+
+ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
+ )) { /* LMM(desc_reserve:C) */
+ /*
+ * Make space for the new descriptor by
+ * advancing the tail.
+ */
+ if (!desc_push_tail(rb, id_prev_wrap))
+ return false;
+ }
+
+ /*
+ * 1. Guarantee the tail ID is read before validating the
+ * recycled descriptor state. A read memory barrier is
+ * sufficient for this. This pairs with desc_push_tail:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reserve:C reads from desc_push_tail:B, then
+ * desc_reserve:E reads from desc_make_reusable:A.
+ *
+ * Relies on:
+ *
+ * MB from desc_make_reusable:A to desc_push_tail:B
+ * matching
+ * RMB from desc_reserve:C to desc_reserve:E
+ *
+ * Note: desc_make_reusable:A and desc_push_tail:B can be
+ * different CPUs. However, the desc_push_tail:B CPU
+ * (which performs the full memory barrier) must have
+ * previously seen desc_make_reusable:A.
+ *
+ * 2. Guarantee the tail ID is stored before storing the head
+ * ID. This pairs with desc_reserve:B.
+ *
+ * 3. Guarantee any data ring tail changes are stored before
+ * recycling the descriptor. Data ring tail changes can
+ * happen via desc_push_tail()->data_push_tail(). A full
+ * memory barrier is needed since another CPU may have
+ * pushed the data ring tails. This pairs with
+ * data_push_tail:B.
+ *
+ * 4. Guarantee a new tail ID is stored before recycling the
+ * descriptor. A full memory barrier is needed since
+ * another CPU may have pushed the tail ID. This pairs
+ * with desc_push_tail:C and this also pairs with
+ * prb_first_seq:C.
+ *
+ * 5. Guarantee the head ID is stored before trying to
+ * finalize the previous descriptor. This pairs with
+ * _prb_commit:B.
+ */
+ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
+ id)); /* LMM(desc_reserve:D) */
+
+ desc = to_desc(desc_ring, id);
+
+ /*
+ * If the descriptor has been recycled, verify the old state val.
+ * See "ABA Issues" about why this verification is performed.
+ */
+ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
+ if (prev_state_val &&
+ get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /*
+ * Assign the descriptor a new ID and set its state to reserved.
+ * See "ABA Issues" about why cmpxchg() instead of set() is used.
+ *
+ * Guarantee the new descriptor ID and state is stored before making
+ * any other changes. A write memory barrier is sufficient for this.
+ * This pairs with desc_read:D.
+ */
+ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ /* Now data in @desc can be modified: LMM(desc_reserve:G) */
+
+ *id_out = id;
+ return true;
+}
+
+/* Determine the end of a data block. */
+static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
+ unsigned long lpos, unsigned int size)
+{
+ unsigned long begin_lpos;
+ unsigned long next_lpos;
+
+ begin_lpos = lpos;
+ next_lpos = lpos + size;
+
+ /* First check if the data block does not wrap. */
+ if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos))
+ return next_lpos;
+
+ /* Wrapping data blocks store their data at the beginning. */
+ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size);
+}
+
+/*
+ * Allocate a new data block, invalidating the oldest data block(s)
+ * if necessary. This function also associates the data block with
+ * a specified descriptor.
+ */
+static char *data_alloc(struct printk_ringbuffer *rb,
+ struct prb_data_ring *data_ring, unsigned int size,
+ struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+ struct prb_data_block *blk;
+ unsigned long begin_lpos;
+ unsigned long next_lpos;
+
+ if (size == 0) {
+ /* Specify a data-less block. */
+ blk_lpos->begin = NO_LPOS;
+ blk_lpos->next = NO_LPOS;
+ return NULL;
+ }
+
+ size = to_blk_size(size);
+
+ begin_lpos = atomic_long_read(&data_ring->head_lpos);
+
+ do {
+ next_lpos = get_next_lpos(data_ring, begin_lpos, size);
+
+ if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) {
+ /* Failed to allocate, specify a data-less block. */
+ blk_lpos->begin = FAILED_LPOS;
+ blk_lpos->next = FAILED_LPOS;
+ return NULL;
+ }
+
+ /*
+ * 1. Guarantee any descriptor states that have transitioned
+ * to reusable are stored before modifying the newly
+ * allocated data area. A full memory barrier is needed
+ * since other CPUs may have made the descriptor states
+ * reusable. See data_push_tail:A about why the reusable
+ * states are visible. This pairs with desc_read:D.
+ *
+ * 2. Guarantee any updated tail lpos is stored before
+ * modifying the newly allocated data area. Another CPU may
+ * be in data_make_reusable() and is reading a block ID
+ * from this area. data_make_reusable() can handle reading
+ * a garbage block ID value, but then it must be able to
+ * load a new tail lpos. A full memory barrier is needed
+ * since other CPUs may have updated the tail lpos. This
+ * pairs with data_push_tail:B.
+ */
+ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
+ next_lpos)); /* LMM(data_alloc:A) */
+
+ blk = to_block(data_ring, begin_lpos);
+ blk->id = id; /* LMM(data_alloc:B) */
+
+ if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
+ /* Wrapping data blocks store their data at the beginning. */
+ blk = to_block(data_ring, 0);
+
+ /*
+ * Store the ID on the wrapped block for consistency.
+ * The printk_ringbuffer does not actually use it.
+ */
+ blk->id = id;
+ }
+
+ blk_lpos->begin = begin_lpos;
+ blk_lpos->next = next_lpos;
+
+ return &blk->data[0];
+}
+
+/*
+ * Try to resize an existing data block associated with the descriptor
+ * specified by @id. If the resized data block should become wrapped, it
+ * copies the old data to the new data block. If @size yields a data block
+ * with the same or less size, the data block is left as is.
+ *
+ * Fail if this is not the last allocated data block or if there is not
+ * enough space or it is not possible make enough space.
+ *
+ * Return a pointer to the beginning of the entire data buffer or NULL on
+ * failure.
+ */
+static char *data_realloc(struct printk_ringbuffer *rb,
+ struct prb_data_ring *data_ring, unsigned int size,
+ struct prb_data_blk_lpos *blk_lpos, unsigned long id)
+{
+ struct prb_data_block *blk;
+ unsigned long head_lpos;
+ unsigned long next_lpos;
+ bool wrapped;
+
+ /* Reallocation only works if @blk_lpos is the newest data block. */
+ head_lpos = atomic_long_read(&data_ring->head_lpos);
+ if (head_lpos != blk_lpos->next)
+ return NULL;
+
+ /* Keep track if @blk_lpos was a wrapping data block. */
+ wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));
+
+ size = to_blk_size(size);
+
+ next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);
+
+ /* If the data block does not increase, there is nothing to do. */
+ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
+ blk = to_block(data_ring, blk_lpos->begin);
+ return &blk->data[0];
+ }
+
+ if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring)))
+ return NULL;
+
+ /* The memory barrier involvement is the same as data_alloc:A. */
+ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
+ next_lpos)) { /* LMM(data_realloc:A) */
+ return NULL;
+ }
+
+ blk = to_block(data_ring, blk_lpos->begin);
+
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
+ struct prb_data_block *old_blk = blk;
+
+ /* Wrapping data blocks store their data at the beginning. */
+ blk = to_block(data_ring, 0);
+
+ /*
+ * Store the ID on the wrapped block for consistency.
+ * The printk_ringbuffer does not actually use it.
+ */
+ blk->id = id;
+
+ if (!wrapped) {
+ /*
+ * Since the allocated space is now in the newly
+ * created wrapping data block, copy the content
+ * from the old data block.
+ */
+ memcpy(&blk->data[0], &old_blk->data[0],
+ (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
+ }
+ }
+
+ blk_lpos->next = next_lpos;
+
+ return &blk->data[0];
+}
+
+/* Return the number of bytes used by a data block. */
+static unsigned int space_used(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos)
+{
+ /* Data-less blocks take no space. */
+ if (BLK_DATALESS(blk_lpos))
+ return 0;
+
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) {
+ /* Data block does not wrap. */
+ return (DATA_INDEX(data_ring, blk_lpos->next) -
+ DATA_INDEX(data_ring, blk_lpos->begin));
+ }
+
+ /*
+ * For wrapping data blocks, the trailing (wasted) space is
+ * also counted.
+ */
+ return (DATA_INDEX(data_ring, blk_lpos->next) +
+ DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin));
+}
+
+/*
+ * Given @blk_lpos, return a pointer to the writer data from the data block
+ * and calculate the size of the data part. A NULL pointer is returned if
+ * @blk_lpos specifies values that could never be legal.
+ *
+ * This function (used by readers) performs strict validation on the lpos
+ * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static const char *get_data(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos,
+ unsigned int *data_size)
+{
+ struct prb_data_block *db;
+
+ /* Data-less data block description. */
+ if (BLK_DATALESS(blk_lpos)) {
+ if (blk_lpos->begin == NO_LPOS && blk_lpos->next == NO_LPOS) {
+ *data_size = 0;
+ return "";
+ }
+ return NULL;
+ }
+
+ /* Regular data block: @begin less than @next and in same wrap. */
+ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) &&
+ blk_lpos->begin < blk_lpos->next) {
+ db = to_block(data_ring, blk_lpos->begin);
+ *data_size = blk_lpos->next - blk_lpos->begin;
+
+ /* Wrapping data block: @begin is one wrap behind @next. */
+ } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) ==
+ DATA_WRAPS(data_ring, blk_lpos->next)) {
+ db = to_block(data_ring, 0);
+ *data_size = DATA_INDEX(data_ring, blk_lpos->next);
+
+ /* Illegal block description. */
+ } else {
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
+ /* A valid data block will always be aligned to the ID size. */
+ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) ||
+ WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) {
+ return NULL;
+ }
+
+ /* A valid data block will always have at least an ID. */
+ if (WARN_ON_ONCE(*data_size < sizeof(db->id)))
+ return NULL;
+
+ /* Subtract block ID space from size to reflect data size. */
+ *data_size -= sizeof(db->id);
+
+ return &db->data[0];
+}
+
+/*
+ * Attempt to transition the newest descriptor from committed back to reserved
+ * so that the record can be modified by a writer again. This is only possible
+ * if the descriptor is not yet finalized and the provided @caller_id matches.
+ */
+static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
+ u32 caller_id, unsigned long *id_out)
+{
+ unsigned long prev_state_val;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ struct prb_desc *d;
+ unsigned long id;
+ u32 cid;
+
+ id = atomic_long_read(&desc_ring->head_id);
+
+ /*
+ * To reduce unnecessarily reopening, first check if the descriptor
+ * state and caller ID are correct.
+ */
+ d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
+ if (d_state != desc_committed || cid != caller_id)
+ return NULL;
+
+ d = to_desc(desc_ring, id);
+
+ prev_state_val = DESC_SV(id, desc_committed);
+
+ /*
+ * Guarantee the reserved state is stored before reading any
+ * record data. A full memory barrier is needed because @state_var
+ * modification is followed by reading. This pairs with _prb_commit:B.
+ *
+ * Memory barrier involvement:
+ *
+ * If desc_reopen_last:A reads from _prb_commit:B, then
+ * prb_reserve_in_last:A reads from _prb_commit:A.
+ *
+ * Relies on:
+ *
+ * WMB from _prb_commit:A to _prb_commit:B
+ * matching
+ * MB If desc_reopen_last:A to prb_reserve_in_last:A
+ */
+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+ DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
+ return NULL;
+ }
+
+ *id_out = id;
+ return d;
+}
+
+/**
+ * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
+ * used by the newest record.
+ *
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to re-reserve and extend data in.
+ * @r: The record structure to allocate buffers for.
+ * @caller_id: The caller ID of the caller (reserving writer).
+ * @max_size: Fail if the extended size would be greater than this.
+ *
+ * This is the public function available to writers to re-reserve and extend
+ * data.
+ *
+ * The writer specifies the text size to extend (not the new total size) by
+ * setting the @text_buf_size field of @r. To ensure proper initialization
+ * of @r, prb_rec_init_wr() should be used.
+ *
+ * This function will fail if @caller_id does not match the caller ID of the
+ * newest record. In that case the caller must reserve new data using
+ * prb_reserve().
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if text data could be extended, otherwise false.
+ *
+ * On success:
+ *
+ * - @r->text_buf points to the beginning of the entire text buffer.
+ *
+ * - @r->text_buf_size is set to the new total size of the buffer.
+ *
+ * - @r->info is not touched so that @r->info->text_len could be used
+ * to append the text.
+ *
+ * - prb_record_text_space() can be used on @e to query the new
+ * actually used space.
+ *
+ * Important: All @r->info fields will already be set with the current values
+ * for the record. I.e. @r->info->text_len will be less than
+ * @text_buf_size. Writers can use @r->info->text_len to know
+ * where concatenation begins and writers should update
+ * @r->info->text_len after concatenating.
+ */
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r, u32 caller_id, unsigned int max_size)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info;
+ unsigned int data_size;
+ struct prb_desc *d;
+ unsigned long id;
+
+ local_irq_save(e->irqflags);
+
+ /* Transition the newest descriptor back to the reserved state. */
+ d = desc_reopen_last(desc_ring, caller_id, &id);
+ if (!d) {
+ local_irq_restore(e->irqflags);
+ goto fail_reopen;
+ }
+
+ /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */
+
+ info = to_info(desc_ring, id);
+
+ /*
+ * Set the @e fields here so that prb_commit() can be used if
+ * anything fails from now on.
+ */
+ e->rb = rb;
+ e->id = id;
+
+ /*
+ * desc_reopen_last() checked the caller_id, but there was no
+ * exclusive access at that point. The descriptor may have
+ * changed since then.
+ */
+ if (caller_id != info->caller_id)
+ goto fail;
+
+ if (BLK_DATALESS(&d->text_blk_lpos)) {
+ if (WARN_ON_ONCE(info->text_len != 0)) {
+ pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
+ info->text_len);
+ info->text_len = 0;
+ }
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ if (r->text_buf_size > max_size)
+ goto fail;
+
+ r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ } else {
+ if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
+ goto fail;
+
+ /*
+ * Increase the buffer size to include the original size. If
+ * the meta data (@text_len) is not sane, use the full data
+ * block size.
+ */
+ if (WARN_ON_ONCE(info->text_len > data_size)) {
+ pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
+ info->text_len, data_size);
+ info->text_len = data_size;
+ }
+ r->text_buf_size += info->text_len;
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ if (r->text_buf_size > max_size)
+ goto fail;
+
+ r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ }
+ if (r->text_buf_size && !r->text_buf)
+ goto fail;
+
+ r->info = info;
+
+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+ return true;
+fail:
+ prb_commit(e);
+ /* prb_commit() re-enabled interrupts. */
+fail_reopen:
+ /* Make it clear to the caller that the re-reserve failed. */
+ memset(r, 0, sizeof(*r));
+ return false;
+}
+
+/*
+ * Attempt to finalize a specified descriptor. If this fails, the descriptor
+ * is either already final or it will finalize itself when the writer commits.
+ */
+static void desc_make_final(struct prb_desc_ring *desc_ring, unsigned long id)
+{
+ unsigned long prev_state_val = DESC_SV(id, desc_committed);
+ struct prb_desc *d = to_desc(desc_ring, id);
+
+ atomic_long_cmpxchg_relaxed(&d->state_var, prev_state_val,
+ DESC_SV(id, desc_finalized)); /* LMM(desc_make_final:A) */
+}
+
+/**
+ * prb_reserve() - Reserve space in the ringbuffer.
+ *
+ * @e: The entry structure to setup.
+ * @rb: The ringbuffer to reserve data in.
+ * @r: The record structure to allocate buffers for.
+ *
+ * This is the public function available to writers to reserve data.
+ *
+ * The writer specifies the text size to reserve by setting the
+ * @text_buf_size field of @r. To ensure proper initialization of @r,
+ * prb_rec_init_wr() should be used.
+ *
+ * Context: Any context. Disables local interrupts on success.
+ * Return: true if at least text data could be allocated, otherwise false.
+ *
+ * On success, the fields @info and @text_buf of @r will be set by this
+ * function and should be filled in by the writer before committing. Also
+ * on success, prb_record_text_space() can be used on @e to query the actual
+ * space used for the text data block.
+ *
+ * Important: @info->text_len needs to be set correctly by the writer in
+ * order for data to be readable and/or extended. Its value
+ * is initialized to 0.
+ */
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info;
+ struct prb_desc *d;
+ unsigned long id;
+ u64 seq;
+
+ if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
+ goto fail;
+
+ /*
+ * Descriptors in the reserved state act as blockers to all further
+ * reservations once the desc_ring has fully wrapped. Disable
+ * interrupts during the reserve/commit window in order to minimize
+ * the likelihood of this happening.
+ */
+ local_irq_save(e->irqflags);
+
+ if (!desc_reserve(rb, &id)) {
+ /* Descriptor reservation failures are tracked. */
+ atomic_long_inc(&rb->fail);
+ local_irq_restore(e->irqflags);
+ goto fail;
+ }
+
+ d = to_desc(desc_ring, id);
+ info = to_info(desc_ring, id);
+
+ /*
+ * All @info fields (except @seq) are cleared and must be filled in
+ * by the writer. Save @seq before clearing because it is used to
+ * determine the new sequence number.
+ */
+ seq = info->seq;
+ memset(info, 0, sizeof(*info));
+
+ /*
+ * Set the @e fields here so that prb_commit() can be used if
+ * text data allocation fails.
+ */
+ e->rb = rb;
+ e->id = id;
+
+ /*
+ * Initialize the sequence number if it has "never been set".
+ * Otherwise just increment it by a full wrap.
+ *
+ * @seq is considered "never been set" if it has a value of 0,
+ * _except_ for @infos[0], which was specially setup by the ringbuffer
+ * initializer and therefore is always considered as set.
+ *
+ * See the "Bootstrap" comment block in printk_ringbuffer.h for
+ * details about how the initializer bootstraps the descriptors.
+ */
+ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
+ info->seq = DESC_INDEX(desc_ring, id);
+ else
+ info->seq = seq + DESCS_COUNT(desc_ring);
+
+ /*
+ * New data is about to be reserved. Once that happens, previous
+ * descriptors are no longer able to be extended. Finalize the
+ * previous descriptor now so that it can be made available to
+ * readers. (For seq==0 there is no previous descriptor.)
+ */
+ if (info->seq > 0)
+ desc_make_final(desc_ring, DESC_ID(id - 1));
+
+ r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size,
+ &d->text_blk_lpos, id);
+ /* If text data allocation fails, a data-less record is committed. */
+ if (r->text_buf_size && !r->text_buf) {
+ prb_commit(e);
+ /* prb_commit() re-enabled interrupts. */
+ goto fail;
+ }
+
+ r->info = info;
+
+ /* Record full text space used by record. */
+ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);
+
+ return true;
+fail:
+ /* Make it clear to the caller that the reserve failed. */
+ memset(r, 0, sizeof(*r));
+ return false;
+}
+
+/* Commit the data (possibly finalizing it) and restore interrupts. */
+static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+ struct prb_desc *d = to_desc(desc_ring, e->id);
+ unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);
+
+ /* Now the writer has finished all writing: LMM(_prb_commit:A) */
+
+ /*
+ * Set the descriptor as committed. See "ABA Issues" about why
+ * cmpxchg() instead of set() is used.
+ *
+ * 1 Guarantee all record data is stored before the descriptor state
+ * is stored as committed. A write memory barrier is sufficient
+ * for this. This pairs with desc_read:B and desc_reopen_last:A.
+ *
+ * 2. Guarantee the descriptor state is stored as committed before
+ * re-checking the head ID in order to possibly finalize this
+ * descriptor. This pairs with desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If prb_commit:A reads from desc_reserve:D, then
+ * desc_make_final:A reads from _prb_commit:B.
+ *
+ * Relies on:
+ *
+ * MB _prb_commit:B to prb_commit:A
+ * matching
+ * MB desc_reserve:D to desc_make_final:A
+ */
+ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
+ DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
+ WARN_ON_ONCE(1);
+ }
+
+ /* Restore interrupts, the reserve/commit window is finished. */
+ local_irq_restore(e->irqflags);
+}
+
+/**
+ * prb_commit() - Commit (previously reserved) data to the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit data.
+ *
+ * Note that the data is not yet available to readers until it is finalized.
+ * Finalizing happens automatically when space for the next record is
+ * reserved.
+ *
+ * See prb_final_commit() for a version of this function that finalizes
+ * immediately.
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_commit(struct prb_reserved_entry *e)
+{
+ struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
+ unsigned long head_id;
+
+ _prb_commit(e, desc_committed);
+
+ /*
+ * If this descriptor is no longer the head (i.e. a new record has
+ * been allocated), extending the data for this record is no longer
+ * allowed and therefore it must be finalized.
+ */
+ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
+ if (head_id != e->id)
+ desc_make_final(desc_ring, e->id);
+}
+
+/**
+ * prb_final_commit() - Commit and finalize (previously reserved) data to
+ * the ringbuffer.
+ *
+ * @e: The entry containing the reserved data information.
+ *
+ * This is the public function available to writers to commit+finalize data.
+ *
+ * By finalizing, the data is made immediately available to readers.
+ *
+ * This function should only be used if there are no intentions of extending
+ * this data using prb_reserve_in_last().
+ *
+ * Context: Any context. Enables local interrupts.
+ */
+void prb_final_commit(struct prb_reserved_entry *e)
+{
+ _prb_commit(e, desc_finalized);
+}
+
+/*
+ * Count the number of lines in provided text. All text has at least 1 line
+ * (even if @text_size is 0). Each '\n' processed is counted as an additional
+ * line.
+ */
+static unsigned int count_lines(const char *text, unsigned int text_size)
+{
+ unsigned int next_size = text_size;
+ unsigned int line_count = 1;
+ const char *next = text;
+
+ while (next_size) {
+ next = memchr(next, '\n', next_size);
+ if (!next)
+ break;
+ line_count++;
+ next++;
+ next_size = text_size - (next - text);
+ }
+
+ return line_count;
+}
+
+/*
+ * Given @blk_lpos, copy an expected @len of data into the provided buffer.
+ * If @line_count is provided, count the number of lines in the data.
+ *
+ * This function (used by readers) performs strict validation on the data
+ * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
+ * triggered if an internal error is detected.
+ */
+static bool copy_data(struct prb_data_ring *data_ring,
+ struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
+ unsigned int buf_size, unsigned int *line_count)
+{
+ unsigned int data_size;
+ const char *data;
+
+ /* Caller might not want any data. */
+ if ((!buf || !buf_size) && !line_count)
+ return true;
+
+ data = get_data(data_ring, blk_lpos, &data_size);
+ if (!data)
+ return false;
+
+ /*
+ * Actual cannot be less than expected. It can be more than expected
+ * because of the trailing alignment padding.
+ *
+ * Note that invalid @len values can occur because the caller loads
+ * the value during an allowed data race.
+ */
+ if (data_size < (unsigned int)len)
+ return false;
+
+ /* Caller interested in the line count? */
+ if (line_count)
+ *line_count = count_lines(data, data_size);
+
+ /* Caller interested in the data content? */
+ if (!buf || !buf_size)
+ return true;
+
+ data_size = min_t(u16, buf_size, len);
+
+ memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */
+ return true;
+}
+
+/*
+ * This is an extended version of desc_read(). It gets a copy of a specified
+ * descriptor. However, it also verifies that the record is finalized and has
+ * the sequence number @seq. On success, 0 is returned.
+ *
+ * Error return values:
+ * -EINVAL: A finalized record with sequence number @seq does not exist.
+ * -ENOENT: A finalized record with sequence number @seq exists, but its data
+ * is not available. This is a valid record, so readers should
+ * continue with the next record.
+ */
+static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
+ unsigned long id, u64 seq,
+ struct prb_desc *desc_out)
+{
+ struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos;
+ enum desc_state d_state;
+ u64 s;
+
+ d_state = desc_read(desc_ring, id, desc_out, &s, NULL);
+
+ /*
+ * An unexpected @id (desc_miss) or @seq mismatch means the record
+ * does not exist. A descriptor in the reserved or committed state
+ * means the record does not yet exist for the reader.
+ */
+ if (d_state == desc_miss ||
+ d_state == desc_reserved ||
+ d_state == desc_committed ||
+ s != seq) {
+ return -EINVAL;
+ }
+
+ /*
+ * A descriptor in the reusable state may no longer have its data
+ * available; report it as existing but with lost data. Or the record
+ * may actually be a record with lost data.
+ */
+ if (d_state == desc_reusable ||
+ (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) {
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy the ringbuffer data from the record with @seq to the provided
+ * @r buffer. On success, 0 is returned.
+ *
+ * See desc_read_finalized_seq() for error return values.
+ */
+static int prb_read(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r, unsigned int *line_count)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ struct printk_info *info = to_info(desc_ring, seq);
+ struct prb_desc *rdesc = to_desc(desc_ring, seq);
+ atomic_long_t *state_var = &rdesc->state_var;
+ struct prb_desc desc;
+ unsigned long id;
+ int err;
+
+ /* Extract the ID, used to specify the descriptor to read. */
+ id = DESC_ID(atomic_long_read(state_var));
+
+ /* Get a local copy of the correct descriptor (if available). */
+ err = desc_read_finalized_seq(desc_ring, id, seq, &desc);
+
+ /*
+ * If @r is NULL, the caller is only interested in the availability
+ * of the record.
+ */
+ if (err || !r)
+ return err;
+
+ /* If requested, copy meta data. */
+ if (r->info)
+ memcpy(r->info, info, sizeof(*(r->info)));
+
+ /* Copy text data. If it fails, this is a data-less record. */
+ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
+ r->text_buf, r->text_buf_size, line_count)) {
+ return -ENOENT;
+ }
+
+ /* Ensure the record is still finalized and has the same @seq. */
+ return desc_read_finalized_seq(desc_ring, id, seq, &desc);
+}
+
+/* Get the sequence number of the tail descriptor. */
+static u64 prb_first_seq(struct printk_ringbuffer *rb)
+{
+ struct prb_desc_ring *desc_ring = &rb->desc_ring;
+ enum desc_state d_state;
+ struct prb_desc desc;
+ unsigned long id;
+ u64 seq;
+
+ for (;;) {
+ id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */
+
+ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */
+
+ /*
+ * This loop will not be infinite because the tail is
+ * _always_ in the finalized or reusable state.
+ */
+ if (d_state == desc_finalized || d_state == desc_reusable)
+ break;
+
+ /*
+ * Guarantee the last state load from desc_read() is before
+ * reloading @tail_id in order to see a new tail in the case
+ * that the descriptor has been recycled. This pairs with
+ * desc_reserve:D.
+ *
+ * Memory barrier involvement:
+ *
+ * If prb_first_seq:B reads from desc_reserve:F, then
+ * prb_first_seq:A reads from desc_push_tail:B.
+ *
+ * Relies on:
+ *
+ * MB from desc_push_tail:B to desc_reserve:F
+ * matching
+ * RMB prb_first_seq:B to prb_first_seq:A
+ */
+ smp_rmb(); /* LMM(prb_first_seq:C) */
+ }
+
+ return seq;
+}
+
+/*
+ * Non-blocking read of a record. Updates @seq to the last finalized record
+ * (which may have no data available).
+ *
+ * See the description of prb_read_valid() and prb_read_valid_info()
+ * for details.
+ */
+static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
+ struct printk_record *r, unsigned int *line_count)
+{
+ u64 tail_seq;
+ int err;
+
+ while ((err = prb_read(rb, *seq, r, line_count))) {
+ tail_seq = prb_first_seq(rb);
+
+ if (*seq < tail_seq) {
+ /*
+ * Behind the tail. Catch up and try again. This
+ * can happen for -ENOENT and -EINVAL cases.
+ */
+ *seq = tail_seq;
+
+ } else if (err == -ENOENT) {
+ /* Record exists, but no data available. Skip. */
+ (*seq)++;
+
+ } else {
+ /* Non-existent/non-finalized record. Must stop. */
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * prb_read_valid() - Non-blocking read of a requested record or (if gone)
+ * the next available record.
+ *
+ * @rb: The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @r: A record data buffer to store the read record to.
+ *
+ * This is the public function available to readers to read a record.
+ *
+ * The reader provides the @info and @text_buf buffers of @r to be
+ * filled in. Any of the buffer pointers can be set to NULL if the reader
+ * is not interested in that data. To ensure proper initialization of @r,
+ * prb_rec_init_rd() should be used.
+ *
+ * Context: Any context.
+ * Return: true if a record was read, otherwise false.
+ *
+ * On success, the reader must check r->info.seq to see which record was
+ * actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a not yet written record.
+ */
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r)
+{
+ return _prb_read_valid(rb, &seq, r, NULL);
+}
+
+/**
+ * prb_read_valid_info() - Non-blocking read of meta data for a requested
+ * record or (if gone) the next available record.
+ *
+ * @rb: The ringbuffer to read from.
+ * @seq: The sequence number of the record to read.
+ * @info: A buffer to store the read record meta data to.
+ * @line_count: A buffer to store the number of lines in the record text.
+ *
+ * This is the public function available to readers to read only the
+ * meta data of a record.
+ *
+ * The reader provides the @info, @line_count buffers to be filled in.
+ * Either of the buffer pointers can be set to NULL if the reader is not
+ * interested in that data.
+ *
+ * Context: Any context.
+ * Return: true if a record's meta data was read, otherwise false.
+ *
+ * On success, the reader must check info->seq to see which record meta data
+ * was actually read. This allows the reader to detect dropped records.
+ *
+ * Failure means @seq refers to a not yet written record.
+ */
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_info *info, unsigned int *line_count)
+{
+ struct printk_record r;
+
+ prb_rec_init_rd(&r, info, NULL, 0);
+
+ return _prb_read_valid(rb, &seq, &r, line_count);
+}
+
+/**
+ * prb_first_valid_seq() - Get the sequence number of the oldest available
+ * record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the
+ * first/oldest valid sequence number is.
+ *
+ * This provides readers a starting point to begin iterating the ringbuffer.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the first/oldest record or, if the
+ * ringbuffer is empty, 0 is returned.
+ */
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
+{
+ u64 seq = 0;
+
+ if (!_prb_read_valid(rb, &seq, NULL, NULL))
+ return 0;
+
+ return seq;
+}
+
+/**
+ * prb_next_seq() - Get the sequence number after the last available record.
+ *
+ * @rb: The ringbuffer to get the sequence number from.
+ *
+ * This is the public function available to readers to see what the next
+ * newest sequence number available to readers will be.
+ *
+ * This provides readers a sequence number to jump to if all currently
+ * available records should be skipped.
+ *
+ * Context: Any context.
+ * Return: The sequence number of the next newest (not yet available) record
+ * for readers.
+ */
+u64 prb_next_seq(struct printk_ringbuffer *rb)
+{
+ u64 seq = 0;
+
+ /* Search forward from the oldest descriptor. */
+ while (_prb_read_valid(rb, &seq, NULL, NULL))
+ seq++;
+
+ return seq;
+}
+
+/**
+ * prb_init() - Initialize a ringbuffer to use provided external buffers.
+ *
+ * @rb: The ringbuffer to initialize.
+ * @text_buf: The data buffer for text data.
+ * @textbits: The size of @text_buf as a power-of-2 value.
+ * @descs: The descriptor buffer for ringbuffer records.
+ * @descbits: The count of @descs items as a power-of-2 value.
+ * @infos: The printk_info buffer for ringbuffer records.
+ *
+ * This is the public function available to writers to setup a ringbuffer
+ * during runtime using provided buffers.
+ *
+ * This must match the initialization of DEFINE_PRINTKRB().
+ *
+ * Context: Any context.
+ */
+void prb_init(struct printk_ringbuffer *rb,
+ char *text_buf, unsigned int textbits,
+ struct prb_desc *descs, unsigned int descbits,
+ struct printk_info *infos)
+{
+ memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0]));
+ memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0]));
+
+ rb->desc_ring.count_bits = descbits;
+ rb->desc_ring.descs = descs;
+ rb->desc_ring.infos = infos;
+ atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
+ atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
+
+ rb->text_data_ring.size_bits = textbits;
+ rb->text_data_ring.data = text_buf;
+ atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
+ atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));
+
+ atomic_long_set(&rb->fail, 0);
+
+ atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits));
+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS;
+ descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS;
+
+ infos[0].seq = -(u64)_DESCS_COUNT(descbits);
+ infos[_DESCS_COUNT(descbits) - 1].seq = 0;
+}
+
+/**
+ * prb_record_text_space() - Query the full actual used ringbuffer space for
+ * the text data of a reserved entry.
+ *
+ * @e: The successfully reserved entry to query.
+ *
+ * This is the public function available to writers to see how much actual
+ * space is used in the ringbuffer to store the text data of the specified
+ * entry.
+ *
+ * This function is only valid if @e has been successfully reserved using
+ * prb_reserve().
+ *
+ * Context: Any context.
+ * Return: The size in bytes used by the text data of the associated record.
+ */
+unsigned int prb_record_text_space(struct prb_reserved_entry *e)
+{
+ return e->text_space;
+}
diff --git a/kernel/printk/printk_ringbuffer.h b/kernel/printk/printk_ringbuffer.h
new file mode 100644
index 000000000000..5dc9d022db07
--- /dev/null
+++ b/kernel/printk/printk_ringbuffer.h
@@ -0,0 +1,382 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _KERNEL_PRINTK_RINGBUFFER_H
+#define _KERNEL_PRINTK_RINGBUFFER_H
+
+#include <linux/atomic.h>
+#include <linux/dev_printk.h>
+
+/*
+ * Meta information about each stored message.
+ *
+ * All fields are set by the printk code except for @seq, which is
+ * set by the ringbuffer code.
+ */
+struct printk_info {
+ u64 seq; /* sequence number */
+ u64 ts_nsec; /* timestamp in nanoseconds */
+ u16 text_len; /* length of text message */
+ u8 facility; /* syslog facility */
+ u8 flags:5; /* internal record flags */
+ u8 level:3; /* syslog level */
+ u32 caller_id; /* thread id or processor id */
+
+ struct dev_printk_info dev_info;
+};
+
+/*
+ * A structure providing the buffers, used by writers and readers.
+ *
+ * Writers:
+ * Using prb_rec_init_wr(), a writer sets @text_buf_size before calling
+ * prb_reserve(). On success, prb_reserve() sets @info and @text_buf to
+ * buffers reserved for that writer.
+ *
+ * Readers:
+ * Using prb_rec_init_rd(), a reader sets all fields before calling
+ * prb_read_valid(). Note that the reader provides the @info and @text_buf,
+ * buffers. On success, the struct pointed to by @info will be filled and
+ * the char array pointed to by @text_buf will be filled with text data.
+ */
+struct printk_record {
+ struct printk_info *info;
+ char *text_buf;
+ unsigned int text_buf_size;
+};
+
+/* Specifies the logical position and span of a data block. */
+struct prb_data_blk_lpos {
+ unsigned long begin;
+ unsigned long next;
+};
+
+/*
+ * A descriptor: the complete meta-data for a record.
+ *
+ * @state_var: A bitwise combination of descriptor ID and descriptor state.
+ */
+struct prb_desc {
+ atomic_long_t state_var;
+ struct prb_data_blk_lpos text_blk_lpos;
+};
+
+/* A ringbuffer of "ID + data" elements. */
+struct prb_data_ring {
+ unsigned int size_bits;
+ char *data;
+ atomic_long_t head_lpos;
+ atomic_long_t tail_lpos;
+};
+
+/* A ringbuffer of "struct prb_desc" elements. */
+struct prb_desc_ring {
+ unsigned int count_bits;
+ struct prb_desc *descs;
+ struct printk_info *infos;
+ atomic_long_t head_id;
+ atomic_long_t tail_id;
+};
+
+/*
+ * The high level structure representing the printk ringbuffer.
+ *
+ * @fail: Count of failed prb_reserve() calls where not even a data-less
+ * record was created.
+ */
+struct printk_ringbuffer {
+ struct prb_desc_ring desc_ring;
+ struct prb_data_ring text_data_ring;
+ atomic_long_t fail;
+};
+
+/*
+ * Used by writers as a reserve/commit handle.
+ *
+ * @rb: Ringbuffer where the entry is reserved.
+ * @irqflags: Saved irq flags to restore on entry commit.
+ * @id: ID of the reserved descriptor.
+ * @text_space: Total occupied buffer space in the text data ring, including
+ * ID, alignment padding, and wrapping data blocks.
+ *
+ * This structure is an opaque handle for writers. Its contents are only
+ * to be used by the ringbuffer implementation.
+ */
+struct prb_reserved_entry {
+ struct printk_ringbuffer *rb;
+ unsigned long irqflags;
+ unsigned long id;
+ unsigned int text_space;
+};
+
+/* The possible responses of a descriptor state-query. */
+enum desc_state {
+ desc_miss = -1, /* ID mismatch (pseudo state) */
+ desc_reserved = 0x0, /* reserved, in use by writer */
+ desc_committed = 0x1, /* committed by writer, could get reopened */
+ desc_finalized = 0x2, /* committed, no further modification allowed */
+ desc_reusable = 0x3, /* free, not yet used by any writer */
+};
+
+#define _DATA_SIZE(sz_bits) (1UL << (sz_bits))
+#define _DESCS_COUNT(ct_bits) (1U << (ct_bits))
+#define DESC_SV_BITS (sizeof(unsigned long) * 8)
+#define DESC_FLAGS_SHIFT (DESC_SV_BITS - 2)
+#define DESC_FLAGS_MASK (3UL << DESC_FLAGS_SHIFT)
+#define DESC_STATE(sv) (3UL & (sv >> DESC_FLAGS_SHIFT))
+#define DESC_SV(id, state) (((unsigned long)state << DESC_FLAGS_SHIFT) | id)
+#define DESC_ID_MASK (~DESC_FLAGS_MASK)
+#define DESC_ID(sv) ((sv) & DESC_ID_MASK)
+#define FAILED_LPOS 0x1
+#define NO_LPOS 0x3
+
+#define FAILED_BLK_LPOS \
+{ \
+ .begin = FAILED_LPOS, \
+ .next = FAILED_LPOS, \
+}
+
+/*
+ * Descriptor Bootstrap
+ *
+ * The descriptor array is minimally initialized to allow immediate usage
+ * by readers and writers. The requirements that the descriptor array
+ * initialization must satisfy:
+ *
+ * Req1
+ * The tail must point to an existing (committed or reusable) descriptor.
+ * This is required by the implementation of prb_first_seq().
+ *
+ * Req2
+ * Readers must see that the ringbuffer is initially empty.
+ *
+ * Req3
+ * The first record reserved by a writer is assigned sequence number 0.
+ *
+ * To satisfy Req1, the tail initially points to a descriptor that is
+ * minimally initialized (having no data block, i.e. data-less with the
+ * data block's lpos @begin and @next values set to FAILED_LPOS).
+ *
+ * To satisfy Req2, the initial tail descriptor is initialized to the
+ * reusable state. Readers recognize reusable descriptors as existing
+ * records, but skip over them.
+ *
+ * To satisfy Req3, the last descriptor in the array is used as the initial
+ * head (and tail) descriptor. This allows the first record reserved by a
+ * writer (head + 1) to be the first descriptor in the array. (Only the first
+ * descriptor in the array could have a valid sequence number of 0.)
+ *
+ * The first time a descriptor is reserved, it is assigned a sequence number
+ * with the value of the array index. A "first time reserved" descriptor can
+ * be recognized because it has a sequence number of 0 but does not have an
+ * index of 0. (Only the first descriptor in the array could have a valid
+ * sequence number of 0.) After the first reservation, all future reservations
+ * (recycling) simply involve incrementing the sequence number by the array
+ * count.
+ *
+ * Hack #1
+ * Only the first descriptor in the array is allowed to have the sequence
+ * number 0. In this case it is not possible to recognize if it is being
+ * reserved the first time (set to index value) or has been reserved
+ * previously (increment by the array count). This is handled by _always_
+ * incrementing the sequence number by the array count when reserving the
+ * first descriptor in the array. In order to satisfy Req3, the sequence
+ * number of the first descriptor in the array is initialized to minus
+ * the array count. Then, upon the first reservation, it is incremented
+ * to 0, thus satisfying Req3.
+ *
+ * Hack #2
+ * prb_first_seq() can be called at any time by readers to retrieve the
+ * sequence number of the tail descriptor. However, due to Req2 and Req3,
+ * initially there are no records to report the sequence number of
+ * (sequence numbers are u64 and there is nothing less than 0). To handle
+ * this, the sequence number of the initial tail descriptor is initialized
+ * to 0. Technically this is incorrect, because there is no record with
+ * sequence number 0 (yet) and the tail descriptor is not the first
+ * descriptor in the array. But it allows prb_read_valid() to correctly
+ * report the existence of a record for _any_ given sequence number at all
+ * times. Bootstrapping is complete when the tail is pushed the first
+ * time, thus finally pointing to the first descriptor reserved by a
+ * writer, which has the assigned sequence number 0.
+ */
+
+/*
+ * Initiating Logical Value Overflows
+ *
+ * Both logical position (lpos) and ID values can be mapped to array indexes
+ * but may experience overflows during the lifetime of the system. To ensure
+ * that printk_ringbuffer can handle the overflows for these types, initial
+ * values are chosen that map to the correct initial array indexes, but will
+ * result in overflows soon.
+ *
+ * BLK0_LPOS
+ * The initial @head_lpos and @tail_lpos for data rings. It is at index
+ * 0 and the lpos value is such that it will overflow on the first wrap.
+ *
+ * DESC0_ID
+ * The initial @head_id and @tail_id for the desc ring. It is at the last
+ * index of the descriptor array (see Req3 above) and the ID value is such
+ * that it will overflow on the second wrap.
+ */
+#define BLK0_LPOS(sz_bits) (-(_DATA_SIZE(sz_bits)))
+#define DESC0_ID(ct_bits) DESC_ID(-(_DESCS_COUNT(ct_bits) + 1))
+#define DESC0_SV(ct_bits) DESC_SV(DESC0_ID(ct_bits), desc_reusable)
+
+/*
+ * Define a ringbuffer with an external text data buffer. The same as
+ * DEFINE_PRINTKRB() but requires specifying an external buffer for the
+ * text data.
+ *
+ * Note: The specified external buffer must be of the size:
+ * 2 ^ (descbits + avgtextbits)
+ */
+#define _DEFINE_PRINTKRB(name, descbits, avgtextbits, text_buf) \
+static struct prb_desc _##name##_descs[_DESCS_COUNT(descbits)] = { \
+ /* the initial head and tail */ \
+ [_DESCS_COUNT(descbits) - 1] = { \
+ /* reusable */ \
+ .state_var = ATOMIC_INIT(DESC0_SV(descbits)), \
+ /* no associated data block */ \
+ .text_blk_lpos = FAILED_BLK_LPOS, \
+ }, \
+}; \
+static struct printk_info _##name##_infos[_DESCS_COUNT(descbits)] = { \
+ /* this will be the first record reserved by a writer */ \
+ [0] = { \
+ /* will be incremented to 0 on the first reservation */ \
+ .seq = -(u64)_DESCS_COUNT(descbits), \
+ }, \
+ /* the initial head and tail */ \
+ [_DESCS_COUNT(descbits) - 1] = { \
+ /* reports the first seq value during the bootstrap phase */ \
+ .seq = 0, \
+ }, \
+}; \
+static struct printk_ringbuffer name = { \
+ .desc_ring = { \
+ .count_bits = descbits, \
+ .descs = &_##name##_descs[0], \
+ .infos = &_##name##_infos[0], \
+ .head_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ .tail_id = ATOMIC_INIT(DESC0_ID(descbits)), \
+ }, \
+ .text_data_ring = { \
+ .size_bits = (avgtextbits) + (descbits), \
+ .data = text_buf, \
+ .head_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
+ .tail_lpos = ATOMIC_LONG_INIT(BLK0_LPOS((avgtextbits) + (descbits))), \
+ }, \
+ .fail = ATOMIC_LONG_INIT(0), \
+}
+
+/**
+ * DEFINE_PRINTKRB() - Define a ringbuffer.
+ *
+ * @name: The name of the ringbuffer variable.
+ * @descbits: The number of descriptors as a power-of-2 value.
+ * @avgtextbits: The average text data size per record as a power-of-2 value.
+ *
+ * This is a macro for defining a ringbuffer and all internal structures
+ * such that it is ready for immediate use. See _DEFINE_PRINTKRB() for a
+ * variant where the text data buffer can be specified externally.
+ */
+#define DEFINE_PRINTKRB(name, descbits, avgtextbits) \
+static char _##name##_text[1U << ((avgtextbits) + (descbits))] \
+ __aligned(__alignof__(unsigned long)); \
+_DEFINE_PRINTKRB(name, descbits, avgtextbits, &_##name##_text[0])
+
+/* Writer Interface */
+
+/**
+ * prb_rec_init_wd() - Initialize a buffer for writing records.
+ *
+ * @r: The record to initialize.
+ * @text_buf_size: The needed text buffer size.
+ */
+static inline void prb_rec_init_wr(struct printk_record *r,
+ unsigned int text_buf_size)
+{
+ r->info = NULL;
+ r->text_buf = NULL;
+ r->text_buf_size = text_buf_size;
+}
+
+bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r);
+bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
+ struct printk_record *r, u32 caller_id, unsigned int max_size);
+void prb_commit(struct prb_reserved_entry *e);
+void prb_final_commit(struct prb_reserved_entry *e);
+
+void prb_init(struct printk_ringbuffer *rb,
+ char *text_buf, unsigned int text_buf_size,
+ struct prb_desc *descs, unsigned int descs_count_bits,
+ struct printk_info *infos);
+unsigned int prb_record_text_space(struct prb_reserved_entry *e);
+
+/* Reader Interface */
+
+/**
+ * prb_rec_init_rd() - Initialize a buffer for reading records.
+ *
+ * @r: The record to initialize.
+ * @info: A buffer to store record meta-data.
+ * @text_buf: A buffer to store text data.
+ * @text_buf_size: The size of @text_buf.
+ *
+ * Initialize all the fields that a reader is interested in. All arguments
+ * (except @r) are optional. Only record data for arguments that are
+ * non-NULL or non-zero will be read.
+ */
+static inline void prb_rec_init_rd(struct printk_record *r,
+ struct printk_info *info,
+ char *text_buf, unsigned int text_buf_size)
+{
+ r->info = info;
+ r->text_buf = text_buf;
+ r->text_buf_size = text_buf_size;
+}
+
+/**
+ * prb_for_each_record() - Iterate over the records of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb: The ringbuffer to iterate over.
+ * @s: A u64 to store the sequence number on each iteration.
+ * @r: A printk_record to store the record on each iteration.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @r->info->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_record(from, rb, s, r) \
+for ((s) = from; prb_read_valid(rb, s, r); (s) = (r)->info->seq + 1)
+
+/**
+ * prb_for_each_info() - Iterate over the meta data of a ringbuffer.
+ *
+ * @from: The sequence number to begin with.
+ * @rb: The ringbuffer to iterate over.
+ * @s: A u64 to store the sequence number on each iteration.
+ * @i: A printk_info to store the record meta data on each iteration.
+ * @lc: An unsigned int to store the text line count of each record.
+ *
+ * This is a macro for conveniently iterating over a ringbuffer.
+ * Note that @s may not be the sequence number of the record on each
+ * iteration. For the sequence number, @r->info->seq should be checked.
+ *
+ * Context: Any context.
+ */
+#define prb_for_each_info(from, rb, s, i, lc) \
+for ((s) = from; prb_read_valid_info(rb, s, i, lc); (s) = (i)->seq + 1)
+
+bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_record *r);
+bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
+ struct printk_info *info, unsigned int *line_count);
+
+u64 prb_first_valid_seq(struct printk_ringbuffer *rb);
+u64 prb_next_seq(struct printk_ringbuffer *rb);
+
+#endif /* _KERNEL_PRINTK_RINGBUFFER_H */
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 50aeae770434..5dbc40160990 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -375,7 +375,7 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args)
raw_spin_trylock(&logbuf_lock)) {
int len;
- len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, 0, fmt, args);
+ len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
raw_spin_unlock(&logbuf_lock);
defer_console_output();
return len;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 676d4af62103..8ad7a293255a 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -196,6 +196,10 @@ struct seccomp_filter {
*/
static void populate_seccomp_data(struct seccomp_data *sd)
{
+ /*
+ * Instead of using current_pt_reg(), we're already doing the work
+ * to safely fetch "current", so just use "task" everywhere below.
+ */
struct task_struct *task = current;
struct pt_regs *regs = task_pt_regs(task);
unsigned long args[6];
@@ -910,7 +914,7 @@ out:
if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE)
return 0;
- syscall_set_return_value(current, task_pt_regs(current),
+ syscall_set_return_value(current, current_pt_regs(),
err, ret);
return -1;
}
@@ -943,13 +947,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
/* Set low-order bits as an errno, capped at MAX_ERRNO. */
if (data > MAX_ERRNO)
data = MAX_ERRNO;
- syscall_set_return_value(current, task_pt_regs(current),
+ syscall_set_return_value(current, current_pt_regs(),
-data, 0);
goto skip;
case SECCOMP_RET_TRAP:
/* Show the handler the original registers. */
- syscall_rollback(current, task_pt_regs(current));
+ syscall_rollback(current, current_pt_regs());
/* Let the filter pass back 16 bits of data. */
seccomp_send_sigsys(this_syscall, data);
goto skip;
@@ -962,7 +966,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
/* ENOSYS these calls if there is no tracer attached. */
if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
syscall_set_return_value(current,
- task_pt_regs(current),
+ current_pt_regs(),
-ENOSYS, 0);
goto skip;
}
@@ -982,7 +986,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
if (fatal_signal_pending(current))
goto skip;
/* Check if the tracer forced the syscall to be skipped. */
- this_syscall = syscall_get_nr(current, task_pt_regs(current));
+ this_syscall = syscall_get_nr(current, current_pt_regs());
if (this_syscall < 0)
goto skip;
@@ -1020,20 +1024,20 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
default:
seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
- if (action == SECCOMP_RET_KILL_PROCESS ||
+ if (action != SECCOMP_RET_KILL_THREAD ||
get_nr_threads(current) == 1) {
kernel_siginfo_t info;
/* Show the original registers in the dump. */
- syscall_rollback(current, task_pt_regs(current));
+ syscall_rollback(current, current_pt_regs());
/* Trigger a manual coredump since do_exit skips it. */
seccomp_init_siginfo(&info, this_syscall, data);
do_coredump(&info);
}
- if (action == SECCOMP_RET_KILL_PROCESS)
- do_group_exit(SIGSYS);
- else
+ if (action == SECCOMP_RET_KILL_THREAD)
do_exit(SIGSYS);
+ else
+ do_group_exit(SIGSYS);
}
unreachable();
@@ -1060,7 +1064,7 @@ int __secure_computing(const struct seccomp_data *sd)
return 0;
this_syscall = sd ? sd->nr :
- syscall_get_nr(current, task_pt_regs(current));
+ syscall_get_nr(current, current_pt_regs());
switch (mode) {
case SECCOMP_MODE_STRICT:
@@ -1472,13 +1476,7 @@ static const struct file_operations seccomp_notify_ops = {
static struct file *init_listener(struct seccomp_filter *filter)
{
- struct file *ret = ERR_PTR(-EBUSY);
- struct seccomp_filter *cur;
-
- for (cur = current->seccomp.filter; cur; cur = cur->prev) {
- if (cur->notif)
- goto out;
- }
+ struct file *ret;
ret = ERR_PTR(-ENOMEM);
filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
@@ -1504,6 +1502,31 @@ out:
return ret;
}
+/*
+ * Does @new_child have a listener while an ancestor also has a listener?
+ * If so, we'll want to reject this filter.
+ * This only has to be tested for the current process, even in the TSYNC case,
+ * because TSYNC installs @child with the same parent on all threads.
+ * Note that @new_child is not hooked up to its parent at this point yet, so
+ * we use current->seccomp.filter.
+ */
+static bool has_duplicate_listener(struct seccomp_filter *new_child)
+{
+ struct seccomp_filter *cur;
+
+ /* must be protected against concurrent TSYNC */
+ lockdep_assert_held(&current->sighand->siglock);
+
+ if (!new_child->notif)
+ return false;
+ for (cur = current->seccomp.filter; cur; cur = cur->prev) {
+ if (cur->notif)
+ return true;
+ }
+
+ return false;
+}
+
/**
* seccomp_set_mode_filter: internal function for setting seccomp filter
* @flags: flags to change filter behavior
@@ -1575,6 +1598,11 @@ static long seccomp_set_mode_filter(unsigned int flags,
if (!seccomp_may_assign_mode(seccomp_mode))
goto out;
+ if (has_duplicate_listener(prepared)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
ret = seccomp_attach_filter(flags, prepared);
if (ret)
goto out;
diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py
index 2fa7bb83885f..a92c55bd8de5 100644
--- a/scripts/gdb/linux/dmesg.py
+++ b/scripts/gdb/linux/dmesg.py
@@ -16,8 +16,13 @@ import sys
from linux import utils
-printk_log_type = utils.CachedType("struct printk_log")
-
+printk_info_type = utils.CachedType("struct printk_info")
+prb_data_blk_lpos_type = utils.CachedType("struct prb_data_blk_lpos")
+prb_desc_type = utils.CachedType("struct prb_desc")
+prb_desc_ring_type = utils.CachedType("struct prb_desc_ring")
+prb_data_ring_type = utils.CachedType("struct prb_data_ring")
+printk_ringbuffer_type = utils.CachedType("struct printk_ringbuffer")
+atomic_long_type = utils.CachedType("atomic_long_t")
class LxDmesg(gdb.Command):
"""Print Linux kernel log buffer."""
@@ -26,44 +31,110 @@ class LxDmesg(gdb.Command):
super(LxDmesg, self).__init__("lx-dmesg", gdb.COMMAND_DATA)
def invoke(self, arg, from_tty):
- log_buf_addr = int(str(gdb.parse_and_eval(
- "(void *)'printk.c'::log_buf")).split()[0], 16)
- log_first_idx = int(gdb.parse_and_eval("'printk.c'::log_first_idx"))
- log_next_idx = int(gdb.parse_and_eval("'printk.c'::log_next_idx"))
- log_buf_len = int(gdb.parse_and_eval("'printk.c'::log_buf_len"))
-
inf = gdb.inferiors()[0]
- start = log_buf_addr + log_first_idx
- if log_first_idx < log_next_idx:
- log_buf_2nd_half = -1
- length = log_next_idx - log_first_idx
- log_buf = utils.read_memoryview(inf, start, length).tobytes()
- else:
- log_buf_2nd_half = log_buf_len - log_first_idx
- a = utils.read_memoryview(inf, start, log_buf_2nd_half)
- b = utils.read_memoryview(inf, log_buf_addr, log_next_idx)
- log_buf = a.tobytes() + b.tobytes()
-
- length_offset = printk_log_type.get_type()['len'].bitpos // 8
- text_len_offset = printk_log_type.get_type()['text_len'].bitpos // 8
- time_stamp_offset = printk_log_type.get_type()['ts_nsec'].bitpos // 8
- text_offset = printk_log_type.get_type().sizeof
-
- pos = 0
- while pos < log_buf.__len__():
- length = utils.read_u16(log_buf, pos + length_offset)
- if length == 0:
- if log_buf_2nd_half == -1:
- gdb.write("Corrupted log buffer!\n")
+
+ # read in prb structure
+ prb_addr = int(str(gdb.parse_and_eval("(void *)'printk.c'::prb")).split()[0], 16)
+ sz = printk_ringbuffer_type.get_type().sizeof
+ prb = utils.read_memoryview(inf, prb_addr, sz).tobytes()
+
+ # read in descriptor ring structure
+ off = printk_ringbuffer_type.get_type()['desc_ring'].bitpos // 8
+ addr = prb_addr + off
+ sz = prb_desc_ring_type.get_type().sizeof
+ desc_ring = utils.read_memoryview(inf, addr, sz).tobytes()
+
+ # read in descriptor array
+ off = prb_desc_ring_type.get_type()['count_bits'].bitpos // 8
+ desc_ring_count = 1 << utils.read_u32(desc_ring, off)
+ desc_sz = prb_desc_type.get_type().sizeof
+ off = prb_desc_ring_type.get_type()['descs'].bitpos // 8
+ addr = utils.read_ulong(desc_ring, off)
+ descs = utils.read_memoryview(inf, addr, desc_sz * desc_ring_count).tobytes()
+
+ # read in info array
+ info_sz = printk_info_type.get_type().sizeof
+ off = prb_desc_ring_type.get_type()['infos'].bitpos // 8
+ addr = utils.read_ulong(desc_ring, off)
+ infos = utils.read_memoryview(inf, addr, info_sz * desc_ring_count).tobytes()
+
+ # read in text data ring structure
+ off = printk_ringbuffer_type.get_type()['text_data_ring'].bitpos // 8
+ addr = prb_addr + off
+ sz = prb_data_ring_type.get_type().sizeof
+ text_data_ring = utils.read_memoryview(inf, addr, sz).tobytes()
+
+ # read in text data
+ off = prb_data_ring_type.get_type()['size_bits'].bitpos // 8
+ text_data_sz = 1 << utils.read_u32(text_data_ring, off)
+ off = prb_data_ring_type.get_type()['data'].bitpos // 8
+ addr = utils.read_ulong(text_data_ring, off)
+ text_data = utils.read_memoryview(inf, addr, text_data_sz).tobytes()
+
+ counter_off = atomic_long_type.get_type()['counter'].bitpos // 8
+
+ sv_off = prb_desc_type.get_type()['state_var'].bitpos // 8
+
+ off = prb_desc_type.get_type()['text_blk_lpos'].bitpos // 8
+ begin_off = off + (prb_data_blk_lpos_type.get_type()['begin'].bitpos // 8)
+ next_off = off + (prb_data_blk_lpos_type.get_type()['next'].bitpos // 8)
+
+ ts_off = printk_info_type.get_type()['ts_nsec'].bitpos // 8
+ len_off = printk_info_type.get_type()['text_len'].bitpos // 8
+
+ # definitions from kernel/printk/printk_ringbuffer.h
+ desc_committed = 1
+ desc_finalized = 2
+ desc_sv_bits = utils.get_long_type().sizeof * 8
+ desc_flags_shift = desc_sv_bits - 2
+ desc_flags_mask = 3 << desc_flags_shift
+ desc_id_mask = ~desc_flags_mask
+
+ # read in tail and head descriptor ids
+ off = prb_desc_ring_type.get_type()['tail_id'].bitpos // 8
+ tail_id = utils.read_u64(desc_ring, off + counter_off)
+ off = prb_desc_ring_type.get_type()['head_id'].bitpos // 8
+ head_id = utils.read_u64(desc_ring, off + counter_off)
+
+ did = tail_id
+ while True:
+ ind = did % desc_ring_count
+ desc_off = desc_sz * ind
+ info_off = info_sz * ind
+
+ # skip non-committed record
+ state = 3 & (utils.read_u64(descs, desc_off + sv_off +
+ counter_off) >> desc_flags_shift)
+ if state != desc_committed and state != desc_finalized:
+ if did == head_id:
break
- pos = log_buf_2nd_half
+ did = (did + 1) & desc_id_mask
continue
- text_len = utils.read_u16(log_buf, pos + text_len_offset)
- text_start = pos + text_offset
- text = log_buf[text_start:text_start + text_len].decode(
- encoding='utf8', errors='replace')
- time_stamp = utils.read_u64(log_buf, pos + time_stamp_offset)
+ begin = utils.read_ulong(descs, desc_off + begin_off) % text_data_sz
+ end = utils.read_ulong(descs, desc_off + next_off) % text_data_sz
+
+ # handle data-less record
+ if begin & 1 == 1:
+ text = ""
+ else:
+ # handle wrapping data block
+ if begin > end:
+ begin = 0
+
+ # skip over descriptor id
+ text_start = begin + utils.get_long_type().sizeof
+
+ text_len = utils.read_u16(infos, info_off + len_off)
+
+ # handle truncated message
+ if end - text_start < text_len:
+ text_len = end - text_start
+
+ text = text_data[text_start:text_start + text_len].decode(
+ encoding='utf8', errors='replace')
+
+ time_stamp = utils.read_u64(infos, info_off + ts_off)
for line in text.splitlines():
msg = u"[{time:12.6f}] {line}\n".format(
@@ -75,7 +146,9 @@ class LxDmesg(gdb.Command):
msg = msg.encode(encoding='utf8', errors='replace')
gdb.write(msg)
- pos += length
+ if did == head_id:
+ break
+ did = (did + 1) & desc_id_mask
LxDmesg()
diff --git a/scripts/gdb/linux/utils.py b/scripts/gdb/linux/utils.py
index ea94221dbd39..ff7c1799d588 100644
--- a/scripts/gdb/linux/utils.py
+++ b/scripts/gdb/linux/utils.py
@@ -123,6 +123,13 @@ def read_u64(buffer, offset):
return read_u32(buffer, offset + 4) + (read_u32(buffer, offset) << 32)
+def read_ulong(buffer, offset):
+ if get_long_type().sizeof == 8:
+ return read_u64(buffer, offset)
+ else:
+ return read_u32(buffer, offset)
+
+
target_arch = None
diff --git a/scripts/selinux/mdp/mdp.c b/scripts/selinux/mdp/mdp.c
index 6ceb88eb9b59..105c1c31a316 100644
--- a/scripts/selinux/mdp/mdp.c
+++ b/scripts/selinux/mdp/mdp.c
@@ -35,6 +35,9 @@ struct security_class_mapping {
#include "classmap.h"
#include "initial_sid_to_string.h"
+#include "policycap_names.h"
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
int main(int argc, char *argv[])
{
@@ -115,6 +118,10 @@ int main(int argc, char *argv[])
}
}
+ /* enable all policy capabilities */
+ for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++)
+ fprintf(fout, "policycap %s;\n", selinux_policycap_names[i]);
+
/* types, roles, and allows */
fprintf(fout, "type base_t;\n");
fprintf(fout, "role base_r;\n");
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index d18cb32a242a..3c05827608b6 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -31,6 +31,9 @@
#include "avc_ss.h"
#include "classmap.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/avc.h>
+
#define AVC_CACHE_SLOTS 512
#define AVC_DEF_CACHE_THRESHOLD 512
#define AVC_CACHE_RECLAIM 16
@@ -702,33 +705,37 @@ static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
{
struct common_audit_data *ad = a;
struct selinux_audit_data *sad = ad->selinux_audit_data;
- char *scontext;
+ char *scontext = NULL;
+ char *tcontext = NULL;
+ const char *tclass = NULL;
u32 scontext_len;
+ u32 tcontext_len;
int rc;
rc = security_sid_to_context(sad->state, sad->ssid, &scontext,
&scontext_len);
if (rc)
audit_log_format(ab, " ssid=%d", sad->ssid);
- else {
+ else
audit_log_format(ab, " scontext=%s", scontext);
- kfree(scontext);
- }
- rc = security_sid_to_context(sad->state, sad->tsid, &scontext,
- &scontext_len);
+ rc = security_sid_to_context(sad->state, sad->tsid, &tcontext,
+ &tcontext_len);
if (rc)
audit_log_format(ab, " tsid=%d", sad->tsid);
- else {
- audit_log_format(ab, " tcontext=%s", scontext);
- kfree(scontext);
- }
+ else
+ audit_log_format(ab, " tcontext=%s", tcontext);
- audit_log_format(ab, " tclass=%s", secclass_map[sad->tclass-1].name);
+ tclass = secclass_map[sad->tclass-1].name;
+ audit_log_format(ab, " tclass=%s", tclass);
if (sad->denied)
audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1);
+ trace_selinux_audited(sad, scontext, tcontext, tclass);
+ kfree(tcontext);
+ kfree(scontext);
+
/* in case of invalid context report also the actual context string */
rc = security_sid_to_context_inval(sad->state, sad->ssid, &scontext,
&scontext_len);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a340986aa92e..2dabd58b126a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1978,7 +1978,7 @@ static inline u32 file_to_av(struct file *file)
}
/*
- * Convert a file to an access vector and include the correct open
+ * Convert a file to an access vector and include the correct
* open permission.
*/
static inline u32 open_file_to_av(struct file *file)
@@ -3271,6 +3271,9 @@ static int selinux_inode_removexattr(struct dentry *dentry, const char *name)
return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
}
+ if (!selinux_initialized(&selinux_state))
+ return 0;
+
/* No one is allowed to remove a SELinux security label.
You can change the label, but all data must be labeled. */
return -EACCES;
@@ -3709,7 +3712,7 @@ static int selinux_mmap_file(struct file *file, unsigned long reqprot,
return rc;
}
- if (selinux_state.checkreqprot)
+ if (checkreqprot_get(&selinux_state))
prot = reqprot;
return file_map_prot_check(file, prot,
@@ -3723,7 +3726,7 @@ static int selinux_file_mprotect(struct vm_area_struct *vma,
const struct cred *cred = current_cred();
u32 sid = cred_sid(cred);
- if (selinux_state.checkreqprot)
+ if (checkreqprot_get(&selinux_state))
prot = reqprot;
if (default_noexec &&
@@ -4438,7 +4441,7 @@ static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
*
* If @skb_sid is valid then the user:role:type information from @sk_sid is
* combined with the MLS information from @skb_sid in order to create
- * @conn_sid. If @skb_sid is not valid then then @conn_sid is simply a copy
+ * @conn_sid. If @skb_sid is not valid then @conn_sid is simply a copy
* of @sk_sid. Returns zero on success, negative values on failure.
*
*/
@@ -5308,7 +5311,7 @@ static int selinux_sctp_bind_connect(struct sock *sk, int optname,
/* As selinux_sctp_bind_connect() is called by the
* SCTP protocol layer, the socket is already locked,
- * therefore selinux_netlbl_socket_connect_locked() is
+ * therefore selinux_netlbl_socket_connect_locked()
* is called here. The situations handled are:
* sctp_connectx(3), sctp_sendmsg(3), sendmsg(2),
* whenever a new IP address is added or when a new
@@ -7225,10 +7228,10 @@ static __init int selinux_init(void)
memset(&selinux_state, 0, sizeof(selinux_state));
enforcing_set(&selinux_state, selinux_enforcing_boot);
- selinux_state.checkreqprot = selinux_checkreqprot_boot;
- selinux_ss_init(&selinux_state.ss);
+ checkreqprot_set(&selinux_state, selinux_checkreqprot_boot);
selinux_avc_init(&selinux_state.avc);
mutex_init(&selinux_state.status_lock);
+ mutex_init(&selinux_state.policy_mutex);
/* Set the security state for the initial task. */
cred_init_security();
diff --git a/security/selinux/include/conditional.h b/security/selinux/include/conditional.h
index 539ab357707d..b09343346e3f 100644
--- a/security/selinux/include/conditional.h
+++ b/security/selinux/include/conditional.h
@@ -13,7 +13,7 @@
#include "security.h"
-int security_get_bools(struct selinux_state *state,
+int security_get_bools(struct selinux_policy *policy,
u32 *len, char ***names, int **values);
int security_set_bools(struct selinux_state *state, u32 len, int *values);
diff --git a/security/selinux/include/policycap.h b/security/selinux/include/policycap.h
new file mode 100644
index 000000000000..2ec038efbb03
--- /dev/null
+++ b/security/selinux/include/policycap.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SELINUX_POLICYCAP_H_
+#define _SELINUX_POLICYCAP_H_
+
+/* Policy capabilities */
+enum {
+ POLICYDB_CAPABILITY_NETPEER,
+ POLICYDB_CAPABILITY_OPENPERM,
+ POLICYDB_CAPABILITY_EXTSOCKCLASS,
+ POLICYDB_CAPABILITY_ALWAYSNETWORK,
+ POLICYDB_CAPABILITY_CGROUPSECLABEL,
+ POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION,
+ POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS,
+ __POLICYDB_CAPABILITY_MAX
+};
+#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1)
+
+extern const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX];
+
+#endif /* _SELINUX_POLICYCAP_H_ */
diff --git a/security/selinux/include/policycap_names.h b/security/selinux/include/policycap_names.h
new file mode 100644
index 000000000000..b89289f092c9
--- /dev/null
+++ b/security/selinux/include/policycap_names.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SELINUX_POLICYCAP_NAMES_H_
+#define _SELINUX_POLICYCAP_NAMES_H_
+
+#include "policycap.h"
+
+/* Policy capability names */
+const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX] = {
+ "network_peer_controls",
+ "open_perms",
+ "extended_socket_class",
+ "always_check_network",
+ "cgroup_seclabel",
+ "nnp_nosuid_transition",
+ "genfs_seclabel_symlinks"
+};
+
+#endif /* _SELINUX_POLICYCAP_NAMES_H_ */
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index b0e02cfe3ce1..3cc8bab31ea8 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -13,9 +13,11 @@
#include <linux/dcache.h>
#include <linux/magic.h>
#include <linux/types.h>
+#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include "flask.h"
+#include "policycap.h"
#define SECSID_NULL 0x00000000 /* unspecified SID */
#define SECSID_WILD 0xffffffff /* wildcard SID */
@@ -72,21 +74,6 @@ struct netlbl_lsm_secattr;
extern int selinux_enabled_boot;
-/* Policy capabilities */
-enum {
- POLICYDB_CAPABILITY_NETPEER,
- POLICYDB_CAPABILITY_OPENPERM,
- POLICYDB_CAPABILITY_EXTSOCKCLASS,
- POLICYDB_CAPABILITY_ALWAYSNETWORK,
- POLICYDB_CAPABILITY_CGROUPSECLABEL,
- POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION,
- POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS,
- __POLICYDB_CAPABILITY_MAX
-};
-#define POLICYDB_CAPABILITY_MAX (__POLICYDB_CAPABILITY_MAX - 1)
-
-extern const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX];
-
/*
* type_datum properties
* available at the kernel policy version >= POLICYDB_VERSION_BOUNDARY
@@ -98,7 +85,7 @@ extern const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX];
#define POLICYDB_BOUNDS_MAXDEPTH 4
struct selinux_avc;
-struct selinux_ss;
+struct selinux_policy;
struct selinux_state {
#ifdef CONFIG_SECURITY_SELINUX_DISABLE
@@ -115,10 +102,10 @@ struct selinux_state {
struct mutex status_lock;
struct selinux_avc *avc;
- struct selinux_ss *ss;
+ struct selinux_policy __rcu *policy;
+ struct mutex policy_mutex;
} __randomize_layout;
-void selinux_ss_init(struct selinux_ss **ss);
void selinux_avc_init(struct selinux_avc **avc);
extern struct selinux_state selinux_state;
@@ -156,6 +143,16 @@ static inline void enforcing_set(struct selinux_state *state, bool value)
}
#endif
+static inline bool checkreqprot_get(const struct selinux_state *state)
+{
+ return READ_ONCE(state->checkreqprot);
+}
+
+static inline void checkreqprot_set(struct selinux_state *state, bool value)
+{
+ WRITE_ONCE(state->checkreqprot, value);
+}
+
#ifdef CONFIG_SECURITY_SELINUX_DISABLE
static inline bool selinux_disabled(struct selinux_state *state)
{
@@ -177,57 +174,61 @@ static inline bool selinux_policycap_netpeer(void)
{
struct selinux_state *state = &selinux_state;
- return state->policycap[POLICYDB_CAPABILITY_NETPEER];
+ return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_NETPEER]);
}
static inline bool selinux_policycap_openperm(void)
{
struct selinux_state *state = &selinux_state;
- return state->policycap[POLICYDB_CAPABILITY_OPENPERM];
+ return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_OPENPERM]);
}
static inline bool selinux_policycap_extsockclass(void)
{
struct selinux_state *state = &selinux_state;
- return state->policycap[POLICYDB_CAPABILITY_EXTSOCKCLASS];
+ return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_EXTSOCKCLASS]);
}
static inline bool selinux_policycap_alwaysnetwork(void)
{
struct selinux_state *state = &selinux_state;
- return state->policycap[POLICYDB_CAPABILITY_ALWAYSNETWORK];
+ return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_ALWAYSNETWORK]);
}
static inline bool selinux_policycap_cgroupseclabel(void)
{
struct selinux_state *state = &selinux_state;
- return state->policycap[POLICYDB_CAPABILITY_CGROUPSECLABEL];
+ return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_CGROUPSECLABEL]);
}
static inline bool selinux_policycap_nnp_nosuid_transition(void)
{
struct selinux_state *state = &selinux_state;
- return state->policycap[POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION];
+ return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_NNP_NOSUID_TRANSITION]);
}
static inline bool selinux_policycap_genfs_seclabel_symlinks(void)
{
struct selinux_state *state = &selinux_state;
- return state->policycap[POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS];
+ return READ_ONCE(state->policycap[POLICYDB_CAPABILITY_GENFS_SECLABEL_SYMLINKS]);
}
int security_mls_enabled(struct selinux_state *state);
int security_load_policy(struct selinux_state *state,
- void *data, size_t len);
+ void *data, size_t len,
+ struct selinux_policy **newpolicyp);
+void selinux_policy_commit(struct selinux_state *state,
+ struct selinux_policy *newpolicy);
+void selinux_policy_cancel(struct selinux_state *state,
+ struct selinux_policy *policy);
int security_read_policy(struct selinux_state *state,
void **data, size_t *len);
-size_t security_policydb_len(struct selinux_state *state);
int security_policycap_supported(struct selinux_state *state,
unsigned int req_cap);
@@ -358,9 +359,9 @@ int security_net_peersid_resolve(struct selinux_state *state,
u32 xfrm_sid,
u32 *peer_sid);
-int security_get_classes(struct selinux_state *state,
+int security_get_classes(struct selinux_policy *policy,
char ***classes, int *nclasses);
-int security_get_permissions(struct selinux_state *state,
+int security_get_permissions(struct selinux_policy *policy,
char *class, char ***perms, int *nperms);
int security_get_reject_unknown(struct selinux_state *state);
int security_get_allow_unknown(struct selinux_state *state);
@@ -380,6 +381,10 @@ int security_genfs_sid(struct selinux_state *state,
const char *fstype, char *name, u16 sclass,
u32 *sid);
+int selinux_policy_genfs_sid(struct selinux_policy *policy,
+ const char *fstype, char *name, u16 sclass,
+ u32 *sid);
+
#ifdef CONFIG_NETLABEL
int security_netlbl_secattr_to_sid(struct selinux_state *state,
struct netlbl_lsm_secattr *secattr,
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 4781314c2510..4bde570d56a2 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -20,6 +20,7 @@
#include <linux/fs_context.h>
#include <linux/mount.h>
#include <linux/mutex.h>
+#include <linux/namei.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/security.h>
@@ -74,7 +75,6 @@ struct selinux_fs_info {
unsigned long last_class_ino;
bool policy_opened;
struct dentry *policycap_dir;
- struct mutex mutex;
unsigned long last_ino;
struct selinux_state *state;
struct super_block *sb;
@@ -88,7 +88,6 @@ static int selinux_fs_info_create(struct super_block *sb)
if (!fsi)
return -ENOMEM;
- mutex_init(&fsi->mutex);
fsi->last_ino = SEL_INO_NEXT - 1;
fsi->state = &selinux_state;
fsi->sb = sb;
@@ -117,6 +116,10 @@ static void selinux_fs_info_free(struct super_block *sb)
#define SEL_POLICYCAP_INO_OFFSET 0x08000000
#define SEL_INO_MASK 0x00ffffff
+#define BOOL_DIR_NAME "booleans"
+#define CLASS_DIR_NAME "class"
+#define POLICYCAP_DIR_NAME "policy_capabilities"
+
#define TMPBUFLEN 12
static ssize_t sel_read_enforce(struct file *filp, char __user *buf,
size_t count, loff_t *ppos)
@@ -346,14 +349,24 @@ static const struct file_operations sel_policyvers_ops = {
};
/* declaration for sel_write_load */
-static int sel_make_bools(struct selinux_fs_info *fsi);
-static int sel_make_classes(struct selinux_fs_info *fsi);
-static int sel_make_policycap(struct selinux_fs_info *fsi);
+static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir,
+ unsigned int *bool_num, char ***bool_pending_names,
+ unsigned int **bool_pending_values);
+static int sel_make_classes(struct selinux_policy *newpolicy,
+ struct dentry *class_dir,
+ unsigned long *last_class_ino);
/* declaration for sel_make_class_dirs */
static struct dentry *sel_make_dir(struct dentry *dir, const char *name,
unsigned long *ino);
+/* declaration for sel_make_policy_nodes */
+static struct dentry *sel_make_disconnected_dir(struct super_block *sb,
+ unsigned long *ino);
+
+/* declaration for sel_make_policy_nodes */
+static void sel_remove_entries(struct dentry *de);
+
static ssize_t sel_read_mls(struct file *filp, char __user *buf,
size_t count, loff_t *ppos)
{
@@ -385,7 +398,7 @@ static int sel_open_policy(struct inode *inode, struct file *filp)
BUG_ON(filp->private_data);
- mutex_lock(&fsi->mutex);
+ mutex_lock(&fsi->state->policy_mutex);
rc = avc_has_perm(&selinux_state,
current_sid(), SECINITSID_SECURITY,
@@ -402,25 +415,25 @@ static int sel_open_policy(struct inode *inode, struct file *filp)
if (!plm)
goto err;
- if (i_size_read(inode) != security_policydb_len(state)) {
- inode_lock(inode);
- i_size_write(inode, security_policydb_len(state));
- inode_unlock(inode);
- }
-
rc = security_read_policy(state, &plm->data, &plm->len);
if (rc)
goto err;
+ if ((size_t)i_size_read(inode) != plm->len) {
+ inode_lock(inode);
+ i_size_write(inode, plm->len);
+ inode_unlock(inode);
+ }
+
fsi->policy_opened = 1;
filp->private_data = plm;
- mutex_unlock(&fsi->mutex);
+ mutex_unlock(&fsi->state->policy_mutex);
return 0;
err:
- mutex_unlock(&fsi->mutex);
+ mutex_unlock(&fsi->state->policy_mutex);
if (plm)
vfree(plm->data);
@@ -508,29 +521,94 @@ static const struct file_operations sel_policy_ops = {
.llseek = generic_file_llseek,
};
-static int sel_make_policy_nodes(struct selinux_fs_info *fsi)
+static void sel_remove_old_bool_data(unsigned int bool_num, char **bool_names,
+ unsigned int *bool_values)
{
- int ret;
+ u32 i;
+
+ /* bool_dir cleanup */
+ for (i = 0; i < bool_num; i++)
+ kfree(bool_names[i]);
+ kfree(bool_names);
+ kfree(bool_values);
+}
- ret = sel_make_bools(fsi);
+static int sel_make_policy_nodes(struct selinux_fs_info *fsi,
+ struct selinux_policy *newpolicy)
+{
+ int ret = 0;
+ struct dentry *tmp_parent, *tmp_bool_dir, *tmp_class_dir, *old_dentry;
+ unsigned int tmp_bool_num, old_bool_num;
+ char **tmp_bool_names, **old_bool_names;
+ unsigned int *tmp_bool_values, *old_bool_values;
+ unsigned long tmp_ino = fsi->last_ino; /* Don't increment last_ino in this function */
+
+ tmp_parent = sel_make_disconnected_dir(fsi->sb, &tmp_ino);
+ if (IS_ERR(tmp_parent))
+ return PTR_ERR(tmp_parent);
+
+ tmp_ino = fsi->bool_dir->d_inode->i_ino - 1; /* sel_make_dir will increment and set */
+ tmp_bool_dir = sel_make_dir(tmp_parent, BOOL_DIR_NAME, &tmp_ino);
+ if (IS_ERR(tmp_bool_dir)) {
+ ret = PTR_ERR(tmp_bool_dir);
+ goto out;
+ }
+
+ tmp_ino = fsi->class_dir->d_inode->i_ino - 1; /* sel_make_dir will increment and set */
+ tmp_class_dir = sel_make_dir(tmp_parent, CLASS_DIR_NAME, &tmp_ino);
+ if (IS_ERR(tmp_class_dir)) {
+ ret = PTR_ERR(tmp_class_dir);
+ goto out;
+ }
+
+ ret = sel_make_bools(newpolicy, tmp_bool_dir, &tmp_bool_num,
+ &tmp_bool_names, &tmp_bool_values);
if (ret) {
pr_err("SELinux: failed to load policy booleans\n");
- return ret;
+ goto out;
}
- ret = sel_make_classes(fsi);
+ ret = sel_make_classes(newpolicy, tmp_class_dir,
+ &fsi->last_class_ino);
if (ret) {
pr_err("SELinux: failed to load policy classes\n");
- return ret;
+ goto out;
}
- ret = sel_make_policycap(fsi);
- if (ret) {
- pr_err("SELinux: failed to load policy capabilities\n");
- return ret;
- }
+ /* booleans */
+ old_dentry = fsi->bool_dir;
+ lock_rename(tmp_bool_dir, old_dentry);
+ d_exchange(tmp_bool_dir, fsi->bool_dir);
- return 0;
+ old_bool_num = fsi->bool_num;
+ old_bool_names = fsi->bool_pending_names;
+ old_bool_values = fsi->bool_pending_values;
+
+ fsi->bool_num = tmp_bool_num;
+ fsi->bool_pending_names = tmp_bool_names;
+ fsi->bool_pending_values = tmp_bool_values;
+
+ sel_remove_old_bool_data(old_bool_num, old_bool_names, old_bool_values);
+
+ fsi->bool_dir = tmp_bool_dir;
+ unlock_rename(tmp_bool_dir, old_dentry);
+
+ /* classes */
+ old_dentry = fsi->class_dir;
+ lock_rename(tmp_class_dir, old_dentry);
+ d_exchange(tmp_class_dir, fsi->class_dir);
+ fsi->class_dir = tmp_class_dir;
+ unlock_rename(tmp_class_dir, old_dentry);
+
+out:
+ /* Since the other temporary dirs are children of tmp_parent
+ * this will handle all the cleanup in the case of a failure before
+ * the swapover
+ */
+ sel_remove_entries(tmp_parent);
+ dput(tmp_parent); /* d_genocide() only handles the children */
+
+ return ret;
}
static ssize_t sel_write_load(struct file *file, const char __user *buf,
@@ -538,10 +616,11 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf,
{
struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info;
+ struct selinux_policy *newpolicy;
ssize_t length;
void *data = NULL;
- mutex_lock(&fsi->mutex);
+ mutex_lock(&fsi->state->policy_mutex);
length = avc_has_perm(&selinux_state,
current_sid(), SECINITSID_SECURITY,
@@ -563,15 +642,19 @@ static ssize_t sel_write_load(struct file *file, const char __user *buf,
if (copy_from_user(data, buf, count) != 0)
goto out;
- length = security_load_policy(fsi->state, data, count);
+ length = security_load_policy(fsi->state, data, count, &newpolicy);
if (length) {
pr_warn_ratelimited("SELinux: failed to load policy\n");
goto out;
}
- length = sel_make_policy_nodes(fsi);
- if (length)
+ length = sel_make_policy_nodes(fsi, newpolicy);
+ if (length) {
+ selinux_policy_cancel(fsi->state, newpolicy);
goto out1;
+ }
+
+ selinux_policy_commit(fsi->state, newpolicy);
length = count;
@@ -581,7 +664,7 @@ out1:
from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
out:
- mutex_unlock(&fsi->mutex);
+ mutex_unlock(&fsi->state->policy_mutex);
vfree(data);
return length;
}
@@ -634,7 +717,8 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf,
char tmpbuf[TMPBUFLEN];
ssize_t length;
- length = scnprintf(tmpbuf, TMPBUFLEN, "%u", fsi->state->checkreqprot);
+ length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
+ checkreqprot_get(fsi->state));
return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}
@@ -676,7 +760,7 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf,
comm, current->pid);
}
- fsi->state->checkreqprot = new_value ? 1 : 0;
+ checkreqprot_set(fsi->state, (new_value ? 1 : 0));
length = count;
out:
kfree(page);
@@ -1186,7 +1270,7 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
unsigned index = file_inode(filep)->i_ino & SEL_INO_MASK;
const char *name = filep->f_path.dentry->d_name.name;
- mutex_lock(&fsi->mutex);
+ mutex_lock(&fsi->state->policy_mutex);
ret = -EINVAL;
if (index >= fsi->bool_num || strcmp(name,
@@ -1205,14 +1289,14 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
}
length = scnprintf(page, PAGE_SIZE, "%d %d", cur_enforcing,
fsi->bool_pending_values[index]);
- mutex_unlock(&fsi->mutex);
+ mutex_unlock(&fsi->state->policy_mutex);
ret = simple_read_from_buffer(buf, count, ppos, page, length);
out_free:
free_page((unsigned long)page);
return ret;
out_unlock:
- mutex_unlock(&fsi->mutex);
+ mutex_unlock(&fsi->state->policy_mutex);
goto out_free;
}
@@ -1237,7 +1321,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf,
if (IS_ERR(page))
return PTR_ERR(page);
- mutex_lock(&fsi->mutex);
+ mutex_lock(&fsi->state->policy_mutex);
length = avc_has_perm(&selinux_state,
current_sid(), SECINITSID_SECURITY,
@@ -1262,7 +1346,7 @@ static ssize_t sel_write_bool(struct file *filep, const char __user *buf,
length = count;
out:
- mutex_unlock(&fsi->mutex);
+ mutex_unlock(&fsi->state->policy_mutex);
kfree(page);
return length;
}
@@ -1293,7 +1377,7 @@ static ssize_t sel_commit_bools_write(struct file *filep,
if (IS_ERR(page))
return PTR_ERR(page);
- mutex_lock(&fsi->mutex);
+ mutex_lock(&fsi->state->policy_mutex);
length = avc_has_perm(&selinux_state,
current_sid(), SECINITSID_SECURITY,
@@ -1315,7 +1399,7 @@ static ssize_t sel_commit_bools_write(struct file *filep,
length = count;
out:
- mutex_unlock(&fsi->mutex);
+ mutex_unlock(&fsi->state->policy_mutex);
kfree(page);
return length;
}
@@ -1331,14 +1415,13 @@ static void sel_remove_entries(struct dentry *de)
shrink_dcache_parent(de);
}
-#define BOOL_DIR_NAME "booleans"
-
-static int sel_make_bools(struct selinux_fs_info *fsi)
+static int sel_make_bools(struct selinux_policy *newpolicy, struct dentry *bool_dir,
+ unsigned int *bool_num, char ***bool_pending_names,
+ unsigned int **bool_pending_values)
{
int ret;
ssize_t len;
struct dentry *dentry = NULL;
- struct dentry *dir = fsi->bool_dir;
struct inode *inode = NULL;
struct inode_security_struct *isec;
char **names = NULL, *page;
@@ -1346,34 +1429,23 @@ static int sel_make_bools(struct selinux_fs_info *fsi)
int *values = NULL;
u32 sid;
- /* remove any existing files */
- for (i = 0; i < fsi->bool_num; i++)
- kfree(fsi->bool_pending_names[i]);
- kfree(fsi->bool_pending_names);
- kfree(fsi->bool_pending_values);
- fsi->bool_num = 0;
- fsi->bool_pending_names = NULL;
- fsi->bool_pending_values = NULL;
-
- sel_remove_entries(dir);
-
ret = -ENOMEM;
page = (char *)get_zeroed_page(GFP_KERNEL);
if (!page)
goto out;
- ret = security_get_bools(fsi->state, &num, &names, &values);
+ ret = security_get_bools(newpolicy, &num, &names, &values);
if (ret)
goto out;
for (i = 0; i < num; i++) {
ret = -ENOMEM;
- dentry = d_alloc_name(dir, names[i]);
+ dentry = d_alloc_name(bool_dir, names[i]);
if (!dentry)
goto out;
ret = -ENOMEM;
- inode = sel_make_inode(dir->d_sb, S_IFREG | S_IRUGO | S_IWUSR);
+ inode = sel_make_inode(bool_dir->d_sb, S_IFREG | S_IRUGO | S_IWUSR);
if (!inode) {
dput(dentry);
goto out;
@@ -1388,7 +1460,7 @@ static int sel_make_bools(struct selinux_fs_info *fsi)
}
isec = selinux_inode(inode);
- ret = security_genfs_sid(fsi->state, "selinuxfs", page,
+ ret = selinux_policy_genfs_sid(newpolicy, "selinuxfs", page,
SECCLASS_FILE, &sid);
if (ret) {
pr_warn_ratelimited("SELinux: no sid found, defaulting to security isid for %s\n",
@@ -1402,9 +1474,9 @@ static int sel_make_bools(struct selinux_fs_info *fsi)
inode->i_ino = i|SEL_BOOL_INO_OFFSET;
d_add(dentry, inode);
}
- fsi->bool_num = num;
- fsi->bool_pending_names = names;
- fsi->bool_pending_values = values;
+ *bool_num = num;
+ *bool_pending_names = names;
+ *bool_pending_values = values;
free_page((unsigned long)page);
return 0;
@@ -1417,7 +1489,7 @@ out:
kfree(names);
}
kfree(values);
- sel_remove_entries(dir);
+ sel_remove_entries(bool_dir);
return ret;
}
@@ -1791,14 +1863,14 @@ static const struct file_operations sel_policycap_ops = {
.llseek = generic_file_llseek,
};
-static int sel_make_perm_files(char *objclass, int classvalue,
- struct dentry *dir)
+static int sel_make_perm_files(struct selinux_policy *newpolicy,
+ char *objclass, int classvalue,
+ struct dentry *dir)
{
- struct selinux_fs_info *fsi = dir->d_sb->s_fs_info;
int i, rc, nperms;
char **perms;
- rc = security_get_permissions(fsi->state, objclass, &perms, &nperms);
+ rc = security_get_permissions(newpolicy, objclass, &perms, &nperms);
if (rc)
return rc;
@@ -1831,8 +1903,9 @@ out:
return rc;
}
-static int sel_make_class_dir_entries(char *classname, int index,
- struct dentry *dir)
+static int sel_make_class_dir_entries(struct selinux_policy *newpolicy,
+ char *classname, int index,
+ struct dentry *dir)
{
struct super_block *sb = dir->d_sb;
struct selinux_fs_info *fsi = sb->s_fs_info;
@@ -1858,39 +1931,38 @@ static int sel_make_class_dir_entries(char *classname, int index,
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- rc = sel_make_perm_files(classname, index, dentry);
+ rc = sel_make_perm_files(newpolicy, classname, index, dentry);
return rc;
}
-static int sel_make_classes(struct selinux_fs_info *fsi)
+static int sel_make_classes(struct selinux_policy *newpolicy,
+ struct dentry *class_dir,
+ unsigned long *last_class_ino)
{
int rc, nclasses, i;
char **classes;
- /* delete any existing entries */
- sel_remove_entries(fsi->class_dir);
-
- rc = security_get_classes(fsi->state, &classes, &nclasses);
+ rc = security_get_classes(newpolicy, &classes, &nclasses);
if (rc)
return rc;
/* +2 since classes are 1-indexed */
- fsi->last_class_ino = sel_class_to_ino(nclasses + 2);
+ *last_class_ino = sel_class_to_ino(nclasses + 2);
for (i = 0; i < nclasses; i++) {
struct dentry *class_name_dir;
- class_name_dir = sel_make_dir(fsi->class_dir, classes[i],
- &fsi->last_class_ino);
+ class_name_dir = sel_make_dir(class_dir, classes[i],
+ last_class_ino);
if (IS_ERR(class_name_dir)) {
rc = PTR_ERR(class_name_dir);
goto out;
}
/* i+1 since class values are 1-indexed */
- rc = sel_make_class_dir_entries(classes[i], i + 1,
+ rc = sel_make_class_dir_entries(newpolicy, classes[i], i + 1,
class_name_dir);
if (rc)
goto out;
@@ -1909,8 +1981,6 @@ static int sel_make_policycap(struct selinux_fs_info *fsi)
struct dentry *dentry = NULL;
struct inode *inode = NULL;
- sel_remove_entries(fsi->policycap_dir);
-
for (iter = 0; iter <= POLICYDB_CAPABILITY_MAX; iter++) {
if (iter < ARRAY_SIZE(selinux_policycap_names))
dentry = d_alloc_name(fsi->policycap_dir,
@@ -1962,6 +2032,22 @@ static struct dentry *sel_make_dir(struct dentry *dir, const char *name,
return dentry;
}
+static struct dentry *sel_make_disconnected_dir(struct super_block *sb,
+ unsigned long *ino)
+{
+ struct inode *inode = sel_make_inode(sb, S_IFDIR | S_IRUGO | S_IXUGO);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+ inode->i_ino = ++(*ino);
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ return d_obtain_alias(inode);
+}
+
#define NULL_FILE_NAME "null"
static int sel_fill_super(struct super_block *sb, struct fs_context *fc)
@@ -2060,14 +2146,14 @@ static int sel_fill_super(struct super_block *sb, struct fs_context *fc)
if (ret)
goto err;
- fsi->class_dir = sel_make_dir(sb->s_root, "class", &fsi->last_ino);
+ fsi->class_dir = sel_make_dir(sb->s_root, CLASS_DIR_NAME, &fsi->last_ino);
if (IS_ERR(fsi->class_dir)) {
ret = PTR_ERR(fsi->class_dir);
fsi->class_dir = NULL;
goto err;
}
- fsi->policycap_dir = sel_make_dir(sb->s_root, "policy_capabilities",
+ fsi->policycap_dir = sel_make_dir(sb->s_root, POLICYCAP_DIR_NAME,
&fsi->last_ino);
if (IS_ERR(fsi->policycap_dir)) {
ret = PTR_ERR(fsi->policycap_dir);
@@ -2075,9 +2161,12 @@ static int sel_fill_super(struct super_block *sb, struct fs_context *fc)
goto err;
}
- ret = sel_make_policy_nodes(fsi);
- if (ret)
+ ret = sel_make_policycap(fsi);
+ if (ret) {
+ pr_err("SELinux: failed to load policy capabilities\n");
goto err;
+ }
+
return 0;
err:
pr_err("SELinux: %s: failed while creating inodes\n",
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index 01b300a4a882..0172d87e2b9a 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -301,7 +301,6 @@ void avtab_destroy(struct avtab *h)
void avtab_init(struct avtab *h)
{
- kvfree(h->htable);
h->htable = NULL;
h->nel = 0;
}
@@ -340,6 +339,54 @@ int avtab_alloc(struct avtab *h, u32 nrules)
return 0;
}
+int avtab_duplicate(struct avtab *new, struct avtab *orig)
+{
+ int i;
+ struct avtab_node *node, *tmp, *tail;
+
+ memset(new, 0, sizeof(*new));
+
+ new->htable = kvcalloc(orig->nslot, sizeof(void *), GFP_KERNEL);
+ if (!new->htable)
+ return -ENOMEM;
+ new->nslot = orig->nslot;
+ new->mask = orig->mask;
+
+ for (i = 0; i < orig->nslot; i++) {
+ tail = NULL;
+ for (node = orig->htable[i]; node; node = node->next) {
+ tmp = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL);
+ if (!tmp)
+ goto error;
+ tmp->key = node->key;
+ if (tmp->key.specified & AVTAB_XPERMS) {
+ tmp->datum.u.xperms =
+ kmem_cache_zalloc(avtab_xperms_cachep,
+ GFP_KERNEL);
+ if (!tmp->datum.u.xperms) {
+ kmem_cache_free(avtab_node_cachep, tmp);
+ goto error;
+ }
+ tmp->datum.u.xperms = node->datum.u.xperms;
+ } else
+ tmp->datum.u.data = node->datum.u.data;
+
+ if (tail)
+ tail->next = tmp;
+ else
+ new->htable[i] = tmp;
+
+ tail = tmp;
+ new->nel++;
+ }
+ }
+
+ return 0;
+error:
+ avtab_destroy(new);
+ return -ENOMEM;
+}
+
void avtab_hash_eval(struct avtab *h, char *tag)
{
int i, chain_len, slots_used, max_chain_len;
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h
index 5fdcb6696bcc..4c4445ca9118 100644
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -89,6 +89,7 @@ struct avtab {
void avtab_init(struct avtab *h);
int avtab_alloc(struct avtab *, u32);
+int avtab_duplicate(struct avtab *new, struct avtab *orig);
struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *k);
void avtab_destroy(struct avtab *h);
void avtab_hash_eval(struct avtab *h, char *tag);
diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c
index 5a47258c1d77..0b32f3ab025e 100644
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -600,3 +600,158 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key,
services_compute_xperms_drivers(xperms, node);
}
}
+
+static int cond_dup_av_list(struct cond_av_list *new,
+ struct cond_av_list *orig,
+ struct avtab *avtab)
+{
+ struct avtab_node *avnode;
+ u32 i;
+
+ memset(new, 0, sizeof(*new));
+
+ new->nodes = kcalloc(orig->len, sizeof(*new->nodes), GFP_KERNEL);
+ if (!new->nodes)
+ return -ENOMEM;
+
+ for (i = 0; i < orig->len; i++) {
+ avnode = avtab_search_node(avtab, &orig->nodes[i]->key);
+ if (WARN_ON(!avnode))
+ return -EINVAL;
+ new->nodes[i] = avnode;
+ new->len++;
+ }
+
+ return 0;
+}
+
+static int duplicate_policydb_cond_list(struct policydb *newp,
+ struct policydb *origp)
+{
+ int rc, i, j;
+
+ rc = avtab_duplicate(&newp->te_cond_avtab, &origp->te_cond_avtab);
+ if (rc)
+ return rc;
+
+ newp->cond_list_len = 0;
+ newp->cond_list = kcalloc(origp->cond_list_len,
+ sizeof(*newp->cond_list),
+ GFP_KERNEL);
+ if (!newp->cond_list)
+ goto error;
+
+ for (i = 0; i < origp->cond_list_len; i++) {
+ struct cond_node *newn = &newp->cond_list[i];
+ struct cond_node *orign = &origp->cond_list[i];
+
+ newp->cond_list_len++;
+
+ newn->cur_state = orign->cur_state;
+ newn->expr.nodes = kcalloc(orign->expr.len,
+ sizeof(*newn->expr.nodes), GFP_KERNEL);
+ if (!newn->expr.nodes)
+ goto error;
+ for (j = 0; j < orign->expr.len; j++)
+ newn->expr.nodes[j] = orign->expr.nodes[j];
+ newn->expr.len = orign->expr.len;
+
+ rc = cond_dup_av_list(&newn->true_list, &orign->true_list,
+ &newp->te_cond_avtab);
+ if (rc)
+ goto error;
+
+ rc = cond_dup_av_list(&newn->false_list, &orign->false_list,
+ &newp->te_cond_avtab);
+ if (rc)
+ goto error;
+ }
+
+ return 0;
+
+error:
+ avtab_destroy(&newp->te_cond_avtab);
+ cond_list_destroy(newp);
+ return -ENOMEM;
+}
+
+static int cond_bools_destroy(void *key, void *datum, void *args)
+{
+ /* key was not copied so no need to free here */
+ kfree(datum);
+ return 0;
+}
+
+static int cond_bools_copy(struct hashtab_node *new, struct hashtab_node *orig, void *args)
+{
+ struct cond_bool_datum *datum;
+
+ datum = kmemdup(orig->datum, sizeof(struct cond_bool_datum),
+ GFP_KERNEL);
+ if (!datum)
+ return -ENOMEM;
+
+ new->key = orig->key; /* No need to copy, never modified */
+ new->datum = datum;
+ return 0;
+}
+
+static int cond_bools_index(void *key, void *datum, void *args)
+{
+ struct cond_bool_datum *booldatum, **cond_bool_array;
+
+ booldatum = datum;
+ cond_bool_array = args;
+ cond_bool_array[booldatum->value - 1] = booldatum;
+
+ return 0;
+}
+
+static int duplicate_policydb_bools(struct policydb *newdb,
+ struct policydb *orig)
+{
+ struct cond_bool_datum **cond_bool_array;
+ int rc;
+
+ cond_bool_array = kmalloc_array(orig->p_bools.nprim,
+ sizeof(*orig->bool_val_to_struct),
+ GFP_KERNEL);
+ if (!cond_bool_array)
+ return -ENOMEM;
+
+ rc = hashtab_duplicate(&newdb->p_bools.table, &orig->p_bools.table,
+ cond_bools_copy, cond_bools_destroy, NULL);
+ if (rc) {
+ kfree(cond_bool_array);
+ return -ENOMEM;
+ }
+
+ hashtab_map(&newdb->p_bools.table, cond_bools_index, cond_bool_array);
+ newdb->bool_val_to_struct = cond_bool_array;
+
+ newdb->p_bools.nprim = orig->p_bools.nprim;
+
+ return 0;
+}
+
+void cond_policydb_destroy_dup(struct policydb *p)
+{
+ hashtab_map(&p->p_bools.table, cond_bools_destroy, NULL);
+ hashtab_destroy(&p->p_bools.table);
+ cond_policydb_destroy(p);
+}
+
+int cond_policydb_dup(struct policydb *new, struct policydb *orig)
+{
+ cond_policydb_init(new);
+
+ if (duplicate_policydb_bools(new, orig))
+ return -ENOMEM;
+
+ if (duplicate_policydb_cond_list(new, orig)) {
+ cond_policydb_destroy_dup(new);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h
index 79e7e03db859..e47ec6ddeaf6 100644
--- a/security/selinux/ss/conditional.h
+++ b/security/selinux/ss/conditional.h
@@ -79,5 +79,7 @@ void cond_compute_av(struct avtab *ctab, struct avtab_key *key,
void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key,
struct extended_perms_decision *xpermd);
void evaluate_cond_nodes(struct policydb *p);
+void cond_policydb_destroy_dup(struct policydb *p);
+int cond_policydb_dup(struct policydb *new, struct policydb *orig);
#endif /* _CONDITIONAL_H_ */
diff --git a/security/selinux/ss/hashtab.c b/security/selinux/ss/hashtab.c
index d9287bb4bfeb..dab8c25c739b 100644
--- a/security/selinux/ss/hashtab.c
+++ b/security/selinux/ss/hashtab.c
@@ -122,6 +122,59 @@ void hashtab_stat(struct hashtab *h, struct hashtab_info *info)
info->max_chain_len = max_chain_len;
}
+int hashtab_duplicate(struct hashtab *new, struct hashtab *orig,
+ int (*copy)(struct hashtab_node *new,
+ struct hashtab_node *orig, void *args),
+ int (*destroy)(void *k, void *d, void *args),
+ void *args)
+{
+ struct hashtab_node *cur, *tmp, *tail;
+ int i, rc;
+
+ memset(new, 0, sizeof(*new));
+
+ new->htable = kcalloc(orig->size, sizeof(*new->htable), GFP_KERNEL);
+ if (!new->htable)
+ return -ENOMEM;
+
+ new->size = orig->size;
+
+ for (i = 0; i < orig->size; i++) {
+ tail = NULL;
+ for (cur = orig->htable[i]; cur; cur = cur->next) {
+ tmp = kmem_cache_zalloc(hashtab_node_cachep,
+ GFP_KERNEL);
+ if (!tmp)
+ goto error;
+ rc = copy(tmp, cur, args);
+ if (rc) {
+ kmem_cache_free(hashtab_node_cachep, tmp);
+ goto error;
+ }
+ tmp->next = NULL;
+ if (!tail)
+ new->htable[i] = tmp;
+ else
+ tail->next = tmp;
+ tail = tmp;
+ new->nel++;
+ }
+ }
+
+ return 0;
+
+ error:
+ for (i = 0; i < new->size; i++) {
+ for (cur = new->htable[i]; cur; cur = tmp) {
+ tmp = cur->next;
+ destroy(cur->key, cur->datum, args);
+ kmem_cache_free(hashtab_node_cachep, cur);
+ }
+ }
+ kmem_cache_free(hashtab_node_cachep, new);
+ return -ENOMEM;
+}
+
void __init hashtab_cache_init(void)
{
hashtab_node_cachep = kmem_cache_create("hashtab_node",
diff --git a/security/selinux/ss/hashtab.h b/security/selinux/ss/hashtab.h
index 3c952f0f01f9..043a773bf0b7 100644
--- a/security/selinux/ss/hashtab.h
+++ b/security/selinux/ss/hashtab.h
@@ -136,6 +136,12 @@ int hashtab_map(struct hashtab *h,
int (*apply)(void *k, void *d, void *args),
void *args);
+int hashtab_duplicate(struct hashtab *new, struct hashtab *orig,
+ int (*copy)(struct hashtab_node *new,
+ struct hashtab_node *orig, void *args),
+ int (*destroy)(void *k, void *d, void *args),
+ void *args);
+
/* Fill info with some hash table statistics */
void hashtab_stat(struct hashtab *h, struct hashtab_info *info);
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 1caf4e603309..9704c8a32303 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -64,25 +64,7 @@
#include "xfrm.h"
#include "ebitmap.h"
#include "audit.h"
-
-/* Policy capability names */
-const char *selinux_policycap_names[__POLICYDB_CAPABILITY_MAX] = {
- "network_peer_controls",
- "open_perms",
- "extended_socket_class",
- "always_check_network",
- "cgroup_seclabel",
- "nnp_nosuid_transition",
- "genfs_seclabel_symlinks"
-};
-
-static struct selinux_ss selinux_ss;
-
-void selinux_ss_init(struct selinux_ss **ss)
-{
- rwlock_init(&selinux_ss.policy_rwlock);
- *ss = &selinux_ss;
-}
+#include "policycap_names.h"
/* Forward declaration. */
static int context_struct_to_string(struct policydb *policydb,
@@ -248,9 +230,17 @@ static void map_decision(struct selinux_map *map,
int security_mls_enabled(struct selinux_state *state)
{
- struct policydb *p = &state->ss->policydb;
+ int mls_enabled;
+ struct selinux_policy *policy;
- return p->mls_enabled;
+ if (!selinux_initialized(state))
+ return 0;
+
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ mls_enabled = policy->policydb.mls_enabled;
+ rcu_read_unlock();
+ return mls_enabled;
}
/*
@@ -721,13 +711,14 @@ static void context_struct_compute_av(struct policydb *policydb,
}
static int security_validtrans_handle_fail(struct selinux_state *state,
- struct sidtab_entry *oentry,
- struct sidtab_entry *nentry,
- struct sidtab_entry *tentry,
- u16 tclass)
-{
- struct policydb *p = &state->ss->policydb;
- struct sidtab *sidtab = state->ss->sidtab;
+ struct selinux_policy *policy,
+ struct sidtab_entry *oentry,
+ struct sidtab_entry *nentry,
+ struct sidtab_entry *tentry,
+ u16 tclass)
+{
+ struct policydb *p = &policy->policydb;
+ struct sidtab *sidtab = policy->sidtab;
char *o = NULL, *n = NULL, *t = NULL;
u32 olen, nlen, tlen;
@@ -755,6 +746,7 @@ static int security_compute_validatetrans(struct selinux_state *state,
u32 oldsid, u32 newsid, u32 tasksid,
u16 orig_tclass, bool user)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct sidtab_entry *oentry;
@@ -769,13 +761,14 @@ static int security_compute_validatetrans(struct selinux_state *state,
if (!selinux_initialized(state))
return 0;
- read_lock(&state->ss->policy_rwlock);
+ rcu_read_lock();
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
if (!user)
- tclass = unmap_class(&state->ss->map, orig_tclass);
+ tclass = unmap_class(&policy->map, orig_tclass);
else
tclass = orig_tclass;
@@ -818,17 +811,18 @@ static int security_compute_validatetrans(struct selinux_state *state,
rc = -EPERM;
else
rc = security_validtrans_handle_fail(state,
- oentry,
- nentry,
- tentry,
- tclass);
+ policy,
+ oentry,
+ nentry,
+ tentry,
+ tclass);
goto out;
}
constraint = constraint->next;
}
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -860,6 +854,7 @@ int security_validate_transition(struct selinux_state *state,
int security_bounded_transition(struct selinux_state *state,
u32 old_sid, u32 new_sid)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct sidtab_entry *old_entry, *new_entry;
@@ -870,10 +865,10 @@ int security_bounded_transition(struct selinux_state *state,
if (!selinux_initialized(state))
return 0;
- read_lock(&state->ss->policy_rwlock);
-
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
rc = -EINVAL;
old_entry = sidtab_search_entry(sidtab, old_sid);
@@ -934,17 +929,20 @@ int security_bounded_transition(struct selinux_state *state,
kfree(old_name);
}
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
-static void avd_init(struct selinux_state *state, struct av_decision *avd)
+static void avd_init(struct selinux_policy *policy, struct av_decision *avd)
{
avd->allowed = 0;
avd->auditallow = 0;
avd->auditdeny = 0xffffffff;
- avd->seqno = state->ss->latest_granting;
+ if (policy)
+ avd->seqno = policy->latest_granting;
+ else
+ avd->seqno = 0;
avd->flags = 0;
}
@@ -1009,6 +1007,7 @@ void security_compute_xperms_decision(struct selinux_state *state,
u8 driver,
struct extended_perms_decision *xpermd)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
u16 tclass;
@@ -1025,12 +1024,13 @@ void security_compute_xperms_decision(struct selinux_state *state,
memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p));
memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p));
- read_lock(&state->ss->policy_rwlock);
+ rcu_read_lock();
if (!selinux_initialized(state))
goto allow;
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
scontext = sidtab_search(sidtab, ssid);
if (!scontext) {
@@ -1046,7 +1046,7 @@ void security_compute_xperms_decision(struct selinux_state *state,
goto out;
}
- tclass = unmap_class(&state->ss->map, orig_tclass);
+ tclass = unmap_class(&policy->map, orig_tclass);
if (unlikely(orig_tclass && !tclass)) {
if (policydb->allow_unknown)
goto allow;
@@ -1078,7 +1078,7 @@ void security_compute_xperms_decision(struct selinux_state *state,
}
}
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return;
allow:
memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p));
@@ -1103,19 +1103,21 @@ void security_compute_av(struct selinux_state *state,
struct av_decision *avd,
struct extended_perms *xperms)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
u16 tclass;
struct context *scontext = NULL, *tcontext = NULL;
- read_lock(&state->ss->policy_rwlock);
- avd_init(state, avd);
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ avd_init(policy, avd);
xperms->len = 0;
if (!selinux_initialized(state))
goto allow;
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
scontext = sidtab_search(sidtab, ssid);
if (!scontext) {
@@ -1135,7 +1137,7 @@ void security_compute_av(struct selinux_state *state,
goto out;
}
- tclass = unmap_class(&state->ss->map, orig_tclass);
+ tclass = unmap_class(&policy->map, orig_tclass);
if (unlikely(orig_tclass && !tclass)) {
if (policydb->allow_unknown)
goto allow;
@@ -1143,10 +1145,10 @@ void security_compute_av(struct selinux_state *state,
}
context_struct_compute_av(policydb, scontext, tcontext, tclass, avd,
xperms);
- map_decision(&state->ss->map, orig_tclass, avd,
+ map_decision(&policy->map, orig_tclass, avd,
policydb->allow_unknown);
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return;
allow:
avd->allowed = 0xffffffff;
@@ -1159,17 +1161,19 @@ void security_compute_av_user(struct selinux_state *state,
u16 tclass,
struct av_decision *avd)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct context *scontext = NULL, *tcontext = NULL;
- read_lock(&state->ss->policy_rwlock);
- avd_init(state, avd);
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ avd_init(policy, avd);
if (!selinux_initialized(state))
goto allow;
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
scontext = sidtab_search(sidtab, ssid);
if (!scontext) {
@@ -1198,7 +1202,7 @@ void security_compute_av_user(struct selinux_state *state,
context_struct_compute_av(policydb, scontext, tcontext, tclass, avd,
NULL);
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return;
allow:
avd->allowed = 0xffffffff;
@@ -1283,6 +1287,7 @@ static int sidtab_entry_to_string(struct policydb *p,
int security_sidtab_hash_stats(struct selinux_state *state, char *page)
{
+ struct selinux_policy *policy;
int rc;
if (!selinux_initialized(state)) {
@@ -1291,9 +1296,10 @@ int security_sidtab_hash_stats(struct selinux_state *state, char *page)
return -EINVAL;
}
- read_lock(&state->ss->policy_rwlock);
- rc = sidtab_hash_stats(state->ss->sidtab, page);
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ rc = sidtab_hash_stats(policy->sidtab, page);
+ rcu_read_unlock();
return rc;
}
@@ -1310,6 +1316,7 @@ static int security_sid_to_context_core(struct selinux_state *state,
u32 *scontext_len, int force,
int only_invalid)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct sidtab_entry *entry;
@@ -1339,9 +1346,10 @@ static int security_sid_to_context_core(struct selinux_state *state,
"load_policy on unknown SID %d\n", __func__, sid);
return -EINVAL;
}
- read_lock(&state->ss->policy_rwlock);
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
if (force)
entry = sidtab_search_entry_force(sidtab, sid);
@@ -1360,7 +1368,7 @@ static int security_sid_to_context_core(struct selinux_state *state,
scontext_len);
out_unlock:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -1495,6 +1503,7 @@ static int security_context_to_sid_core(struct selinux_state *state,
u32 *sid, u32 def_sid, gfp_t gfp_flags,
int force)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
char *scontext2, *str = NULL;
@@ -1533,9 +1542,10 @@ static int security_context_to_sid_core(struct selinux_state *state,
if (!str)
goto out;
}
- read_lock(&state->ss->policy_rwlock);
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
rc = string_to_context_struct(policydb, sidtab, scontext2,
&context, def_sid);
if (rc == -EINVAL && force) {
@@ -1547,7 +1557,7 @@ static int security_context_to_sid_core(struct selinux_state *state,
rc = sidtab_context_to_sid(sidtab, &context, sid);
context_destroy(&context);
out_unlock:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
out:
kfree(scontext2);
kfree(str);
@@ -1617,13 +1627,14 @@ int security_context_to_sid_force(struct selinux_state *state,
static int compute_sid_handle_invalid_context(
struct selinux_state *state,
+ struct selinux_policy *policy,
struct sidtab_entry *sentry,
struct sidtab_entry *tentry,
u16 tclass,
struct context *newcontext)
{
- struct policydb *policydb = &state->ss->policydb;
- struct sidtab *sidtab = state->ss->sidtab;
+ struct policydb *policydb = &policy->policydb;
+ struct sidtab *sidtab = policy->sidtab;
char *s = NULL, *t = NULL, *n = NULL;
u32 slen, tlen, nlen;
struct audit_buffer *ab;
@@ -1690,6 +1701,7 @@ static int security_compute_sid(struct selinux_state *state,
u32 *out_sid,
bool kern)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct class_datum *cladatum = NULL;
@@ -1716,19 +1728,21 @@ static int security_compute_sid(struct selinux_state *state,
context_init(&newcontext);
- read_lock(&state->ss->policy_rwlock);
+ rcu_read_lock();
+
+ policy = rcu_dereference(state->policy);
if (kern) {
- tclass = unmap_class(&state->ss->map, orig_tclass);
+ tclass = unmap_class(&policy->map, orig_tclass);
sock = security_is_socket_class(orig_tclass);
} else {
tclass = orig_tclass;
- sock = security_is_socket_class(map_class(&state->ss->map,
+ sock = security_is_socket_class(map_class(&policy->map,
tclass));
}
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
sentry = sidtab_search_entry(sidtab, ssid);
if (!sentry) {
@@ -1848,15 +1862,16 @@ static int security_compute_sid(struct selinux_state *state,
/* Check the validity of the context. */
if (!policydb_context_isvalid(policydb, &newcontext)) {
- rc = compute_sid_handle_invalid_context(state, sentry, tentry,
- tclass, &newcontext);
+ rc = compute_sid_handle_invalid_context(state, policy, sentry,
+ tentry, tclass,
+ &newcontext);
if (rc)
goto out_unlock;
}
/* Obtain the sid for the context. */
rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid);
out_unlock:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
context_destroy(&newcontext);
out:
return rc;
@@ -1943,9 +1958,9 @@ int security_change_sid(struct selinux_state *state,
static inline int convert_context_handle_invalid_context(
struct selinux_state *state,
+ struct policydb *policydb,
struct context *context)
{
- struct policydb *policydb = &state->ss->policydb;
char *s;
u32 len;
@@ -2077,7 +2092,9 @@ static int convert_context(struct context *oldc, struct context *newc, void *p)
/* Check the validity of the new context. */
if (!policydb_context_isvalid(args->newp, newc)) {
- rc = convert_context_handle_invalid_context(args->state, oldc);
+ rc = convert_context_handle_invalid_context(args->state,
+ args->oldp,
+ oldc);
if (rc)
goto bad;
}
@@ -2096,14 +2113,18 @@ bad:
return 0;
}
-static void security_load_policycaps(struct selinux_state *state)
+static void security_load_policycaps(struct selinux_state *state,
+ struct selinux_policy *policy)
{
- struct policydb *p = &state->ss->policydb;
+ struct policydb *p;
unsigned int i;
struct ebitmap_node *node;
+ p = &policy->policydb;
+
for (i = 0; i < ARRAY_SIZE(state->policycap); i++)
- state->policycap[i] = ebitmap_get_bit(&p->policycaps, i);
+ WRITE_ONCE(state->policycap[i],
+ ebitmap_get_bit(&p->policycaps, i));
for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++)
pr_info("SELinux: policy capability %s=%d\n",
@@ -2117,8 +2138,97 @@ static void security_load_policycaps(struct selinux_state *state)
}
}
-static int security_preserve_bools(struct selinux_state *state,
- struct policydb *newpolicydb);
+static int security_preserve_bools(struct selinux_policy *oldpolicy,
+ struct selinux_policy *newpolicy);
+
+static void selinux_policy_free(struct selinux_policy *policy)
+{
+ if (!policy)
+ return;
+
+ sidtab_destroy(policy->sidtab);
+ kfree(policy->map.mapping);
+ policydb_destroy(&policy->policydb);
+ kfree(policy->sidtab);
+ kfree(policy);
+}
+
+static void selinux_policy_cond_free(struct selinux_policy *policy)
+{
+ cond_policydb_destroy_dup(&policy->policydb);
+ kfree(policy);
+}
+
+void selinux_policy_cancel(struct selinux_state *state,
+ struct selinux_policy *policy)
+{
+ struct selinux_policy *oldpolicy;
+
+ oldpolicy = rcu_dereference_protected(state->policy,
+ lockdep_is_held(&state->policy_mutex));
+
+ sidtab_cancel_convert(oldpolicy->sidtab);
+ selinux_policy_free(policy);
+}
+
+static void selinux_notify_policy_change(struct selinux_state *state,
+ u32 seqno)
+{
+ /* Flush external caches and notify userspace of policy load */
+ avc_ss_reset(state->avc, seqno);
+ selnl_notify_policyload(seqno);
+ selinux_status_update_policyload(state, seqno);
+ selinux_netlbl_cache_invalidate();
+ selinux_xfrm_notify_policyload();
+}
+
+void selinux_policy_commit(struct selinux_state *state,
+ struct selinux_policy *newpolicy)
+{
+ struct selinux_policy *oldpolicy;
+ u32 seqno;
+
+ oldpolicy = rcu_dereference_protected(state->policy,
+ lockdep_is_held(&state->policy_mutex));
+
+ /* If switching between different policy types, log MLS status */
+ if (oldpolicy) {
+ if (oldpolicy->policydb.mls_enabled && !newpolicy->policydb.mls_enabled)
+ pr_info("SELinux: Disabling MLS support...\n");
+ else if (!oldpolicy->policydb.mls_enabled && newpolicy->policydb.mls_enabled)
+ pr_info("SELinux: Enabling MLS support...\n");
+ }
+
+ /* Set latest granting seqno for new policy. */
+ if (oldpolicy)
+ newpolicy->latest_granting = oldpolicy->latest_granting + 1;
+ else
+ newpolicy->latest_granting = 1;
+ seqno = newpolicy->latest_granting;
+
+ /* Install the new policy. */
+ rcu_assign_pointer(state->policy, newpolicy);
+
+ /* Load the policycaps from the new policy */
+ security_load_policycaps(state, newpolicy);
+
+ if (!selinux_initialized(state)) {
+ /*
+ * After first policy load, the security server is
+ * marked as initialized and ready to handle requests and
+ * any objects created prior to policy load are then labeled.
+ */
+ selinux_mark_initialized(state);
+ selinux_complete_init();
+ }
+
+ /* Free the old policy */
+ synchronize_rcu();
+ selinux_policy_free(oldpolicy);
+
+ /* Notify others of the policy change */
+ selinux_notify_policy_change(state, seqno);
+}
/**
* security_load_policy - Load a security policy configuration.
@@ -2130,173 +2240,95 @@ static int security_preserve_bools(struct selinux_state *state,
* This function will flush the access vector cache after
* loading the new policy.
*/
-int security_load_policy(struct selinux_state *state, void *data, size_t len)
+int security_load_policy(struct selinux_state *state, void *data, size_t len,
+ struct selinux_policy **newpolicyp)
{
- struct policydb *policydb;
- struct sidtab *oldsidtab, *newsidtab;
- struct policydb *oldpolicydb, *newpolicydb;
- struct selinux_mapping *oldmapping;
- struct selinux_map newmap;
+ struct selinux_policy *newpolicy, *oldpolicy;
struct sidtab_convert_params convert_params;
struct convert_context_args args;
- u32 seqno;
int rc = 0;
struct policy_file file = { data, len }, *fp = &file;
- policydb = &state->ss->policydb;
-
- newsidtab = kmalloc(sizeof(*newsidtab), GFP_KERNEL);
- if (!newsidtab)
+ newpolicy = kzalloc(sizeof(*newpolicy), GFP_KERNEL);
+ if (!newpolicy)
return -ENOMEM;
- if (!selinux_initialized(state)) {
- rc = policydb_read(policydb, fp);
- if (rc) {
- kfree(newsidtab);
- return rc;
- }
-
- policydb->len = len;
- rc = selinux_set_mapping(policydb, secclass_map,
- &state->ss->map);
- if (rc) {
- kfree(newsidtab);
- policydb_destroy(policydb);
- return rc;
- }
-
- rc = policydb_load_isids(policydb, newsidtab);
- if (rc) {
- kfree(newsidtab);
- policydb_destroy(policydb);
- return rc;
- }
-
- state->ss->sidtab = newsidtab;
- security_load_policycaps(state);
- selinux_mark_initialized(state);
- seqno = ++state->ss->latest_granting;
- selinux_complete_init();
- avc_ss_reset(state->avc, seqno);
- selnl_notify_policyload(seqno);
- selinux_status_update_policyload(state, seqno);
- selinux_netlbl_cache_invalidate();
- selinux_xfrm_notify_policyload();
- return 0;
+ newpolicy->sidtab = kzalloc(sizeof(*newpolicy->sidtab), GFP_KERNEL);
+ if (!newpolicy->sidtab) {
+ rc = -ENOMEM;
+ goto err_policy;
}
- oldpolicydb = kcalloc(2, sizeof(*oldpolicydb), GFP_KERNEL);
- if (!oldpolicydb) {
- kfree(newsidtab);
- return -ENOMEM;
- }
- newpolicydb = oldpolicydb + 1;
+ rc = policydb_read(&newpolicy->policydb, fp);
+ if (rc)
+ goto err_sidtab;
+
+ newpolicy->policydb.len = len;
+ rc = selinux_set_mapping(&newpolicy->policydb, secclass_map,
+ &newpolicy->map);
+ if (rc)
+ goto err_policydb;
- rc = policydb_read(newpolicydb, fp);
+ rc = policydb_load_isids(&newpolicy->policydb, newpolicy->sidtab);
if (rc) {
- kfree(newsidtab);
- goto out;
+ pr_err("SELinux: unable to load the initial SIDs\n");
+ goto err_mapping;
}
- newpolicydb->len = len;
- /* If switching between different policy types, log MLS status */
- if (policydb->mls_enabled && !newpolicydb->mls_enabled)
- pr_info("SELinux: Disabling MLS support...\n");
- else if (!policydb->mls_enabled && newpolicydb->mls_enabled)
- pr_info("SELinux: Enabling MLS support...\n");
- rc = policydb_load_isids(newpolicydb, newsidtab);
- if (rc) {
- pr_err("SELinux: unable to load the initial SIDs\n");
- policydb_destroy(newpolicydb);
- kfree(newsidtab);
- goto out;
+ if (!selinux_initialized(state)) {
+ /* First policy load, so no need to preserve state from old policy */
+ *newpolicyp = newpolicy;
+ return 0;
}
- rc = selinux_set_mapping(newpolicydb, secclass_map, &newmap);
- if (rc)
- goto err;
+ oldpolicy = rcu_dereference_protected(state->policy,
+ lockdep_is_held(&state->policy_mutex));
- rc = security_preserve_bools(state, newpolicydb);
+ /* Preserve active boolean values from the old policy */
+ rc = security_preserve_bools(oldpolicy, newpolicy);
if (rc) {
pr_err("SELinux: unable to preserve booleans\n");
- goto err;
+ goto err_free_isids;
}
- oldsidtab = state->ss->sidtab;
-
/*
* Convert the internal representations of contexts
* in the new SID table.
*/
args.state = state;
- args.oldp = policydb;
- args.newp = newpolicydb;
+ args.oldp = &oldpolicy->policydb;
+ args.newp = &newpolicy->policydb;
convert_params.func = convert_context;
convert_params.args = &args;
- convert_params.target = newsidtab;
+ convert_params.target = newpolicy->sidtab;
- rc = sidtab_convert(oldsidtab, &convert_params);
+ rc = sidtab_convert(oldpolicy->sidtab, &convert_params);
if (rc) {
pr_err("SELinux: unable to convert the internal"
" representation of contexts in the new SID"
" table\n");
- goto err;
+ goto err_free_isids;
}
- /* Save the old policydb and SID table to free later. */
- memcpy(oldpolicydb, policydb, sizeof(*policydb));
-
- /* Install the new policydb and SID table. */
- write_lock_irq(&state->ss->policy_rwlock);
- memcpy(policydb, newpolicydb, sizeof(*policydb));
- state->ss->sidtab = newsidtab;
- security_load_policycaps(state);
- oldmapping = state->ss->map.mapping;
- state->ss->map.mapping = newmap.mapping;
- state->ss->map.size = newmap.size;
- seqno = ++state->ss->latest_granting;
- write_unlock_irq(&state->ss->policy_rwlock);
-
- /* Free the old policydb and SID table. */
- policydb_destroy(oldpolicydb);
- sidtab_destroy(oldsidtab);
- kfree(oldsidtab);
- kfree(oldmapping);
-
- avc_ss_reset(state->avc, seqno);
- selnl_notify_policyload(seqno);
- selinux_status_update_policyload(state, seqno);
- selinux_netlbl_cache_invalidate();
- selinux_xfrm_notify_policyload();
-
- rc = 0;
- goto out;
+ *newpolicyp = newpolicy;
+ return 0;
-err:
- kfree(newmap.mapping);
- sidtab_destroy(newsidtab);
- kfree(newsidtab);
- policydb_destroy(newpolicydb);
+err_free_isids:
+ sidtab_destroy(newpolicy->sidtab);
+err_mapping:
+ kfree(newpolicy->map.mapping);
+err_policydb:
+ policydb_destroy(&newpolicy->policydb);
+err_sidtab:
+ kfree(newpolicy->sidtab);
+err_policy:
+ kfree(newpolicy);
-out:
- kfree(oldpolicydb);
return rc;
}
-size_t security_policydb_len(struct selinux_state *state)
-{
- struct policydb *p = &state->ss->policydb;
- size_t len;
-
- read_lock(&state->ss->policy_rwlock);
- len = p->len;
- read_unlock(&state->ss->policy_rwlock);
-
- return len;
-}
-
/**
* security_port_sid - Obtain the SID for a port.
* @protocol: protocol number
@@ -2306,15 +2338,21 @@ size_t security_policydb_len(struct selinux_state *state)
int security_port_sid(struct selinux_state *state,
u8 protocol, u16 port, u32 *out_sid)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct ocontext *c;
int rc = 0;
- read_lock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state)) {
+ *out_sid = SECINITSID_PORT;
+ return 0;
+ }
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
c = policydb->ocontexts[OCON_PORT];
while (c) {
@@ -2338,7 +2376,7 @@ int security_port_sid(struct selinux_state *state,
}
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -2351,15 +2389,21 @@ out:
int security_ib_pkey_sid(struct selinux_state *state,
u64 subnet_prefix, u16 pkey_num, u32 *out_sid)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct ocontext *c;
int rc = 0;
- read_lock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state)) {
+ *out_sid = SECINITSID_UNLABELED;
+ return 0;
+ }
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
c = policydb->ocontexts[OCON_IBPKEY];
while (c) {
@@ -2384,7 +2428,7 @@ int security_ib_pkey_sid(struct selinux_state *state,
*out_sid = SECINITSID_UNLABELED;
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -2397,15 +2441,21 @@ out:
int security_ib_endport_sid(struct selinux_state *state,
const char *dev_name, u8 port_num, u32 *out_sid)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct ocontext *c;
int rc = 0;
- read_lock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state)) {
+ *out_sid = SECINITSID_UNLABELED;
+ return 0;
+ }
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
c = policydb->ocontexts[OCON_IBENDPORT];
while (c) {
@@ -2430,7 +2480,7 @@ int security_ib_endport_sid(struct selinux_state *state,
*out_sid = SECINITSID_UNLABELED;
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -2442,15 +2492,21 @@ out:
int security_netif_sid(struct selinux_state *state,
char *name, u32 *if_sid)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
int rc = 0;
struct ocontext *c;
- read_lock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state)) {
+ *if_sid = SECINITSID_NETIF;
+ return 0;
+ }
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
c = policydb->ocontexts[OCON_NETIF];
while (c) {
@@ -2475,7 +2531,7 @@ int security_netif_sid(struct selinux_state *state,
*if_sid = SECINITSID_NETIF;
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -2505,15 +2561,21 @@ int security_node_sid(struct selinux_state *state,
u32 addrlen,
u32 *out_sid)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
int rc;
struct ocontext *c;
- read_lock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state)) {
+ *out_sid = SECINITSID_NODE;
+ return 0;
+ }
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
switch (domain) {
case AF_INET: {
@@ -2568,7 +2630,7 @@ int security_node_sid(struct selinux_state *state,
rc = 0;
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -2594,6 +2656,7 @@ int security_get_user_sids(struct selinux_state *state,
u32 **sids,
u32 *nel)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
struct context *fromcon, usercon;
@@ -2610,10 +2673,10 @@ int security_get_user_sids(struct selinux_state *state,
if (!selinux_initialized(state))
goto out;
- read_lock(&state->ss->policy_rwlock);
-
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
context_init(&usercon);
@@ -2664,7 +2727,7 @@ int security_get_user_sids(struct selinux_state *state,
}
rc = 0;
out_unlock:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
if (rc || !mynel) {
kfree(mysids);
goto out;
@@ -2705,17 +2768,15 @@ out:
* Obtain a SID to use for a file in a filesystem that
* cannot support xattr or use a fixed labeling behavior like
* transition SIDs or task SIDs.
- *
- * The caller must acquire the policy_rwlock before calling this function.
*/
-static inline int __security_genfs_sid(struct selinux_state *state,
+static inline int __security_genfs_sid(struct selinux_policy *policy,
const char *fstype,
char *path,
u16 orig_sclass,
u32 *sid)
{
- struct policydb *policydb = &state->ss->policydb;
- struct sidtab *sidtab = state->ss->sidtab;
+ struct policydb *policydb = &policy->policydb;
+ struct sidtab *sidtab = policy->sidtab;
int len;
u16 sclass;
struct genfs *genfs;
@@ -2725,7 +2786,7 @@ static inline int __security_genfs_sid(struct selinux_state *state,
while (path[0] == '/' && path[1] == '/')
path++;
- sclass = unmap_class(&state->ss->map, orig_sclass);
+ sclass = unmap_class(&policy->map, orig_sclass);
*sid = SECINITSID_UNLABELED;
for (genfs = policydb->genfs; genfs; genfs = genfs->next) {
@@ -2777,20 +2838,39 @@ int security_genfs_sid(struct selinux_state *state,
u16 orig_sclass,
u32 *sid)
{
+ struct selinux_policy *policy;
int retval;
- read_lock(&state->ss->policy_rwlock);
- retval = __security_genfs_sid(state, fstype, path, orig_sclass, sid);
- read_unlock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state)) {
+ *sid = SECINITSID_UNLABELED;
+ return 0;
+ }
+
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ retval = __security_genfs_sid(policy,
+ fstype, path, orig_sclass, sid);
+ rcu_read_unlock();
return retval;
}
+int selinux_policy_genfs_sid(struct selinux_policy *policy,
+ const char *fstype,
+ char *path,
+ u16 orig_sclass,
+ u32 *sid)
+{
+ /* no lock required, policy is not yet accessible by other threads */
+ return __security_genfs_sid(policy, fstype, path, orig_sclass, sid);
+}
+
/**
* security_fs_use - Determine how to handle labeling for a filesystem.
* @sb: superblock in question
*/
int security_fs_use(struct selinux_state *state, struct super_block *sb)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
struct sidtab *sidtab;
int rc = 0;
@@ -2798,10 +2878,16 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb)
struct superblock_security_struct *sbsec = sb->s_security;
const char *fstype = sb->s_type->name;
- read_lock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state)) {
+ sbsec->behavior = SECURITY_FS_USE_NONE;
+ sbsec->sid = SECINITSID_UNLABELED;
+ return 0;
+ }
- policydb = &state->ss->policydb;
- sidtab = state->ss->sidtab;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
c = policydb->ocontexts[OCON_FSUSE];
while (c) {
@@ -2820,8 +2906,8 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb)
}
sbsec->sid = c->sid[0];
} else {
- rc = __security_genfs_sid(state, fstype, "/", SECCLASS_DIR,
- &sbsec->sid);
+ rc = __security_genfs_sid(policy, fstype, "/",
+ SECCLASS_DIR, &sbsec->sid);
if (rc) {
sbsec->behavior = SECURITY_FS_USE_NONE;
rc = 0;
@@ -2831,27 +2917,18 @@ int security_fs_use(struct selinux_state *state, struct super_block *sb)
}
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
-int security_get_bools(struct selinux_state *state,
+int security_get_bools(struct selinux_policy *policy,
u32 *len, char ***names, int **values)
{
struct policydb *policydb;
u32 i;
int rc;
- if (!selinux_initialized(state)) {
- *len = 0;
- *names = NULL;
- *values = NULL;
- return 0;
- }
-
- read_lock(&state->ss->policy_rwlock);
-
- policydb = &state->ss->policydb;
+ policydb = &policy->policydb;
*names = NULL;
*values = NULL;
@@ -2882,7 +2959,6 @@ int security_get_bools(struct selinux_state *state,
}
rc = 0;
out:
- read_unlock(&state->ss->policy_rwlock);
return rc;
err:
if (*names) {
@@ -2900,61 +2976,89 @@ err:
int security_set_bools(struct selinux_state *state, u32 len, int *values)
{
- struct policydb *policydb;
+ struct selinux_policy *newpolicy, *oldpolicy;
int rc;
- u32 i, lenp, seqno = 0;
+ u32 i, seqno = 0;
- write_lock_irq(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state))
+ return -EINVAL;
- policydb = &state->ss->policydb;
+ oldpolicy = rcu_dereference_protected(state->policy,
+ lockdep_is_held(&state->policy_mutex));
- rc = -EFAULT;
- lenp = policydb->p_bools.nprim;
- if (len != lenp)
- goto out;
+ /* Consistency check on number of booleans, should never fail */
+ if (WARN_ON(len != oldpolicy->policydb.p_bools.nprim))
+ return -EINVAL;
+
+ newpolicy = kmemdup(oldpolicy, sizeof(*newpolicy), GFP_KERNEL);
+ if (!newpolicy)
+ return -ENOMEM;
+
+ /*
+ * Deep copy only the parts of the policydb that might be
+ * modified as a result of changing booleans.
+ */
+ rc = cond_policydb_dup(&newpolicy->policydb, &oldpolicy->policydb);
+ if (rc) {
+ kfree(newpolicy);
+ return -ENOMEM;
+ }
+ /* Update the boolean states in the copy */
for (i = 0; i < len; i++) {
- if (!!values[i] != policydb->bool_val_to_struct[i]->state) {
+ int new_state = !!values[i];
+ int old_state = newpolicy->policydb.bool_val_to_struct[i]->state;
+
+ if (new_state != old_state) {
audit_log(audit_context(), GFP_ATOMIC,
AUDIT_MAC_CONFIG_CHANGE,
"bool=%s val=%d old_val=%d auid=%u ses=%u",
- sym_name(policydb, SYM_BOOLS, i),
- !!values[i],
- policydb->bool_val_to_struct[i]->state,
+ sym_name(&newpolicy->policydb, SYM_BOOLS, i),
+ new_state,
+ old_state,
from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
+ newpolicy->policydb.bool_val_to_struct[i]->state = new_state;
}
- if (values[i])
- policydb->bool_val_to_struct[i]->state = 1;
- else
- policydb->bool_val_to_struct[i]->state = 0;
}
- evaluate_cond_nodes(policydb);
+ /* Re-evaluate the conditional rules in the copy */
+ evaluate_cond_nodes(&newpolicy->policydb);
- seqno = ++state->ss->latest_granting;
- rc = 0;
-out:
- write_unlock_irq(&state->ss->policy_rwlock);
- if (!rc) {
- avc_ss_reset(state->avc, seqno);
- selnl_notify_policyload(seqno);
- selinux_status_update_policyload(state, seqno);
- selinux_xfrm_notify_policyload();
- }
- return rc;
+ /* Set latest granting seqno for new policy */
+ newpolicy->latest_granting = oldpolicy->latest_granting + 1;
+ seqno = newpolicy->latest_granting;
+
+ /* Install the new policy */
+ rcu_assign_pointer(state->policy, newpolicy);
+
+ /*
+ * Free the conditional portions of the old policydb
+ * that were copied for the new policy, and the oldpolicy
+ * structure itself but not what it references.
+ */
+ synchronize_rcu();
+ selinux_policy_cond_free(oldpolicy);
+
+ /* Notify others of the policy change */
+ selinux_notify_policy_change(state, seqno);
+ return 0;
}
int security_get_bool_value(struct selinux_state *state,
u32 index)
{
+ struct selinux_policy *policy;
struct policydb *policydb;
int rc;
u32 len;
- read_lock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state))
+ return 0;
- policydb = &state->ss->policydb;
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
rc = -EFAULT;
len = policydb->p_bools.nprim;
@@ -2963,27 +3067,28 @@ int security_get_bool_value(struct selinux_state *state,
rc = policydb->bool_val_to_struct[index]->state;
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
-static int security_preserve_bools(struct selinux_state *state,
- struct policydb *policydb)
+static int security_preserve_bools(struct selinux_policy *oldpolicy,
+ struct selinux_policy *newpolicy)
{
int rc, *bvalues = NULL;
char **bnames = NULL;
struct cond_bool_datum *booldatum;
u32 i, nbools = 0;
- rc = security_get_bools(state, &nbools, &bnames, &bvalues);
+ rc = security_get_bools(oldpolicy, &nbools, &bnames, &bvalues);
if (rc)
goto out;
for (i = 0; i < nbools; i++) {
- booldatum = symtab_search(&policydb->p_bools, bnames[i]);
+ booldatum = symtab_search(&newpolicy->policydb.p_bools,
+ bnames[i]);
if (booldatum)
booldatum->state = bvalues[i];
}
- evaluate_cond_nodes(policydb);
+ evaluate_cond_nodes(&newpolicy->policydb);
out:
if (bnames) {
@@ -3002,8 +3107,9 @@ out:
int security_sid_mls_copy(struct selinux_state *state,
u32 sid, u32 mls_sid, u32 *new_sid)
{
- struct policydb *policydb = &state->ss->policydb;
- struct sidtab *sidtab = state->ss->sidtab;
+ struct selinux_policy *policy;
+ struct policydb *policydb;
+ struct sidtab *sidtab;
struct context *context1;
struct context *context2;
struct context newcon;
@@ -3012,14 +3118,22 @@ int security_sid_mls_copy(struct selinux_state *state,
int rc;
rc = 0;
- if (!selinux_initialized(state) || !policydb->mls_enabled) {
+ if (!selinux_initialized(state)) {
*new_sid = sid;
goto out;
}
context_init(&newcon);
- read_lock(&state->ss->policy_rwlock);
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
+
+ if (!policydb->mls_enabled) {
+ *new_sid = sid;
+ goto out_unlock;
+ }
rc = -EINVAL;
context1 = sidtab_search(sidtab, sid);
@@ -3046,7 +3160,8 @@ int security_sid_mls_copy(struct selinux_state *state,
/* Check the validity of the new context. */
if (!policydb_context_isvalid(policydb, &newcon)) {
- rc = convert_context_handle_invalid_context(state, &newcon);
+ rc = convert_context_handle_invalid_context(state, policydb,
+ &newcon);
if (rc) {
if (!context_struct_to_string(policydb, &newcon, &s,
&len)) {
@@ -3067,7 +3182,7 @@ int security_sid_mls_copy(struct selinux_state *state,
}
rc = sidtab_context_to_sid(sidtab, &newcon, new_sid);
out_unlock:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
context_destroy(&newcon);
out:
return rc;
@@ -3098,8 +3213,9 @@ int security_net_peersid_resolve(struct selinux_state *state,
u32 xfrm_sid,
u32 *peer_sid)
{
- struct policydb *policydb = &state->ss->policydb;
- struct sidtab *sidtab = state->ss->sidtab;
+ struct selinux_policy *policy;
+ struct policydb *policydb;
+ struct sidtab *sidtab;
int rc;
struct context *nlbl_ctx;
struct context *xfrm_ctx;
@@ -3121,15 +3237,23 @@ int security_net_peersid_resolve(struct selinux_state *state,
return 0;
}
+ if (!selinux_initialized(state))
+ return 0;
+
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
+
/*
* We don't need to check initialized here since the only way both
* nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the
* security server was initialized and state->initialized was true.
*/
- if (!policydb->mls_enabled)
- return 0;
-
- read_lock(&state->ss->policy_rwlock);
+ if (!policydb->mls_enabled) {
+ rc = 0;
+ goto out;
+ }
rc = -EINVAL;
nlbl_ctx = sidtab_search(sidtab, nlbl_sid);
@@ -3156,7 +3280,7 @@ int security_net_peersid_resolve(struct selinux_state *state,
* expressive */
*peer_sid = xfrm_sid;
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -3173,19 +3297,13 @@ static int get_classes_callback(void *k, void *d, void *args)
return 0;
}
-int security_get_classes(struct selinux_state *state,
+int security_get_classes(struct selinux_policy *policy,
char ***classes, int *nclasses)
{
- struct policydb *policydb = &state->ss->policydb;
+ struct policydb *policydb;
int rc;
- if (!selinux_initialized(state)) {
- *nclasses = 0;
- *classes = NULL;
- return 0;
- }
-
- read_lock(&state->ss->policy_rwlock);
+ policydb = &policy->policydb;
rc = -ENOMEM;
*nclasses = policydb->p_classes.nprim;
@@ -3203,7 +3321,6 @@ int security_get_classes(struct selinux_state *state,
}
out:
- read_unlock(&state->ss->policy_rwlock);
return rc;
}
@@ -3220,14 +3337,14 @@ static int get_permissions_callback(void *k, void *d, void *args)
return 0;
}
-int security_get_permissions(struct selinux_state *state,
+int security_get_permissions(struct selinux_policy *policy,
char *class, char ***perms, int *nperms)
{
- struct policydb *policydb = &state->ss->policydb;
+ struct policydb *policydb;
int rc, i;
struct class_datum *match;
- read_lock(&state->ss->policy_rwlock);
+ policydb = &policy->policydb;
rc = -EINVAL;
match = symtab_search(&policydb->p_classes, class);
@@ -3256,11 +3373,9 @@ int security_get_permissions(struct selinux_state *state,
goto err;
out:
- read_unlock(&state->ss->policy_rwlock);
return rc;
err:
- read_unlock(&state->ss->policy_rwlock);
for (i = 0; i < *nperms; i++)
kfree((*perms)[i]);
kfree(*perms);
@@ -3269,12 +3384,32 @@ err:
int security_get_reject_unknown(struct selinux_state *state)
{
- return state->ss->policydb.reject_unknown;
+ struct selinux_policy *policy;
+ int value;
+
+ if (!selinux_initialized(state))
+ return 0;
+
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ value = policy->policydb.reject_unknown;
+ rcu_read_unlock();
+ return value;
}
int security_get_allow_unknown(struct selinux_state *state)
{
- return state->ss->policydb.allow_unknown;
+ struct selinux_policy *policy;
+ int value;
+
+ if (!selinux_initialized(state))
+ return 0;
+
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ value = policy->policydb.allow_unknown;
+ rcu_read_unlock();
+ return value;
}
/**
@@ -3290,12 +3425,16 @@ int security_get_allow_unknown(struct selinux_state *state)
int security_policycap_supported(struct selinux_state *state,
unsigned int req_cap)
{
- struct policydb *policydb = &state->ss->policydb;
+ struct selinux_policy *policy;
int rc;
- read_lock(&state->ss->policy_rwlock);
- rc = ebitmap_get_bit(&policydb->policycaps, req_cap);
- read_unlock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state))
+ return 0;
+
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ rc = ebitmap_get_bit(&policy->policydb.policycaps, req_cap);
+ rcu_read_unlock();
return rc;
}
@@ -3318,7 +3457,8 @@ void selinux_audit_rule_free(void *vrule)
int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule)
{
struct selinux_state *state = &selinux_state;
- struct policydb *policydb = &state->ss->policydb;
+ struct selinux_policy *policy;
+ struct policydb *policydb;
struct selinux_audit_rule *tmprule;
struct role_datum *roledatum;
struct type_datum *typedatum;
@@ -3361,9 +3501,11 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule)
context_init(&tmprule->au_ctxt);
- read_lock(&state->ss->policy_rwlock);
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
- tmprule->au_seqno = state->ss->latest_granting;
+ tmprule->au_seqno = policy->latest_granting;
switch (field) {
case AUDIT_SUBJ_USER:
@@ -3402,7 +3544,7 @@ int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule)
}
rc = 0;
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
if (rc) {
selinux_audit_rule_free(tmprule);
@@ -3442,6 +3584,7 @@ int selinux_audit_rule_known(struct audit_krule *rule)
int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
{
struct selinux_state *state = &selinux_state;
+ struct selinux_policy *policy;
struct context *ctxt;
struct mls_level *level;
struct selinux_audit_rule *rule = vrule;
@@ -3452,14 +3595,19 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
return -ENOENT;
}
- read_lock(&state->ss->policy_rwlock);
+ if (!selinux_initialized(state))
+ return 0;
+
+ rcu_read_lock();
- if (rule->au_seqno < state->ss->latest_granting) {
+ policy = rcu_dereference(state->policy);
+
+ if (rule->au_seqno < policy->latest_granting) {
match = -ESTALE;
goto out;
}
- ctxt = sidtab_search(state->ss->sidtab, sid);
+ ctxt = sidtab_search(policy->sidtab, sid);
if (unlikely(!ctxt)) {
WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %d\n",
sid);
@@ -3543,7 +3691,7 @@ int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule)
}
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return match;
}
@@ -3621,8 +3769,9 @@ int security_netlbl_secattr_to_sid(struct selinux_state *state,
struct netlbl_lsm_secattr *secattr,
u32 *sid)
{
- struct policydb *policydb = &state->ss->policydb;
- struct sidtab *sidtab = state->ss->sidtab;
+ struct selinux_policy *policy;
+ struct policydb *policydb;
+ struct sidtab *sidtab;
int rc;
struct context *ctx;
struct context ctx_new;
@@ -3632,7 +3781,10 @@ int security_netlbl_secattr_to_sid(struct selinux_state *state,
return 0;
}
- read_lock(&state->ss->policy_rwlock);
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
+ sidtab = policy->sidtab;
if (secattr->flags & NETLBL_SECATTR_CACHE)
*sid = *(u32 *)secattr->cache->data;
@@ -3668,12 +3820,12 @@ int security_netlbl_secattr_to_sid(struct selinux_state *state,
} else
*sid = SECSID_NULL;
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return 0;
out_free:
ebitmap_destroy(&ctx_new.range.level[0].cat);
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
@@ -3690,17 +3842,20 @@ out:
int security_netlbl_sid_to_secattr(struct selinux_state *state,
u32 sid, struct netlbl_lsm_secattr *secattr)
{
- struct policydb *policydb = &state->ss->policydb;
+ struct selinux_policy *policy;
+ struct policydb *policydb;
int rc;
struct context *ctx;
if (!selinux_initialized(state))
return 0;
- read_lock(&state->ss->policy_rwlock);
+ rcu_read_lock();
+ policy = rcu_dereference(state->policy);
+ policydb = &policy->policydb;
rc = -ENOENT;
- ctx = sidtab_search(state->ss->sidtab, sid);
+ ctx = sidtab_search(policy->sidtab, sid);
if (ctx == NULL)
goto out;
@@ -3715,7 +3870,7 @@ int security_netlbl_sid_to_secattr(struct selinux_state *state,
mls_export_netlbl_lvl(policydb, ctx, secattr);
rc = mls_export_netlbl_cat(policydb, ctx, secattr);
out:
- read_unlock(&state->ss->policy_rwlock);
+ rcu_read_unlock();
return rc;
}
#endif /* CONFIG_NETLABEL */
@@ -3729,15 +3884,16 @@ out:
int security_read_policy(struct selinux_state *state,
void **data, size_t *len)
{
- struct policydb *policydb = &state->ss->policydb;
+ struct selinux_policy *policy;
int rc;
struct policy_file fp;
- if (!selinux_initialized(state))
+ policy = rcu_dereference_protected(
+ state->policy, lockdep_is_held(&state->policy_mutex));
+ if (!policy)
return -EINVAL;
- *len = security_policydb_len(state);
-
+ *len = policy->policydb.len;
*data = vmalloc_user(*len);
if (!*data)
return -ENOMEM;
@@ -3745,10 +3901,7 @@ int security_read_policy(struct selinux_state *state,
fp.data = *data;
fp.len = *len;
- read_lock(&state->ss->policy_rwlock);
- rc = policydb_write(policydb, &fp);
- read_unlock(&state->ss->policy_rwlock);
-
+ rc = policydb_write(&policy->policydb, &fp);
if (rc)
return rc;
diff --git a/security/selinux/ss/services.h b/security/selinux/ss/services.h
index a06f3d835216..9555ad074303 100644
--- a/security/selinux/ss/services.h
+++ b/security/selinux/ss/services.h
@@ -22,12 +22,11 @@ struct selinux_map {
u16 size; /* array size of mapping */
};
-struct selinux_ss {
+struct selinux_policy {
struct sidtab *sidtab;
struct policydb policydb;
- rwlock_t policy_rwlock;
- u32 latest_granting;
struct selinux_map map;
+ u32 latest_granting;
} __randomize_layout;
void services_compute_xperms_drivers(struct extended_perms *xperms,
diff --git a/security/selinux/ss/sidtab.c b/security/selinux/ss/sidtab.c
index eb6d27b5aeb4..5ee190bd30f5 100644
--- a/security/selinux/ss/sidtab.c
+++ b/security/selinux/ss/sidtab.c
@@ -464,6 +464,16 @@ int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params)
return 0;
}
+void sidtab_cancel_convert(struct sidtab *s)
+{
+ unsigned long flags;
+
+ /* cancelling policy load - disable live convert of sidtab */
+ spin_lock_irqsave(&s->lock, flags);
+ s->convert = NULL;
+ spin_unlock_irqrestore(&s->lock, flags);
+}
+
static void sidtab_destroy_entry(struct sidtab_entry *entry)
{
context_destroy(&entry->context);
diff --git a/security/selinux/ss/sidtab.h b/security/selinux/ss/sidtab.h
index f2a84560b8b3..80c744d07ad6 100644
--- a/security/selinux/ss/sidtab.h
+++ b/security/selinux/ss/sidtab.h
@@ -123,6 +123,8 @@ static inline struct context *sidtab_search_force(struct sidtab *s, u32 sid)
int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params);
+void sidtab_cancel_convert(struct sidtab *s);
+
int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid);
void sidtab_destroy(struct sidtab *s);
diff --git a/security/smack/smack.h b/security/smack/smack.h
index e9e817d09785..a9768b12716b 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -100,7 +100,12 @@ struct socket_smack {
struct smack_known *smk_out; /* outbound label */
struct smack_known *smk_in; /* inbound label */
struct smack_known *smk_packet; /* TCP peer label */
+ int smk_state; /* netlabel socket states */
};
+#define SMK_NETLBL_UNSET 0
+#define SMK_NETLBL_UNLABELED 1
+#define SMK_NETLBL_LABELED 2
+#define SMK_NETLBL_REQSKB 3
/*
* Inode smack data
@@ -197,19 +202,6 @@ enum {
#define SMACK_CIPSO_OPTION "-CIPSO"
/*
- * How communications on this socket are treated.
- * Usually it's determined by the underlying netlabel code
- * but there are certain cases, including single label hosts
- * and potentially single label interfaces for which the
- * treatment can not be known in advance.
- *
- * The possibility of additional labeling schemes being
- * introduced in the future exists as well.
- */
-#define SMACK_UNLABELED_SOCKET 0
-#define SMACK_CIPSO_SOCKET 1
-
-/*
* CIPSO defaults.
*/
#define SMACK_CIPSO_DOI_DEFAULT 3 /* Historical */
@@ -305,6 +297,7 @@ struct smack_known *smk_find_entry(const char *);
bool smack_privileged(int cap);
bool smack_privileged_cred(int cap, const struct cred *cred);
void smk_destroy_label_list(struct list_head *list);
+int smack_populate_secattr(struct smack_known *skp);
/*
* Shared data.
diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c
index 38ac3da4e791..efe2406a3960 100644
--- a/security/smack/smack_access.c
+++ b/security/smack/smack_access.c
@@ -511,6 +511,42 @@ int smk_netlbl_mls(int level, char *catset, struct netlbl_lsm_secattr *sap,
}
/**
+ * smack_populate_secattr - fill in the smack_known netlabel information
+ * @skp: pointer to the structure to fill
+ *
+ * Populate the netlabel secattr structure for a Smack label.
+ *
+ * Returns 0 unless creating the category mapping fails
+ */
+int smack_populate_secattr(struct smack_known *skp)
+{
+ int slen;
+
+ skp->smk_netlabel.attr.secid = skp->smk_secid;
+ skp->smk_netlabel.domain = skp->smk_known;
+ skp->smk_netlabel.cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
+ if (skp->smk_netlabel.cache != NULL) {
+ skp->smk_netlabel.flags |= NETLBL_SECATTR_CACHE;
+ skp->smk_netlabel.cache->free = NULL;
+ skp->smk_netlabel.cache->data = skp;
+ }
+ skp->smk_netlabel.flags |= NETLBL_SECATTR_SECID |
+ NETLBL_SECATTR_MLS_LVL |
+ NETLBL_SECATTR_DOMAIN;
+ /*
+ * If direct labeling works use it.
+ * Otherwise use mapped labeling.
+ */
+ slen = strlen(skp->smk_known);
+ if (slen < SMK_CIPSOLEN)
+ return smk_netlbl_mls(smack_cipso_direct, skp->smk_known,
+ &skp->smk_netlabel, slen);
+
+ return smk_netlbl_mls(smack_cipso_mapped, (char *)&skp->smk_secid,
+ &skp->smk_netlabel, sizeof(skp->smk_secid));
+}
+
+/**
* smk_import_entry - import a label, return the list entry
* @string: a text string that might be a Smack label
* @len: the maximum size, or zero if it is NULL terminated.
@@ -523,7 +559,6 @@ struct smack_known *smk_import_entry(const char *string, int len)
{
struct smack_known *skp;
char *smack;
- int slen;
int rc;
smack = smk_parse_smack(string, len);
@@ -544,21 +579,8 @@ struct smack_known *smk_import_entry(const char *string, int len)
skp->smk_known = smack;
skp->smk_secid = smack_next_secid++;
- skp->smk_netlabel.domain = skp->smk_known;
- skp->smk_netlabel.flags =
- NETLBL_SECATTR_DOMAIN | NETLBL_SECATTR_MLS_LVL;
- /*
- * If direct labeling works use it.
- * Otherwise use mapped labeling.
- */
- slen = strlen(smack);
- if (slen < SMK_CIPSOLEN)
- rc = smk_netlbl_mls(smack_cipso_direct, skp->smk_known,
- &skp->smk_netlabel, slen);
- else
- rc = smk_netlbl_mls(smack_cipso_mapped, (char *)&skp->smk_secid,
- &skp->smk_netlabel, sizeof(skp->smk_secid));
+ rc = smack_populate_secattr(skp);
if (rc >= 0) {
INIT_LIST_HEAD(&skp->smk_rules);
mutex_init(&skp->smk_rules_lock);
@@ -569,9 +591,6 @@ struct smack_known *smk_import_entry(const char *string, int len)
smk_insert_entry(skp);
goto unlockout;
}
- /*
- * smk_netlbl_mls failed.
- */
kfree(skp);
skp = ERR_PTR(rc);
freeout:
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 8c0893eb5aa8..5c90b9fa4d40 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -2383,38 +2383,31 @@ static struct smack_known *smack_ipv6host_label(struct sockaddr_in6 *sip)
}
/**
- * smack_netlabel - Set the secattr on a socket
+ * smack_netlbl_add - Set the secattr on a socket
* @sk: the socket
- * @labeled: socket label scheme
*
- * Convert the outbound smack value (smk_out) to a
- * secattr and attach it to the socket.
+ * Attach the outbound smack value (smk_out) to the socket.
*
* Returns 0 on success or an error code
*/
-static int smack_netlabel(struct sock *sk, int labeled)
+static int smack_netlbl_add(struct sock *sk)
{
- struct smack_known *skp;
struct socket_smack *ssp = sk->sk_security;
- int rc = 0;
+ struct smack_known *skp = ssp->smk_out;
+ int rc;
- /*
- * Usually the netlabel code will handle changing the
- * packet labeling based on the label.
- * The case of a single label host is different, because
- * a single label host should never get a labeled packet
- * even though the label is usually associated with a packet
- * label.
- */
local_bh_disable();
bh_lock_sock_nested(sk);
- if (ssp->smk_out == smack_net_ambient ||
- labeled == SMACK_UNLABELED_SOCKET)
- netlbl_sock_delattr(sk);
- else {
- skp = ssp->smk_out;
- rc = netlbl_sock_setattr(sk, sk->sk_family, &skp->smk_netlabel);
+ rc = netlbl_sock_setattr(sk, sk->sk_family, &skp->smk_netlabel);
+ switch (rc) {
+ case 0:
+ ssp->smk_state = SMK_NETLBL_LABELED;
+ break;
+ case -EDESTADDRREQ:
+ ssp->smk_state = SMK_NETLBL_REQSKB;
+ rc = 0;
+ break;
}
bh_unlock_sock(sk);
@@ -2424,7 +2417,31 @@ static int smack_netlabel(struct sock *sk, int labeled)
}
/**
- * smack_netlbel_send - Set the secattr on a socket and perform access checks
+ * smack_netlbl_delete - Remove the secattr from a socket
+ * @sk: the socket
+ *
+ * Remove the outbound smack value from a socket
+ */
+static void smack_netlbl_delete(struct sock *sk)
+{
+ struct socket_smack *ssp = sk->sk_security;
+
+ /*
+ * Take the label off the socket if one is set.
+ */
+ if (ssp->smk_state != SMK_NETLBL_LABELED)
+ return;
+
+ local_bh_disable();
+ bh_lock_sock_nested(sk);
+ netlbl_sock_delattr(sk);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ ssp->smk_state = SMK_NETLBL_UNLABELED;
+}
+
+/**
+ * smk_ipv4_check - Perform IPv4 host access checks
* @sk: the socket
* @sap: the destination address
*
@@ -2434,11 +2451,10 @@ static int smack_netlabel(struct sock *sk, int labeled)
* Returns 0 on success or an error code.
*
*/
-static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap)
+static int smk_ipv4_check(struct sock *sk, struct sockaddr_in *sap)
{
struct smack_known *skp;
- int rc;
- int sk_lbl;
+ int rc = 0;
struct smack_known *hkp;
struct socket_smack *ssp = sk->sk_security;
struct smk_audit_info ad;
@@ -2454,19 +2470,18 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap)
ad.a.u.net->dport = sap->sin_port;
ad.a.u.net->v4info.daddr = sap->sin_addr.s_addr;
#endif
- sk_lbl = SMACK_UNLABELED_SOCKET;
skp = ssp->smk_out;
rc = smk_access(skp, hkp, MAY_WRITE, &ad);
rc = smk_bu_note("IPv4 host check", skp, hkp, MAY_WRITE, rc);
- } else {
- sk_lbl = SMACK_CIPSO_SOCKET;
- rc = 0;
+ /*
+ * Clear the socket netlabel if it's set.
+ */
+ if (!rc)
+ smack_netlbl_delete(sk);
}
rcu_read_unlock();
- if (rc != 0)
- return rc;
- return smack_netlabel(sk, sk_lbl);
+ return rc;
}
/**
@@ -2703,7 +2718,7 @@ static int smack_inode_setsecurity(struct inode *inode, const char *name,
else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) {
ssp->smk_out = skp;
if (sock->sk->sk_family == PF_INET) {
- rc = smack_netlabel(sock->sk, SMACK_CIPSO_SOCKET);
+ rc = smack_netlbl_add(sock->sk);
if (rc != 0)
printk(KERN_WARNING
"Smack: \"%s\" netlbl error %d.\n",
@@ -2754,7 +2769,7 @@ static int smack_socket_post_create(struct socket *sock, int family,
/*
* Set the outbound netlbl.
*/
- return smack_netlabel(sock->sk, SMACK_CIPSO_SOCKET);
+ return smack_netlbl_add(sock->sk);
}
/**
@@ -2845,7 +2860,7 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap,
}
if (sap->sa_family != AF_INET || addrlen < sizeof(struct sockaddr_in))
return 0;
- rc = smack_netlabel_send(sock->sk, (struct sockaddr_in *)sap);
+ rc = smk_ipv4_check(sock->sk, (struct sockaddr_in *)sap);
return rc;
}
@@ -3663,7 +3678,7 @@ static int smack_socket_sendmsg(struct socket *sock, struct msghdr *msg,
if (msg->msg_namelen < sizeof(struct sockaddr_in) ||
sip->sin_family != AF_INET)
return -EINVAL;
- rc = smack_netlabel_send(sock->sk, sip);
+ rc = smk_ipv4_check(sock->sk, sip);
break;
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
@@ -3700,6 +3715,18 @@ static struct smack_known *smack_from_secattr(struct netlbl_lsm_secattr *sap,
int acat;
int kcat;
+ /*
+ * Netlabel found it in the cache.
+ */
+ if ((sap->flags & NETLBL_SECATTR_CACHE) != 0)
+ return (struct smack_known *)sap->cache->data;
+
+ if ((sap->flags & NETLBL_SECATTR_SECID) != 0)
+ /*
+ * Looks like a fallback, which gives us a secid.
+ */
+ return smack_from_secid(sap->attr.secid);
+
if ((sap->flags & NETLBL_SECATTR_MLS_LVL) != 0) {
/*
* Looks like a CIPSO packet.
@@ -3747,11 +3774,6 @@ static struct smack_known *smack_from_secattr(struct netlbl_lsm_secattr *sap,
return &smack_known_web;
return &smack_known_star;
}
- if ((sap->flags & NETLBL_SECATTR_SECID) != 0)
- /*
- * Looks like a fallback, which gives us a secid.
- */
- return smack_from_secid(sap->attr.secid);
/*
* Without guidance regarding the smack value
* for the packet fall back on the network
@@ -3811,6 +3833,62 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip)
#endif /* CONFIG_IPV6 */
/**
+ * smack_from_skb - Smack data from the secmark in an skb
+ * @skb: packet
+ *
+ * Returns smack_known of the secmark or NULL if that won't work.
+ */
+#ifdef CONFIG_NETWORK_SECMARK
+static struct smack_known *smack_from_skb(struct sk_buff *skb)
+{
+ if (skb == NULL || skb->secmark == 0)
+ return NULL;
+
+ return smack_from_secid(skb->secmark);
+}
+#else
+static inline struct smack_known *smack_from_skb(struct sk_buff *skb)
+{
+ return NULL;
+}
+#endif
+
+/**
+ * smack_from_netlbl - Smack data from the IP options in an skb
+ * @sk: socket data came in on
+ * @family: address family
+ * @skb: packet
+ *
+ * Find the Smack label in the IP options. If it hasn't been
+ * added to the netlabel cache, add it here.
+ *
+ * Returns smack_known of the IP options or NULL if that won't work.
+ */
+static struct smack_known *smack_from_netlbl(struct sock *sk, u16 family,
+ struct sk_buff *skb)
+{
+ struct netlbl_lsm_secattr secattr;
+ struct socket_smack *ssp = NULL;
+ struct smack_known *skp = NULL;
+ int rc;
+
+ netlbl_secattr_init(&secattr);
+
+ if (sk)
+ ssp = sk->sk_security;
+
+ if (netlbl_skbuff_getattr(skb, family, &secattr) == 0) {
+ skp = smack_from_secattr(&secattr, ssp);
+ if (secattr.flags & NETLBL_SECATTR_CACHEABLE)
+ rc = netlbl_cache_add(skb, family, &skp->smk_netlabel);
+ }
+
+ netlbl_secattr_destroy(&secattr);
+
+ return skp;
+}
+
+/**
* smack_socket_sock_rcv_skb - Smack packet delivery access check
* @sk: socket
* @skb: packet
@@ -3819,7 +3897,6 @@ static int smk_skb_to_addr_ipv6(struct sk_buff *skb, struct sockaddr_in6 *sip)
*/
static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- struct netlbl_lsm_secattr secattr;
struct socket_smack *ssp = sk->sk_security;
struct smack_known *skp = NULL;
int rc = 0;
@@ -3838,33 +3915,18 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
switch (family) {
case PF_INET:
-#ifdef CONFIG_SECURITY_SMACK_NETFILTER
/*
* If there is a secmark use it rather than the CIPSO label.
* If there is no secmark fall back to CIPSO.
* The secmark is assumed to reflect policy better.
*/
- if (skb && skb->secmark != 0) {
- skp = smack_from_secid(skb->secmark);
- goto access_check;
+ skp = smack_from_skb(skb);
+ if (skp == NULL) {
+ skp = smack_from_netlbl(sk, family, skb);
+ if (skp == NULL)
+ skp = smack_net_ambient;
}
-#endif /* CONFIG_SECURITY_SMACK_NETFILTER */
- /*
- * Translate what netlabel gave us.
- */
- netlbl_secattr_init(&secattr);
- rc = netlbl_skbuff_getattr(skb, family, &secattr);
- if (rc == 0)
- skp = smack_from_secattr(&secattr, ssp);
- else
- skp = smack_net_ambient;
-
- netlbl_secattr_destroy(&secattr);
-
-#ifdef CONFIG_SECURITY_SMACK_NETFILTER
-access_check:
-#endif
#ifdef CONFIG_AUDIT
smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
ad.a.u.net->family = family;
@@ -3890,16 +3952,14 @@ access_check:
proto != IPPROTO_TCP && proto != IPPROTO_DCCP)
break;
#ifdef SMACK_IPV6_SECMARK_LABELING
- if (skb && skb->secmark != 0)
- skp = smack_from_secid(skb->secmark);
- else if (smk_ipv6_localhost(&sadd))
- break;
- else
+ skp = smack_from_skb(skb);
+ if (skp == NULL) {
+ if (smk_ipv6_localhost(&sadd))
+ break;
skp = smack_ipv6host_label(&sadd);
- if (skp == NULL)
- skp = smack_net_ambient;
- if (skb == NULL)
- break;
+ if (skp == NULL)
+ skp = smack_net_ambient;
+ }
#ifdef CONFIG_AUDIT
smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
ad.a.u.net->family = family;
@@ -3971,12 +4031,11 @@ static int smack_socket_getpeersec_dgram(struct socket *sock,
struct sk_buff *skb, u32 *secid)
{
- struct netlbl_lsm_secattr secattr;
struct socket_smack *ssp = NULL;
struct smack_known *skp;
+ struct sock *sk = NULL;
int family = PF_UNSPEC;
u32 s = 0; /* 0 is the invalid secid */
- int rc;
if (skb != NULL) {
if (skb->protocol == htons(ETH_P_IP))
@@ -3995,27 +4054,25 @@ static int smack_socket_getpeersec_dgram(struct socket *sock,
s = ssp->smk_out->smk_secid;
break;
case PF_INET:
-#ifdef CONFIG_SECURITY_SMACK_NETFILTER
- s = skb->secmark;
- if (s != 0)
+ skp = smack_from_skb(skb);
+ if (skp) {
+ s = skp->smk_secid;
break;
-#endif
+ }
/*
* Translate what netlabel gave us.
*/
- if (sock != NULL && sock->sk != NULL)
- ssp = sock->sk->sk_security;
- netlbl_secattr_init(&secattr);
- rc = netlbl_skbuff_getattr(skb, family, &secattr);
- if (rc == 0) {
- skp = smack_from_secattr(&secattr, ssp);
+ if (sock != NULL)
+ sk = sock->sk;
+ skp = smack_from_netlbl(sk, family, skb);
+ if (skp != NULL)
s = skp->smk_secid;
- }
- netlbl_secattr_destroy(&secattr);
break;
case PF_INET6:
#ifdef SMACK_IPV6_SECMARK_LABELING
- s = skb->secmark;
+ skp = smack_from_skb(skb);
+ if (skp)
+ s = skp->smk_secid;
#endif
break;
}
@@ -4063,7 +4120,6 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
u16 family = sk->sk_family;
struct smack_known *skp;
struct socket_smack *ssp = sk->sk_security;
- struct netlbl_lsm_secattr secattr;
struct sockaddr_in addr;
struct iphdr *hdr;
struct smack_known *hskp;
@@ -4087,29 +4143,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
}
#endif /* CONFIG_IPV6 */
-#ifdef CONFIG_SECURITY_SMACK_NETFILTER
/*
* If there is a secmark use it rather than the CIPSO label.
* If there is no secmark fall back to CIPSO.
* The secmark is assumed to reflect policy better.
*/
- if (skb && skb->secmark != 0) {
- skp = smack_from_secid(skb->secmark);
- goto access_check;
+ skp = smack_from_skb(skb);
+ if (skp == NULL) {
+ skp = smack_from_netlbl(sk, family, skb);
+ if (skp == NULL)
+ skp = &smack_known_huh;
}
-#endif /* CONFIG_SECURITY_SMACK_NETFILTER */
-
- netlbl_secattr_init(&secattr);
- rc = netlbl_skbuff_getattr(skb, family, &secattr);
- if (rc == 0)
- skp = smack_from_secattr(&secattr, ssp);
- else
- skp = &smack_known_huh;
- netlbl_secattr_destroy(&secattr);
-
-#ifdef CONFIG_SECURITY_SMACK_NETFILTER
-access_check:
-#endif
#ifdef CONFIG_AUDIT
smk_ad_init_net(&ad, __func__, LSM_AUDIT_DATA_NET, &net);
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 9c4308077574..e567b4baf3a0 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -922,6 +922,10 @@ static ssize_t smk_set_cipso(struct file *file, const char __user *buf,
skp->smk_netlabel.attr.mls.cat = ncats.attr.mls.cat;
skp->smk_netlabel.attr.mls.lvl = ncats.attr.mls.lvl;
rc = count;
+ /*
+ * This mapping may have been cached, so clear the cache.
+ */
+ netlbl_cache_invalidate();
}
out:
@@ -2950,15 +2954,6 @@ static struct file_system_type smk_fs_type = {
static struct vfsmount *smackfs_mount;
-static int __init smk_preset_netlabel(struct smack_known *skp)
-{
- skp->smk_netlabel.domain = skp->smk_known;
- skp->smk_netlabel.flags =
- NETLBL_SECATTR_DOMAIN | NETLBL_SECATTR_MLS_LVL;
- return smk_netlbl_mls(smack_cipso_direct, skp->smk_known,
- &skp->smk_netlabel, strlen(skp->smk_known));
-}
-
/**
* init_smk_fs - get the smackfs superblock
*
@@ -2997,19 +2992,19 @@ static int __init init_smk_fs(void)
smk_cipso_doi();
smk_unlbl_ambient(NULL);
- rc = smk_preset_netlabel(&smack_known_floor);
+ rc = smack_populate_secattr(&smack_known_floor);
if (err == 0 && rc < 0)
err = rc;
- rc = smk_preset_netlabel(&smack_known_hat);
+ rc = smack_populate_secattr(&smack_known_hat);
if (err == 0 && rc < 0)
err = rc;
- rc = smk_preset_netlabel(&smack_known_huh);
+ rc = smack_populate_secattr(&smack_known_huh);
if (err == 0 && rc < 0)
err = rc;
- rc = smk_preset_netlabel(&smack_known_star);
+ rc = smack_populate_secattr(&smack_known_star);
if (err == 0 && rc < 0)
err = rc;
- rc = smk_preset_netlabel(&smack_known_web);
+ rc = smack_populate_secattr(&smack_known_web);
if (err == 0 && rc < 0)
err = rc;
diff --git a/security/tomoyo/util.c b/security/tomoyo/util.c
index eba0b3395851..a40abb0b91ee 100644
--- a/security/tomoyo/util.c
+++ b/security/tomoyo/util.c
@@ -143,6 +143,8 @@ char *tomoyo_read_token(struct tomoyo_acl_param *param)
return pos;
}
+static bool tomoyo_correct_path2(const char *filename, const size_t len);
+
/**
* tomoyo_get_domainname - Read a domainname from a line.
*
@@ -157,10 +159,10 @@ const struct tomoyo_path_info *tomoyo_get_domainname
char *pos = start;
while (*pos) {
- if (*pos++ != ' ' || *pos++ == '/')
+ if (*pos++ != ' ' ||
+ tomoyo_correct_path2(pos, strchrnul(pos, ' ') - pos))
continue;
- pos -= 2;
- *pos++ = '\0';
+ *(pos - 1) = '\0';
break;
}
param->data = pos;
@@ -514,6 +516,22 @@ bool tomoyo_correct_word(const char *string)
}
/**
+ * tomoyo_correct_path2 - Check whether the given pathname follows the naming rules.
+ *
+ * @filename: The pathname to check.
+ * @len: Length of @filename.
+ *
+ * Returns true if @filename follows the naming rules, false otherwise.
+ */
+static bool tomoyo_correct_path2(const char *filename, const size_t len)
+{
+ const char *cp1 = memchr(filename, '/', len);
+ const char *cp2 = memchr(filename, '.', len);
+
+ return cp1 && (!cp2 || (cp1 < cp2)) && tomoyo_correct_word2(filename, len);
+}
+
+/**
* tomoyo_correct_path - Validate a pathname.
*
* @filename: The pathname to check.
@@ -523,7 +541,7 @@ bool tomoyo_correct_word(const char *string)
*/
bool tomoyo_correct_path(const char *filename)
{
- return *filename == '/' && tomoyo_correct_word(filename);
+ return tomoyo_correct_path2(filename, strlen(filename));
}
/**
@@ -545,8 +563,7 @@ bool tomoyo_correct_domain(const unsigned char *domainname)
if (!cp)
break;
- if (*domainname != '/' ||
- !tomoyo_correct_word2(domainname, cp - domainname))
+ if (!tomoyo_correct_path2(domainname, cp - domainname))
return false;
domainname = cp + 1;
}
diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c
index b7e6dec36173..42be3b925830 100644
--- a/tools/testing/selftests/clone3/clone3.c
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -20,13 +20,6 @@
#include "../kselftest.h"
#include "clone3_selftests.h"
-/*
- * Different sizes of struct clone_args
- */
-#ifndef CLONE3_ARGS_SIZE_V0
-#define CLONE3_ARGS_SIZE_V0 64
-#endif
-
enum test_mode {
CLONE3_ARGS_NO_TEST,
CLONE3_ARGS_ALL_0,
@@ -38,13 +31,13 @@ enum test_mode {
static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
{
- struct clone_args args = {
+ struct __clone_args args = {
.flags = flags,
.exit_signal = SIGCHLD,
};
struct clone_args_extended {
- struct clone_args args;
+ struct __clone_args args;
__aligned_u64 excess_space[2];
} args_ext;
@@ -52,11 +45,11 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
int status;
memset(&args_ext, 0, sizeof(args_ext));
- if (size > sizeof(struct clone_args))
+ if (size > sizeof(struct __clone_args))
args_ext.excess_space[1] = 1;
if (size == 0)
- size = sizeof(struct clone_args);
+ size = sizeof(struct __clone_args);
switch (test_mode) {
case CLONE3_ARGS_ALL_0:
@@ -77,9 +70,9 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
break;
}
- memcpy(&args_ext.args, &args, sizeof(struct clone_args));
+ memcpy(&args_ext.args, &args, sizeof(struct __clone_args));
- pid = sys_clone3((struct clone_args *)&args_ext, size);
+ pid = sys_clone3((struct __clone_args *)&args_ext, size);
if (pid < 0) {
ksft_print_msg("%s - Failed to create new process\n",
strerror(errno));
@@ -144,14 +137,14 @@ int main(int argc, char *argv[])
else
ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
- /* Do a clone3() with CLONE3_ARGS_SIZE_V0. */
- test_clone3(0, CLONE3_ARGS_SIZE_V0, 0, CLONE3_ARGS_NO_TEST);
+ /* Do a clone3() with CLONE_ARGS_SIZE_VER0. */
+ test_clone3(0, CLONE_ARGS_SIZE_VER0, 0, CLONE3_ARGS_NO_TEST);
- /* Do a clone3() with CLONE3_ARGS_SIZE_V0 - 8 */
- test_clone3(0, CLONE3_ARGS_SIZE_V0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST);
+ /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 */
+ test_clone3(0, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST);
/* Do a clone3() with sizeof(struct clone_args) + 8 */
- test_clone3(0, sizeof(struct clone_args) + 8, 0, CLONE3_ARGS_NO_TEST);
+ test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_NO_TEST);
/* Do a clone3() with exit_signal having highest 32 bits non-zero */
test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG);
@@ -165,31 +158,31 @@ int main(int argc, char *argv[])
/* Do a clone3() with NSIG < exit_signal < CSIG */
test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG);
- test_clone3(0, sizeof(struct clone_args) + 8, 0, CLONE3_ARGS_ALL_0);
+ test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_ALL_0);
- test_clone3(0, sizeof(struct clone_args) + 16, -E2BIG,
+ test_clone3(0, sizeof(struct __clone_args) + 16, -E2BIG,
CLONE3_ARGS_ALL_0);
- test_clone3(0, sizeof(struct clone_args) * 2, -E2BIG,
+ test_clone3(0, sizeof(struct __clone_args) * 2, -E2BIG,
CLONE3_ARGS_ALL_0);
/* Do a clone3() with > page size */
test_clone3(0, getpagesize() + 8, -E2BIG, CLONE3_ARGS_NO_TEST);
- /* Do a clone3() with CLONE3_ARGS_SIZE_V0 in a new PID NS. */
+ /* Do a clone3() with CLONE_ARGS_SIZE_VER0 in a new PID NS. */
if (uid == 0)
- test_clone3(CLONE_NEWPID, CLONE3_ARGS_SIZE_V0, 0,
+ test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0, 0,
CLONE3_ARGS_NO_TEST);
else
ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
- /* Do a clone3() with CLONE3_ARGS_SIZE_V0 - 8 in a new PID NS */
- test_clone3(CLONE_NEWPID, CLONE3_ARGS_SIZE_V0 - 8, -EINVAL,
+ /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 in a new PID NS */
+ test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL,
CLONE3_ARGS_NO_TEST);
/* Do a clone3() with sizeof(struct clone_args) + 8 in a new PID NS */
if (uid == 0)
- test_clone3(CLONE_NEWPID, sizeof(struct clone_args) + 8, 0,
+ test_clone3(CLONE_NEWPID, sizeof(struct __clone_args) + 8, 0,
CLONE3_ARGS_NO_TEST);
else
ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
diff --git a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
index 9562425aa0a9..55bd387ce7ec 100644
--- a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
+++ b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
@@ -44,13 +44,13 @@ static int call_clone3_set_tid(struct __test_metadata *_metadata,
int status;
pid_t pid = -1;
- struct clone_args args = {
+ struct __clone_args args = {
.exit_signal = SIGCHLD,
.set_tid = ptr_to_u64(set_tid),
.set_tid_size = set_tid_size,
};
- pid = sys_clone3(&args, sizeof(struct clone_args));
+ pid = sys_clone3(&args, sizeof(args));
if (pid < 0) {
TH_LOG("%s - Failed to create new process", strerror(errno));
return -errno;
diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c
index db5fc9c5edcf..47a8c0fc3676 100644
--- a/tools/testing/selftests/clone3/clone3_clear_sighand.c
+++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c
@@ -47,7 +47,7 @@ static void test_clone3_clear_sighand(void)
{
int ret;
pid_t pid;
- struct clone_args args = {};
+ struct __clone_args args = {};
struct sigaction act;
/*
diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h
index 91c1a78ddb39..e81ffaaee02b 100644
--- a/tools/testing/selftests/clone3/clone3_selftests.h
+++ b/tools/testing/selftests/clone3/clone3_selftests.h
@@ -19,13 +19,11 @@
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
#endif
-#ifndef CLONE_ARGS_SIZE_VER0
-#define CLONE_ARGS_SIZE_VER0 64
-#endif
-
#ifndef __NR_clone3
#define __NR_clone3 -1
-struct clone_args {
+#endif
+
+struct __clone_args {
__aligned_u64 flags;
__aligned_u64 pidfd;
__aligned_u64 child_tid;
@@ -34,15 +32,21 @@ struct clone_args {
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
-#define CLONE_ARGS_SIZE_VER1 80
+#ifndef CLONE_ARGS_SIZE_VER0
+#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+#endif
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
-#define CLONE_ARGS_SIZE_VER2 88
+#ifndef CLONE_ARGS_SIZE_VER1
+#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#endif
__aligned_u64 cgroup;
+#ifndef CLONE_ARGS_SIZE_VER2
+#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#endif
};
-#endif /* __NR_clone3 */
-static pid_t sys_clone3(struct clone_args *args, size_t size)
+static pid_t sys_clone3(struct __clone_args *args, size_t size)
{
fflush(stdout);
fflush(stderr);
@@ -52,7 +56,7 @@ static pid_t sys_clone3(struct clone_args *args, size_t size)
static inline void test_clone3_supported(void)
{
pid_t pid;
- struct clone_args args = {};
+ struct __clone_args args = {};
if (__NR_clone3 < 0)
ksft_exit_skip("clone3() syscall is not supported\n");
diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c
index 5831c1082d6d..0229e9ebb995 100644
--- a/tools/testing/selftests/clone3/clone3_set_tid.c
+++ b/tools/testing/selftests/clone3/clone3_set_tid.c
@@ -46,14 +46,14 @@ static int call_clone3_set_tid(pid_t *set_tid,
int status;
pid_t pid = -1;
- struct clone_args args = {
+ struct __clone_args args = {
.flags = flags,
.exit_signal = SIGCHLD,
.set_tid = ptr_to_u64(set_tid),
.set_tid_size = set_tid_size,
};
- pid = sys_clone3(&args, sizeof(struct clone_args));
+ pid = sys_clone3(&args, sizeof(args));
if (pid < 0) {
ksft_print_msg("%s - Failed to create new process\n",
strerror(errno));
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
index 7dca1aa4672d..1f085b922c6e 100644
--- a/tools/testing/selftests/pidfd/pidfd_setns_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -75,7 +75,7 @@ static int sys_waitid(int which, pid_t pid, int options)
pid_t create_child(int *pidfd, unsigned flags)
{
- struct clone_args args = {
+ struct __clone_args args = {
.flags = CLONE_PIDFD | flags,
.exit_signal = SIGCHLD,
.pidfd = ptr_to_u64(pidfd),
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index 7a6d40286a42..4a180439ee9e 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -774,8 +774,15 @@ void *kill_thread(void *data)
return (void *)SIBLING_EXIT_UNKILLED;
}
+enum kill_t {
+ KILL_THREAD,
+ KILL_PROCESS,
+ RET_UNKNOWN
+};
+
/* Prepare a thread that will kill itself or both of us. */
-void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
+void kill_thread_or_group(struct __test_metadata *_metadata,
+ enum kill_t kill_how)
{
pthread_t thread;
void *status;
@@ -791,11 +798,12 @@ void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
.len = (unsigned short)ARRAY_SIZE(filter_thread),
.filter = filter_thread,
};
+ int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAAA;
struct sock_filter filter_process[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, nr)),
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
- BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_PROCESS),
+ BPF_STMT(BPF_RET|BPF_K, kill),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog_process = {
@@ -808,13 +816,15 @@ void kill_thread_or_group(struct __test_metadata *_metadata, bool kill_process)
}
ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
- kill_process ? &prog_process : &prog_thread));
+ kill_how == KILL_THREAD ? &prog_thread
+ : &prog_process));
/*
* Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
* flag cannot be downgraded by a new filter.
*/
- ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
+ if (kill_how == KILL_PROCESS)
+ ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
/* Start a thread that will exit immediately. */
ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
@@ -842,7 +852,7 @@ TEST(KILL_thread)
child_pid = fork();
ASSERT_LE(0, child_pid);
if (child_pid == 0) {
- kill_thread_or_group(_metadata, false);
+ kill_thread_or_group(_metadata, KILL_THREAD);
_exit(38);
}
@@ -861,7 +871,7 @@ TEST(KILL_process)
child_pid = fork();
ASSERT_LE(0, child_pid);
if (child_pid == 0) {
- kill_thread_or_group(_metadata, true);
+ kill_thread_or_group(_metadata, KILL_PROCESS);
_exit(38);
}
@@ -872,6 +882,27 @@ TEST(KILL_process)
ASSERT_EQ(SIGSYS, WTERMSIG(status));
}
+TEST(KILL_unknown)
+{
+ int status;
+ pid_t child_pid;
+
+ child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (child_pid == 0) {
+ kill_thread_or_group(_metadata, RET_UNKNOWN);
+ _exit(38);
+ }
+
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+
+ /* If the entire process was killed, we'll see SIGSYS. */
+ EXPECT_TRUE(WIFSIGNALED(status)) {
+ TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
+ }
+ ASSERT_EQ(SIGSYS, WTERMSIG(status));
+}
+
/* TODO(wad) add 64-bit versus 32-bit arg tests. */
TEST(arg_out_of_range)
{
@@ -1667,70 +1698,148 @@ TEST_F(TRACE_poke, getpid_runs_normally)
}
#if defined(__x86_64__)
-# define ARCH_REGS struct user_regs_struct
-# define SYSCALL_NUM orig_rax
-# define SYSCALL_RET rax
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM(_regs) (_regs).orig_rax
+# define SYSCALL_RET(_regs) (_regs).rax
#elif defined(__i386__)
-# define ARCH_REGS struct user_regs_struct
-# define SYSCALL_NUM orig_eax
-# define SYSCALL_RET eax
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM(_regs) (_regs).orig_eax
+# define SYSCALL_RET(_regs) (_regs).eax
#elif defined(__arm__)
-# define ARCH_REGS struct pt_regs
-# define SYSCALL_NUM ARM_r7
-# define SYSCALL_RET ARM_r0
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM(_regs) (_regs).ARM_r7
+# ifndef PTRACE_SET_SYSCALL
+# define PTRACE_SET_SYSCALL 23
+# endif
+# define SYSCALL_NUM_SET(_regs, _nr) \
+ EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
+# define SYSCALL_RET(_regs) (_regs).ARM_r0
#elif defined(__aarch64__)
-# define ARCH_REGS struct user_pt_regs
-# define SYSCALL_NUM regs[8]
-# define SYSCALL_RET regs[0]
+# define ARCH_REGS struct user_pt_regs
+# define SYSCALL_NUM(_regs) (_regs).regs[8]
+# ifndef NT_ARM_SYSTEM_CALL
+# define NT_ARM_SYSTEM_CALL 0x404
+# endif
+# define SYSCALL_NUM_SET(_regs, _nr) \
+ do { \
+ struct iovec __v; \
+ typeof(_nr) __nr = (_nr); \
+ __v.iov_base = &__nr; \
+ __v.iov_len = sizeof(__nr); \
+ EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee, \
+ NT_ARM_SYSTEM_CALL, &__v)); \
+ } while (0)
+# define SYSCALL_RET(_regs) (_regs).regs[0]
#elif defined(__riscv) && __riscv_xlen == 64
-# define ARCH_REGS struct user_regs_struct
-# define SYSCALL_NUM a7
-# define SYSCALL_RET a0
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM(_regs) (_regs).a7
+# define SYSCALL_RET(_regs) (_regs).a0
#elif defined(__csky__)
-# define ARCH_REGS struct pt_regs
-#if defined(__CSKYABIV2__)
-# define SYSCALL_NUM regs[3]
-#else
-# define SYSCALL_NUM regs[9]
-#endif
-# define SYSCALL_RET a0
+# define ARCH_REGS struct pt_regs
+# if defined(__CSKYABIV2__)
+# define SYSCALL_NUM(_regs) (_regs).regs[3]
+# else
+# define SYSCALL_NUM(_regs) (_regs).regs[9]
+# endif
+# define SYSCALL_RET(_regs) (_regs).a0
#elif defined(__hppa__)
-# define ARCH_REGS struct user_regs_struct
-# define SYSCALL_NUM gr[20]
-# define SYSCALL_RET gr[28]
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM(_regs) (_regs).gr[20]
+# define SYSCALL_RET(_regs) (_regs).gr[28]
#elif defined(__powerpc__)
-# define ARCH_REGS struct pt_regs
-# define SYSCALL_NUM gpr[0]
-# define SYSCALL_RET gpr[3]
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM(_regs) (_regs).gpr[0]
+# define SYSCALL_RET(_regs) (_regs).gpr[3]
+# define SYSCALL_RET_SET(_regs, _val) \
+ do { \
+ typeof(_val) _result = (_val); \
+ /* \
+ * A syscall error is signaled by CR0 SO bit \
+ * and the code is stored as a positive value. \
+ */ \
+ if (_result < 0) { \
+ SYSCALL_RET(_regs) = -result; \
+ (_regs).ccr |= 0x10000000; \
+ } else { \
+ SYSCALL_RET(_regs) = result; \
+ (_regs).ccr &= ~0x10000000; \
+ } \
+ } while (0)
+# define SYSCALL_RET_SET_ON_PTRACE_EXIT
#elif defined(__s390__)
-# define ARCH_REGS s390_regs
-# define SYSCALL_NUM gprs[2]
-# define SYSCALL_RET gprs[2]
-# define SYSCALL_NUM_RET_SHARE_REG
+# define ARCH_REGS s390_regs
+# define SYSCALL_NUM(_regs) (_regs).gprs[2]
+# define SYSCALL_RET_SET(_regs, _val) \
+ TH_LOG("Can't modify syscall return on this architecture")
#elif defined(__mips__)
-# define ARCH_REGS struct pt_regs
-# define SYSCALL_NUM regs[2]
-# define SYSCALL_SYSCALL_NUM regs[4]
-# define SYSCALL_RET regs[2]
-# define SYSCALL_NUM_RET_SHARE_REG
+# include <asm/unistd_nr_n32.h>
+# include <asm/unistd_nr_n64.h>
+# include <asm/unistd_nr_o32.h>
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM(_regs) \
+ ({ \
+ typeof((_regs).regs[2]) _nr; \
+ if ((_regs).regs[2] == __NR_O32_Linux) \
+ _nr = (_regs).regs[4]; \
+ else \
+ _nr = (_regs).regs[2]; \
+ _nr; \
+ })
+# define SYSCALL_NUM_SET(_regs, _nr) \
+ do { \
+ if ((_regs).regs[2] == __NR_O32_Linux) \
+ (_regs).regs[4] = _nr; \
+ else \
+ (_regs).regs[2] = _nr; \
+ } while (0)
+# define SYSCALL_RET_SET(_regs, _val) \
+ TH_LOG("Can't modify syscall return on this architecture")
#elif defined(__xtensa__)
-# define ARCH_REGS struct user_pt_regs
-# define SYSCALL_NUM syscall
+# define ARCH_REGS struct user_pt_regs
+# define SYSCALL_NUM(_regs) (_regs).syscall
/*
* On xtensa syscall return value is in the register
* a2 of the current window which is not fixed.
*/
-#define SYSCALL_RET(reg) a[(reg).windowbase * 4 + 2]
+#define SYSCALL_RET(_regs) (_regs).a[(_regs).windowbase * 4 + 2]
#elif defined(__sh__)
-# define ARCH_REGS struct pt_regs
-# define SYSCALL_NUM gpr[3]
-# define SYSCALL_RET gpr[0]
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM(_regs) (_regs).gpr[3]
+# define SYSCALL_RET(_regs) (_regs).gpr[0]
#else
# error "Do not know how to find your architecture's registers and syscalls"
#endif
+/*
+ * Most architectures can change the syscall by just updating the
+ * associated register. This is the default if not defined above.
+ */
+#ifndef SYSCALL_NUM_SET
+# define SYSCALL_NUM_SET(_regs, _nr) \
+ do { \
+ SYSCALL_NUM(_regs) = (_nr); \
+ } while (0)
+#endif
+/*
+ * Most architectures can change the syscall return value by just
+ * writing to the SYSCALL_RET register. This is the default if not
+ * defined above. If an architecture cannot set the return value
+ * (for example when the syscall and return value register is
+ * shared), report it with TH_LOG() in an arch-specific definition
+ * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
+ */
+#if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
+# error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
+#endif
+#ifndef SYSCALL_RET_SET
+# define SYSCALL_RET_SET(_regs, _val) \
+ do { \
+ SYSCALL_RET(_regs) = (_val); \
+ } while (0)
+#endif
+
/* When the syscall return can't be changed, stub out the tests for it. */
-#ifdef SYSCALL_NUM_RET_SHARE_REG
+#ifndef SYSCALL_RET
# define EXPECT_SYSCALL_RETURN(val, action) EXPECT_EQ(-1, action)
#else
# define EXPECT_SYSCALL_RETURN(val, action) \
@@ -1745,116 +1854,92 @@ TEST_F(TRACE_poke, getpid_runs_normally)
} while (0)
#endif
-/* Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
+/*
+ * Some architectures (e.g. powerpc) can only set syscall
+ * return values on syscall exit during ptrace.
+ */
+const bool ptrace_entry_set_syscall_nr = true;
+const bool ptrace_entry_set_syscall_ret =
+#ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT
+ true;
+#else
+ false;
+#endif
+
+/*
+ * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
* architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
*/
#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
-#define HAVE_GETREGS
+# define ARCH_GETREGS(_regs) ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
+# define ARCH_SETREGS(_regs) ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
+#else
+# define ARCH_GETREGS(_regs) ({ \
+ struct iovec __v; \
+ __v.iov_base = &(_regs); \
+ __v.iov_len = sizeof(_regs); \
+ ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v); \
+ })
+# define ARCH_SETREGS(_regs) ({ \
+ struct iovec __v; \
+ __v.iov_base = &(_regs); \
+ __v.iov_len = sizeof(_regs); \
+ ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v); \
+ })
#endif
/* Architecture-specific syscall fetching routine. */
int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
{
ARCH_REGS regs;
-#ifdef HAVE_GETREGS
- EXPECT_EQ(0, ptrace(PTRACE_GETREGS, tracee, 0, &regs)) {
- TH_LOG("PTRACE_GETREGS failed");
- return -1;
- }
-#else
- struct iovec iov;
- iov.iov_base = &regs;
- iov.iov_len = sizeof(regs);
- EXPECT_EQ(0, ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov)) {
- TH_LOG("PTRACE_GETREGSET failed");
+ EXPECT_EQ(0, ARCH_GETREGS(regs)) {
return -1;
}
-#endif
-#if defined(__mips__)
- if (regs.SYSCALL_NUM == __NR_O32_Linux)
- return regs.SYSCALL_SYSCALL_NUM;
-#endif
- return regs.SYSCALL_NUM;
+ return SYSCALL_NUM(regs);
}
/* Architecture-specific syscall changing routine. */
-void change_syscall(struct __test_metadata *_metadata,
- pid_t tracee, int syscall, int result)
+void __change_syscall(struct __test_metadata *_metadata,
+ pid_t tracee, long *syscall, long *ret)
{
- int ret;
- ARCH_REGS regs;
-#ifdef HAVE_GETREGS
- ret = ptrace(PTRACE_GETREGS, tracee, 0, &regs);
-#else
- struct iovec iov;
- iov.iov_base = &regs;
- iov.iov_len = sizeof(regs);
- ret = ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &iov);
-#endif
- EXPECT_EQ(0, ret) {}
+ ARCH_REGS orig, regs;
-#if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \
- defined(__s390__) || defined(__hppa__) || defined(__riscv) || \
- defined(__xtensa__) || defined(__csky__) || defined(__sh__)
- {
- regs.SYSCALL_NUM = syscall;
- }
-#elif defined(__mips__)
- {
- if (regs.SYSCALL_NUM == __NR_O32_Linux)
- regs.SYSCALL_SYSCALL_NUM = syscall;
- else
- regs.SYSCALL_NUM = syscall;
- }
+ /* Do not get/set registers if we have nothing to do. */
+ if (!syscall && !ret)
+ return;
-#elif defined(__arm__)
-# ifndef PTRACE_SET_SYSCALL
-# define PTRACE_SET_SYSCALL 23
-# endif
- {
- ret = ptrace(PTRACE_SET_SYSCALL, tracee, NULL, syscall);
- EXPECT_EQ(0, ret);
+ EXPECT_EQ(0, ARCH_GETREGS(regs)) {
+ return;
}
+ orig = regs;
-#elif defined(__aarch64__)
-# ifndef NT_ARM_SYSTEM_CALL
-# define NT_ARM_SYSTEM_CALL 0x404
-# endif
- {
- iov.iov_base = &syscall;
- iov.iov_len = sizeof(syscall);
- ret = ptrace(PTRACE_SETREGSET, tracee, NT_ARM_SYSTEM_CALL,
- &iov);
- EXPECT_EQ(0, ret);
- }
+ if (syscall)
+ SYSCALL_NUM_SET(regs, *syscall);
-#else
- ASSERT_EQ(1, 0) {
- TH_LOG("How is the syscall changed on this architecture?");
- }
-#endif
+ if (ret)
+ SYSCALL_RET_SET(regs, *ret);
- /* If syscall is skipped, change return value. */
- if (syscall == -1)
-#ifdef SYSCALL_NUM_RET_SHARE_REG
- TH_LOG("Can't modify syscall return on this architecture");
+ /* Flush any register changes made. */
+ if (memcmp(&orig, &regs, sizeof(orig)) != 0)
+ EXPECT_EQ(0, ARCH_SETREGS(regs));
+}
-#elif defined(__xtensa__)
- regs.SYSCALL_RET(regs) = result;
-#else
- regs.SYSCALL_RET = result;
-#endif
+/* Change only syscall number. */
+void change_syscall_nr(struct __test_metadata *_metadata,
+ pid_t tracee, long syscall)
+{
+ __change_syscall(_metadata, tracee, &syscall, NULL);
+}
-#ifdef HAVE_GETREGS
- ret = ptrace(PTRACE_SETREGS, tracee, 0, &regs);
-#else
- iov.iov_base = &regs;
- iov.iov_len = sizeof(regs);
- ret = ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &iov);
-#endif
- EXPECT_EQ(0, ret);
+/* Change syscall return value (and set syscall number to -1). */
+void change_syscall_ret(struct __test_metadata *_metadata,
+ pid_t tracee, long ret)
+{
+ long syscall = -1;
+
+ __change_syscall(_metadata, tracee, &syscall, &ret);
}
void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
@@ -1872,17 +1957,17 @@ void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
case 0x1002:
/* change getpid to getppid. */
EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
- change_syscall(_metadata, tracee, __NR_getppid, 0);
+ change_syscall_nr(_metadata, tracee, __NR_getppid);
break;
case 0x1003:
/* skip gettid with valid return code. */
EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
- change_syscall(_metadata, tracee, -1, 45000);
+ change_syscall_ret(_metadata, tracee, 45000);
break;
case 0x1004:
/* skip openat with error. */
EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
- change_syscall(_metadata, tracee, -1, -ESRCH);
+ change_syscall_ret(_metadata, tracee, -ESRCH);
break;
case 0x1005:
/* do nothing (allow getppid) */
@@ -1897,12 +1982,21 @@ void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
}
+FIXTURE(TRACE_syscall) {
+ struct sock_fprog prog;
+ pid_t tracer, mytid, mypid, parent;
+ long syscall_nr;
+};
+
void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
int status, void *args)
{
- int ret, nr;
+ int ret;
unsigned long msg;
static bool entry;
+ long syscall_nr_val, syscall_ret_val;
+ long *syscall_nr = NULL, *syscall_ret = NULL;
+ FIXTURE_DATA(TRACE_syscall) *self = args;
/*
* The traditional way to tell PTRACE_SYSCALL entry/exit
@@ -1916,24 +2010,48 @@ void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
: PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
- if (!entry)
- return;
+ /*
+ * Some architectures only support setting return values during
+ * syscall exit under ptrace, and on exit the syscall number may
+ * no longer be available. Therefore, save the initial sycall
+ * number here, so it can be examined during both entry and exit
+ * phases.
+ */
+ if (entry)
+ self->syscall_nr = get_syscall(_metadata, tracee);
- nr = get_syscall(_metadata, tracee);
+ /*
+ * Depending on the architecture's syscall setting abilities, we
+ * pick which things to set during this phase (entry or exit).
+ */
+ if (entry == ptrace_entry_set_syscall_nr)
+ syscall_nr = &syscall_nr_val;
+ if (entry == ptrace_entry_set_syscall_ret)
+ syscall_ret = &syscall_ret_val;
+
+ /* Now handle the actual rewriting cases. */
+ switch (self->syscall_nr) {
+ case __NR_getpid:
+ syscall_nr_val = __NR_getppid;
+ /* Never change syscall return for this case. */
+ syscall_ret = NULL;
+ break;
+ case __NR_gettid:
+ syscall_nr_val = -1;
+ syscall_ret_val = 45000;
+ break;
+ case __NR_openat:
+ syscall_nr_val = -1;
+ syscall_ret_val = -ESRCH;
+ break;
+ default:
+ /* Unhandled, do nothing. */
+ return;
+ }
- if (nr == __NR_getpid)
- change_syscall(_metadata, tracee, __NR_getppid, 0);
- if (nr == __NR_gettid)
- change_syscall(_metadata, tracee, -1, 45000);
- if (nr == __NR_openat)
- change_syscall(_metadata, tracee, -1, -ESRCH);
+ __change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
}
-FIXTURE(TRACE_syscall) {
- struct sock_fprog prog;
- pid_t tracer, mytid, mypid, parent;
-};
-
FIXTURE_VARIANT(TRACE_syscall) {
/*
* All of the SECCOMP_RET_TRACE behaviors can be tested with either
@@ -1992,7 +2110,7 @@ FIXTURE_SETUP(TRACE_syscall)
self->tracer = setup_trace_fixture(_metadata,
variant->use_ptrace ? tracer_ptrace
: tracer_seccomp,
- NULL, variant->use_ptrace);
+ self, variant->use_ptrace);
ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
ASSERT_EQ(0, ret);
@@ -3142,11 +3260,11 @@ skip:
static int user_notif_syscall(int nr, unsigned int flags)
{
struct sock_filter filter[] = {
- BPF_STMT(BPF_LD+BPF_W+BPF_ABS,
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, nr)),
- BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, nr, 0, 1),
- BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_USER_NOTIF),
- BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
@@ -3699,7 +3817,7 @@ TEST(user_notification_filter_empty)
long ret;
int status;
struct pollfd pollfd;
- struct clone_args args = {
+ struct __clone_args args = {
.flags = CLONE_FILES,
.exit_signal = SIGCHLD,
};
@@ -3715,7 +3833,7 @@ TEST(user_notification_filter_empty)
if (pid == 0) {
int listener;
- listener = user_notif_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
if (listener < 0)
_exit(EXIT_FAILURE);
@@ -3753,7 +3871,7 @@ TEST(user_notification_filter_empty_threaded)
long ret;
int status;
struct pollfd pollfd;
- struct clone_args args = {
+ struct __clone_args args = {
.flags = CLONE_FILES,
.exit_signal = SIGCHLD,
};