diff options
Diffstat (limited to 'init')
-rw-r--r-- | init/.gitignore | 2 | ||||
-rw-r--r-- | init/Kconfig | 886 | ||||
-rw-r--r-- | init/Makefile | 59 | ||||
-rwxr-xr-x | init/build-version | 10 | ||||
-rw-r--r-- | init/do_mounts.c | 452 | ||||
-rw-r--r-- | init/do_mounts.h | 28 | ||||
-rw-r--r-- | init/do_mounts_initrd.c | 79 | ||||
-rw-r--r-- | init/do_mounts_md.c | 304 | ||||
-rw-r--r-- | init/do_mounts_rd.c | 101 | ||||
-rw-r--r-- | init/init_task.c | 39 | ||||
-rw-r--r-- | init/initramfs.c | 322 | ||||
-rw-r--r-- | init/main.c | 466 | ||||
-rw-r--r-- | init/noinitramfs.c | 10 | ||||
-rw-r--r-- | init/version-timestamp.c | 31 | ||||
-rw-r--r-- | init/version.c | 60 |
15 files changed, 1294 insertions, 1555 deletions
diff --git a/init/.gitignore b/init/.gitignore new file mode 100644 index 000000000000..cbbe270cec00 --- /dev/null +++ b/init/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/utsversion-tmp.h diff --git a/init/Kconfig b/init/Kconfig index 4f717bfdbfe2..abf65098f1b6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1,34 +1,93 @@ # SPDX-License-Identifier: GPL-2.0-only -config DEFCONFIG_LIST +config CC_VERSION_TEXT string - depends on !UML - option defconfig_list - default "/lib/modules/$(shell,uname -r)/.config" - default "/etc/kernel-config" - default "/boot/config-$(shell,uname -r)" - default ARCH_DEFCONFIG - default "arch/$(ARCH)/defconfig" + default "$(CC_VERSION_TEXT)" + help + This is used in unclear ways: + + - Re-run Kconfig when the compiler is updated + The 'default' property references the environment variable, + CC_VERSION_TEXT so it is recorded in include/config/auto.conf.cmd. + When the compiler is updated, Kconfig will be invoked. + + - Ensure full rebuild when the compiler is updated + include/linux/compiler-version.h contains this option in the comment + line so fixdep adds include/config/CC_VERSION_TEXT into the + auto-generated dependency. When the compiler is updated, syncconfig + will touch it and then every file will be rebuilt. config CC_IS_GCC - def_bool $(success,$(CC) --version | head -n 1 | grep -q gcc) + def_bool $(success,test "$(cc-name)" = GCC) config GCC_VERSION int - default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC + default $(cc-version) if CC_IS_GCC default 0 config CC_IS_CLANG - def_bool $(success,$(CC) --version | head -n 1 | grep -q clang) + def_bool $(success,test "$(cc-name)" = Clang) config CLANG_VERSION int - default $(shell,$(srctree)/scripts/clang-version.sh $(CC)) + default $(cc-version) if CC_IS_CLANG + default 0 + +config AS_IS_GNU + def_bool $(success,test "$(as-name)" = GNU) + +config AS_IS_LLVM + def_bool $(success,test "$(as-name)" = LLVM) + +config AS_VERSION + int + # Use clang version if this is the integrated assembler + default CLANG_VERSION if AS_IS_LLVM + default $(as-version) + +config LD_IS_BFD + def_bool $(success,test "$(ld-name)" = BFD) + +config LD_VERSION + int + default $(ld-version) if LD_IS_BFD + default 0 + +config LD_IS_LLD + def_bool $(success,test "$(ld-name)" = LLD) + +config LLD_VERSION + int + default $(ld-version) if LD_IS_LLD + default 0 + +config RUST_IS_AVAILABLE + def_bool $(success,$(srctree)/scripts/rust_is_available.sh) + help + This shows whether a suitable Rust toolchain is available (found). + + Please see Documentation/rust/quick-start.rst for instructions on how + to satisfy the build requirements of Rust support. + + In particular, the Makefile target 'rustavailable' is useful to check + why the Rust toolchain is not being detected. config CC_CAN_LINK - def_bool $(success,$(srctree)/scripts/cc-can-link.sh $(CC)) + bool + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag)) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag)) + +config CC_CAN_LINK_STATIC + bool + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag) -static) if 64BIT + default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m32-flag) -static) -config CC_HAS_ASM_GOTO - def_bool $(success,$(srctree)/scripts/gcc-goto.sh $(CC)) +config CC_HAS_ASM_GOTO_OUTPUT + def_bool $(success,echo 'int foo(int x) { asm goto ("": "=r"(x) ::: bar); return x; bar: return 0; }' | $(CC) -x c - -c -o /dev/null) + +config CC_HAS_ASM_GOTO_TIED_OUTPUT + depends on CC_HAS_ASM_GOTO_OUTPUT + # Detect buggy gcc and clang, fixed in gcc-11 clang-14. + def_bool $(success,echo 'int foo(int *x) { asm goto (".long (%l[bar]) - .\n": "+m"(*x) ::: bar); return *x; bar: return 0; }' | $CC -x c - -c -o /dev/null) config TOOLS_SUPPORT_RELR def_bool $(success,env "CC=$(CC)" "LD=$(LD)" "NM=$(NM)" "OBJCOPY=$(OBJCOPY)" $(srctree)/scripts/tools-support-relr.sh) @@ -36,25 +95,15 @@ config TOOLS_SUPPORT_RELR config CC_HAS_ASM_INLINE def_bool $(success,echo 'void foo(void) { asm inline (""); }' | $(CC) -x c - -c -o /dev/null) -config CC_HAS_WARN_MAYBE_UNINITIALIZED - def_bool $(cc-option,-Wmaybe-uninitialized) - help - GCC >= 4.7 supports this option. +config CC_HAS_NO_PROFILE_FN_ATTR + def_bool $(success,echo '__attribute__((no_profile_instrument_function)) int x();' | $(CC) -x c - -c -o /dev/null -Werror) -config CC_DISABLE_WARN_MAYBE_UNINITIALIZED - bool - depends on CC_HAS_WARN_MAYBE_UNINITIALIZED - default CC_IS_GCC && GCC_VERSION < 40900 # unreliable for GCC < 4.9 - help - GCC's -Wmaybe-uninitialized is not reliable by definition. - Lots of false positive warnings are produced in some cases. - - If this option is enabled, -Wno-maybe-uninitialzed is passed - to the compiler to suppress maybe-uninitialized warnings. +config PAHOLE_VERSION + int + default $(shell,$(srctree)/scripts/pahole-version.sh $(PAHOLE)) config CONSTRUCTORS bool - depends on !UML config IRQ_WORK bool @@ -92,8 +141,7 @@ config INIT_ENV_ARG_LIMIT config COMPILE_TEST bool "Compile also drivers which will not load" - depends on !UML - default n + depends on HAS_IOMEM help Some drivers can be compiled on a different platform than they are intended to be run on. Despite they cannot be loaded there (or even @@ -105,6 +153,21 @@ config COMPILE_TEST here. If you are a user/distributor, say N here to exclude useless drivers to be distributed. +config WERROR + bool "Compile the kernel with warnings as errors" + default COMPILE_TEST + help + A kernel build should not cause any compiler warnings, and this + enables the '-Werror' (for C) and '-Dwarnings' (for Rust) flags + to enforce that rule by default. + + However, if you have a new (or very old) compiler with odd and + unusual warnings, or you have some architecture with problems, + you may need to disable this config option in order to + successfully build the kernel. + + If in doubt, say Y. + config UAPI_HEADER_TEST bool "Compile test UAPI headers" depends on HEADERS_INSTALL && CC_CAN_LINK @@ -173,13 +236,16 @@ config HAVE_KERNEL_LZO config HAVE_KERNEL_LZ4 bool +config HAVE_KERNEL_ZSTD + bool + config HAVE_KERNEL_UNCOMPRESSED bool choice prompt "Kernel compression mode" default KERNEL_GZIP - depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 || HAVE_KERNEL_UNCOMPRESSED + depends on HAVE_KERNEL_GZIP || HAVE_KERNEL_BZIP2 || HAVE_KERNEL_LZMA || HAVE_KERNEL_XZ || HAVE_KERNEL_LZO || HAVE_KERNEL_LZ4 || HAVE_KERNEL_ZSTD || HAVE_KERNEL_UNCOMPRESSED help The linux kernel is a kind of self-extracting executable. Several compression algorithms are available, which differ @@ -258,6 +324,16 @@ config KERNEL_LZ4 is about 8% bigger than LZO. But the decompression speed is faster than LZO. +config KERNEL_ZSTD + bool "ZSTD" + depends on HAVE_KERNEL_ZSTD + help + ZSTD is a compression algorithm targeting intermediate compression + with fast decompression speed. It will compress better than GZIP and + decompress around the same speed as LZO, but slower than LZ4. You + will need at least 192 KB RAM or more for booting. The zstd command + line tool is required for compression. + config KERNEL_UNCOMPRESSED bool "None" depends on HAVE_KERNEL_UNCOMPRESSED @@ -270,6 +346,16 @@ config KERNEL_UNCOMPRESSED endchoice +config DEFAULT_INIT + string "Default init path" + default "" + help + This option determines the default init for the system if no init= + option is passed on the kernel command line. If the requested path is + not present, we will still then move on to attempting further + locations (e.g. /sbin/init, etc). If this is empty, we will just use + the fallback list when init= is not passed. + config DEFAULT_HOSTNAME string "Default hostname" default "(none)" @@ -279,26 +365,9 @@ config DEFAULT_HOSTNAME but you may wish to use a different default here to make a minimal system more usable with less configuration. -# -# For some reason microblaze and nios2 hard code SWAP=n. Hopefully we can -# add proper SWAP support to them, in which case this can be remove. -# -config ARCH_NO_SWAP - bool - -config SWAP - bool "Support for paging of anonymous memory (swap)" - depends on MMU && BLOCK && !ARCH_NO_SWAP - default y - help - This option allows you to choose whether you want to have support - for so called swap devices or swap files in your kernel that are - used to provide more virtual memory than the actual RAM present - in your computer. If unsure say Y. - config SYSVIPC bool "System V IPC" - ---help--- + help Inter Process Communication is a suite of library functions and system calls which let processes (running programs) synchronize and exchange information. It is generally considered to be a good thing, @@ -317,10 +386,14 @@ config SYSVIPC_SYSCTL depends on SYSCTL default y +config SYSVIPC_COMPAT + def_bool y + depends on COMPAT && SYSVIPC + config POSIX_MQUEUE bool "POSIX Message Queues" depends on NET - ---help--- + help POSIX variant of message queues is a part of IPC. In POSIX message queues every message has a priority which decides about succession of receiving it by a process. If you want to compile and run @@ -339,6 +412,18 @@ config POSIX_MQUEUE_SYSCTL depends on SYSCTL default y +config WATCH_QUEUE + bool "General notification queue" + default n + help + + This is a general notification queue for the kernel to pass events to + userspace by splicing them into pipes. It can be used in conjunction + with watches for key/keyring change notifications and device + notifications. + + See Documentation/core-api/watch_queue.rst + config CROSS_MEMORY_ATTACH bool "Enable process_vm_readv/writev syscalls" depends on MMU @@ -350,8 +435,8 @@ config CROSS_MEMORY_ATTACH See the man page for more details. config USELIB - bool "uselib syscall" - def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION + bool "uselib syscall (for libc5 and earlier)" + default ALPHA || M68K || SPARC help This option enables the uselib syscall, a system call used in the dynamic linker from libc5 and earlier. glibc does not use this @@ -378,6 +463,7 @@ config AUDITSYSCALL source "kernel/irq/Kconfig" source "kernel/time/Kconfig" +source "kernel/bpf/Kconfig" source "kernel/Kconfig.preempt" menu "CPU/Task time and stats accounting" @@ -387,8 +473,7 @@ config VIRT_CPU_ACCOUNTING choice prompt "Cputime accounting" - default TICK_CPU_ACCOUNTING if !PPC64 - default VIRT_CPU_ACCOUNTING_NATIVE if PPC64 + default TICK_CPU_ACCOUNTING # Kind of a stub config for the pure tick based cputime accounting config TICK_CPU_ACCOUNTING @@ -416,11 +501,11 @@ config VIRT_CPU_ACCOUNTING_NATIVE config VIRT_CPU_ACCOUNTING_GEN bool "Full dynticks CPU time accounting" - depends on HAVE_CONTEXT_TRACKING + depends on HAVE_CONTEXT_TRACKING_USER depends on HAVE_VIRT_CPU_ACCOUNTING_GEN depends on GENERIC_CLOCKEVENTS select VIRT_CPU_ACCOUNTING - select CONTEXT_TRACKING + select CONTEXT_TRACKING_USER help Select this option to enable task and CPU time accounting on full dynticks systems. This accounting is implemented by watching every @@ -451,6 +536,25 @@ config HAVE_SCHED_AVG_IRQ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING depends on SMP +config SCHED_THERMAL_PRESSURE + bool + default y if ARM && ARM_CPU_TOPOLOGY + default y if ARM64 + depends on SMP + depends on CPU_FREQ_THERMAL + help + Select this option to enable thermal pressure accounting in the + scheduler. Thermal pressure is the value conveyed to the scheduler + that reflects the reduction in CPU compute capacity resulted from + thermal throttling. Thermal throttling occurs when the performance of + a CPU is capped due to high operating temperatures. + + If selected, the scheduler will be able to balance tasks accordingly, + i.e. put less load on throttled CPUs than on non/less throttled ones. + + This requires the architecture to implement + arch_update_thermal_pressure() and arch_scale_thermal_pressure(). + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER @@ -582,7 +686,7 @@ config BUILD_BIN2C config IKCONFIG tristate "Kernel .config support" - ---help--- + help This option enables the complete Linux kernel ".config" file contents to be saved in the kernel. It provides documentation of which kernel options are used in a running kernel or in an @@ -595,7 +699,7 @@ config IKCONFIG config IKCONFIG_PROC bool "Enable access to .config through /proc/config.gz" depends on IKCONFIG && PROC_FS - ---help--- + help This option enables access to the kernel configuration file through /proc/config.gz. @@ -646,7 +750,7 @@ config LOG_CPU_MAX_BUF_SHIFT with more CPUs. Therefore this value is used only when the sum of contributions is greater than the half of the default kernel ring buffer as defined by LOG_BUF_SHIFT. The default values are set - so that more than 64 CPUs are needed to trigger the allocation. + so that more than 16 CPUs are needed to trigger the allocation. Also this option is ignored when "log_buf_len" kernel parameter is used as it forces an exact (power of two) size of the ring buffer. @@ -687,6 +791,20 @@ config PRINTK_SAFE_LOG_BUF_SHIFT 13 => 8 KB for each CPU 12 => 4 KB for each CPU +config PRINTK_INDEX + bool "Printk indexing debugfs interface" + depends on PRINTK && DEBUG_FS + help + Add support for indexing of all printk formats known at compile time + at <debugfs>/printk/index/<module>. + + This can be used as part of maintaining daemons which monitor + /dev/kmsg, as it permits auditing the printk formats present in a + kernel, allowing detection of cases where monitored printks are + changed or no longer present. + + There is no additional runtime cost to printk with this enabled. + # # Architectures with an unreliable sched_clock() should select this: # @@ -769,6 +887,20 @@ config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH config CC_HAS_INT128 def_bool !$(cc-option,$(m64-flag) -D__SIZEOF_INT128__=0) && 64BIT +config CC_IMPLICIT_FALLTHROUGH + string + default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5) + default "-Wimplicit-fallthrough" if CC_IS_CLANG && $(cc-option,-Wunreachable-code-fallthrough) + +# Currently, disable gcc-12 array-bounds globally. +# We may want to target only particular configurations some day. +config GCC12_NO_ARRAY_BOUNDS + def_bool y + +config CC_NO_ARRAY_BOUNDS + bool + default y if CC_IS_GCC && GCC_VERSION >= 120000 && GCC_VERSION < 130000 && GCC12_NO_ARRAY_BOUNDS + # # For architectures that know their GCC __int128 support is sound # @@ -785,7 +917,7 @@ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION + depends on SMP && NUMA && MIGRATION && !PREEMPT_RT help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when @@ -820,6 +952,16 @@ if CGROUPS config PAGE_COUNTER bool +config CGROUP_FAVOR_DYNMODS + bool "Favor dynamic modification latency reduction by default" + help + This option enables the "favordynmods" mount option by default + which reduces the latencies of dynamic cgroup modifications such + as task migrations and controller on/offs at the cost of making + hot path operations such as forks and exits more expensive. + + Say N if unsure. + config MEMCG bool "Memory controller" select PAGE_COUNTER @@ -827,26 +969,6 @@ config MEMCG help Provides control over the memory footprint of tasks in a cgroup. -config MEMCG_SWAP - bool "Swap controller" - depends on MEMCG && SWAP - help - Provides control over the swap space consumed by tasks in a cgroup. - -config MEMCG_SWAP_ENABLED - bool "Swap controller enabled by default" - depends on MEMCG_SWAP - default y - help - Memory Resource Controller Swap Extension comes with its price in - a bigger memory consumption. General purpose distribution kernels - which want to enable the feature but keep it disabled by default - and let the user enable it by swapaccount=1 boot command line - parameter should have this option unselected. - For those who want to have the feature enabled by default should - select this option (if, for some reason, they need to disable it - then swapaccount=0 does the trick). - config MEMCG_KMEM bool depends on MEMCG && !SLOB @@ -856,7 +978,7 @@ config BLK_CGROUP bool "IO controller" depends on BLOCK default n - ---help--- + help Generic block IO controller cgroup interface. This is the common cgroup interface which should be used by various IO controlling policies. @@ -869,7 +991,7 @@ config BLK_CGROUP This option only enables generic Block IO controller infrastructure. One needs to also enable actual IO controlling logic/policy. For enabling proportional weight division of disk bandwidth in CFQ, set - CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set + CONFIG_BFQ_GROUP_IOSCHED=y; for enabling throttling policy, set CONFIG_BLK_DEV_THROTTLING=y. See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. @@ -1026,7 +1148,8 @@ config CGROUP_PERF help This option extends the perf per-cpu mode to restrict monitoring to threads which belong to the cgroup specified and run on the - designated cpu. + designated cpu. Or this can be used to have cgroup ID in samples + so that it can monitor performance events among cgroups. Say N if unsure. @@ -1043,6 +1166,20 @@ config CGROUP_BPF BPF_CGROUP_INET_INGRESS will be executed on the ingress path of inet sockets. +config CGROUP_MISC + bool "Misc resource controller" + default n + help + Provides a controller for miscellaneous resources on a host. + + Miscellaneous scalar resources are the resources on the host system + which cannot be abstracted like the other cgroups. This controller + tracks and limits the miscellaneous resources used by a process + attached to a cgroup hierarchy. + + For more information, please check misc cgroup section in + /Documentation/admin-guide/cgroup-v2.rst. + config CGROUP_DEBUG bool "Debug controller" default n @@ -1130,7 +1267,9 @@ endif # NAMESPACES config CHECKPOINT_RESTORE bool "Checkpoint/restore support" + depends on PROC_FS select PROC_CHILDREN + select KCMP default n help Enables additional kernel features in a sake of checkpoint/restore. @@ -1225,7 +1364,7 @@ endif config BOOT_CONFIG bool "Boot config support" - select BLK_DEV_INITRD + select BLK_DEV_INITRD if !BOOT_CONFIG_EMBED help Extra boot config allows system admin to pass a config file as complemental extension of kernel cmdline when booting. @@ -1235,6 +1374,35 @@ config BOOT_CONFIG If unsure, say Y. +config BOOT_CONFIG_EMBED + bool "Embed bootconfig file in the kernel" + depends on BOOT_CONFIG + help + Embed a bootconfig file given by BOOT_CONFIG_EMBED_FILE in the + kernel. Usually, the bootconfig file is loaded with the initrd + image. But if the system doesn't support initrd, this option will + help you by embedding a bootconfig file while building the kernel. + + If unsure, say N. + +config BOOT_CONFIG_EMBED_FILE + string "Embedded bootconfig file path" + depends on BOOT_CONFIG_EMBED + help + Specify a bootconfig file which will be embedded to the kernel. + This bootconfig will be used if there is no initrd or no other + bootconfig in the initrd. + +config INITRAMFS_PRESERVE_MTIME + bool "Preserve cpio archive mtimes in initramfs" + default y + help + Each entry in an initramfs cpio archive carries an mtime value. When + enabled, extracted cpio items take this mtime, with directory mtime + setting deferred until after creation of any child entries. + + If unsure, say Y. + choice prompt "Compiler optimization level" default CC_OPTIMIZE_FOR_PERFORMANCE @@ -1246,17 +1414,8 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. -config CC_OPTIMIZE_FOR_PERFORMANCE_O3 - bool "Optimize more for performance (-O3)" - depends on ARC - imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives - help - Choosing this option will pass "-O3" to your compiler to optimize - the kernel yet more for performance. - config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (-Os)" - imply CC_DISABLE_WARN_MAYBE_UNINITIALIZED # avoid false positives help Choosing this option will pass "-Os" to your compiler resulting in a smaller kernel. @@ -1277,7 +1436,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION bool "Dead code and data elimination (EXPERIMENTAL)" depends on HAVE_LD_DEAD_CODE_DATA_ELIMINATION depends on EXPERT - depends on !(FUNCTION_TRACER && CC_IS_GCC && GCC_VERSION < 40800) depends on $(cc-option,-ffunction-sections -fdata-sections) depends on $(ld-option,--gc-sections) help @@ -1292,6 +1450,11 @@ config LD_DEAD_CODE_DATA_ELIMINATION present. This option is not well tested yet, so use at your own risk. +config LD_ORPHAN_WARN + def_bool y + depends on ARCH_WANT_LD_ORPHAN_WARN + depends on $(ld-option,--orphan-handling=warn) + config SYSCTL bool @@ -1324,6 +1487,7 @@ config HAVE_PCSPKR_PLATFORM # interpreter that classic socket filters depend on config BPF bool + select CRYPTO_LIB_SHA1 menuconfig EXPERT bool "Configure standard kernel features (expert users)" @@ -1359,7 +1523,7 @@ config MULTIUSER config SGETMASK_SYSCALL bool "sgetmask/ssetmask syscalls support" if EXPERT def_bool PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH - ---help--- + help sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc but still enabled by default in some architectures. @@ -1369,7 +1533,7 @@ config SGETMASK_SYSCALL config SYSFS_SYSCALL bool "Sysfs syscall support" if EXPERT default y - ---help--- + help sys_sysfs is an obsolete system call no longer supported in libc. Note that disabling this option is more secure but might break compatibility with some systems. @@ -1417,11 +1581,6 @@ config PRINTK very difficult to diagnose system problems, saying N here is strongly discouraged. -config PRINTK_NMI - def_bool y - depends on PRINTK - depends on HAVE_NMI - config BUG bool "BUG() support" if EXPERT default y @@ -1459,6 +1618,7 @@ config BASE_FULL config FUTEX bool "Enable futex support" if EXPERT + depends on !(SPARC32 && SMP) default y imply RT_MUTEXES help @@ -1471,14 +1631,6 @@ config FUTEX_PI depends on FUTEX && RT_MUTEXES default y -config HAVE_FUTEX_CMPXCHG - bool - depends on FUTEX - help - Architectures should select this if futex_atomic_cmpxchg_inatomic() - is implemented and always working. This removes a couple of runtime - checks. - config EPOLL bool "Enable eventpoll support" if EXPERT default y @@ -1534,7 +1686,6 @@ config AIO config IO_URING bool "Enable IO uring support" if EXPERT - select ANON_INODES select IO_WQ default y help @@ -1578,16 +1729,17 @@ config KALLSYMS_ALL help Normally kallsyms only contains the symbols of functions for nicer OOPS messages and backtraces (i.e., symbols from the text and inittext - sections). This is sufficient for most cases. And only in very rare - cases (e.g., when a debugger is used) all symbols are required (e.g., - names of variables from the data sections, etc). + sections). This is sufficient for most cases. And only if you want to + enable kernel live patching, or other less common use cases (e.g., + when a debugger is used) all symbols are required (i.e., names of + variables from the data sections, etc). This option makes sure that all symbols are loaded into the kernel image (i.e., symbols from all sections) in cost of increased kernel size (depending on the kernel configuration, it may be 300KiB or something like this). - Say N unless you really need all symbols. + Say N unless you really need all symbols, or kernel live patching. config KALLSYMS_ABSOLUTE_PERCPU bool @@ -1615,35 +1767,6 @@ config KALLSYMS_BASE_RELATIVE # end of the "standard kernel features (expert users)" menu # syscall, maps, verifier -config BPF_SYSCALL - bool "Enable bpf() system call" - select BPF - select IRQ_WORK - default n - help - Enable the bpf() system call that allows to manipulate eBPF - programs and maps via file descriptors. - -config ARCH_WANT_DEFAULT_BPF_JIT - bool - -config BPF_JIT_ALWAYS_ON - bool "Permanently enable BPF JIT and remove BPF interpreter" - depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT - help - Enables BPF JIT and removes BPF interpreter to avoid - speculative execution of BPF instructions by the interpreter - -config BPF_JIT_DEFAULT_ON - def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON - depends on HAVE_EBPF_JIT && BPF_JIT - -config USERFAULTFD - bool "Enable userfaultfd() system call" - depends on MMU - help - Enable the userfaultfd() system call that allows to intercept and - handle page faults in userland. config ARCH_HAS_MEMBARRIER_CALLBACKS bool @@ -1651,6 +1774,16 @@ config ARCH_HAS_MEMBARRIER_CALLBACKS config ARCH_HAS_MEMBARRIER_SYNC_CORE bool +config KCMP + bool "Enable kcmp() system call" if EXPERT + help + Enable the kernel resource comparison system call. It provides + user-space with the ability to compare two processes to see if they + share a common resource, such as a file descriptor or even virtual + memory space. + + If unsure, say N. + config RSEQ bool "Enable rseq() system call" if EXPERT default y @@ -1676,7 +1809,6 @@ config DEBUG_RSEQ config EMBEDDED bool "Embedded system" - option allnoconfig_y select EXPERT help This option should be enabled if compiling the kernel for @@ -1688,6 +1820,10 @@ config HAVE_PERF_EVENTS help See tools/perf/design.txt for details. +config GUEST_PERF_EVENTS + bool + depends on HAVE_PERF_EVENTS + config PERF_USE_VMALLOC bool help @@ -1746,175 +1882,6 @@ config DEBUG_PERF_USE_VMALLOC endmenu -config VM_EVENT_COUNTERS - default y - bool "Enable VM event counters for /proc/vmstat" if EXPERT - help - VM event counters are needed for event counts to be shown. - This option allows the disabling of the VM event counters - on EXPERT systems. /proc/vmstat will only show page counts - if VM event counters are disabled. - -config SLUB_DEBUG - default y - bool "Enable SLUB debugging support" if EXPERT - depends on SLUB && SYSFS - help - SLUB has extensive debug support features. Disabling these can - result in significant savings in code size. This also disables - SLUB sysfs support. /sys/slab will not exist and there will be - no support for cache validation etc. - -config SLUB_MEMCG_SYSFS_ON - default n - bool "Enable memcg SLUB sysfs support by default" if EXPERT - depends on SLUB && SYSFS && MEMCG - help - SLUB creates a directory under /sys/kernel/slab for each - allocation cache to host info and debug files. If memory - cgroup is enabled, each cache can have per memory cgroup - caches. SLUB can create the same sysfs directories for these - caches under /sys/kernel/slab/CACHE/cgroup but it can lead - to a very high number of debug files being created. This is - controlled by slub_memcg_sysfs boot parameter and this - config option determines the parameter's default value. - -config COMPAT_BRK - bool "Disable heap randomization" - default y - help - Randomizing heap placement makes heap exploits harder, but it - also breaks ancient binaries (including anything libc5 based). - This option changes the bootup default to heap randomization - disabled, and can be overridden at runtime by setting - /proc/sys/kernel/randomize_va_space to 2. - - On non-ancient distros (post-2000 ones) N is usually a safe choice. - -choice - prompt "Choose SLAB allocator" - default SLUB - help - This option allows to select a slab allocator. - -config SLAB - bool "SLAB" - select HAVE_HARDENED_USERCOPY_ALLOCATOR - help - The regular slab allocator that is established and known to work - well in all environments. It organizes cache hot objects in - per cpu and per node queues. - -config SLUB - bool "SLUB (Unqueued Allocator)" - select HAVE_HARDENED_USERCOPY_ALLOCATOR - help - SLUB is a slab allocator that minimizes cache line usage - instead of managing queues of cached objects (SLAB approach). - Per cpu caching is realized using slabs of objects instead - of queues of objects. SLUB can use memory efficiently - and has enhanced diagnostics. SLUB is the default choice for - a slab allocator. - -config SLOB - depends on EXPERT - bool "SLOB (Simple Allocator)" - help - SLOB replaces the stock allocator with a drastically simpler - allocator. SLOB is generally more space efficient but - does not perform as well on large systems. - -endchoice - -config SLAB_MERGE_DEFAULT - bool "Allow slab caches to be merged" - default y - help - For reduced kernel memory fragmentation, slab caches can be - merged when they share the same size and other characteristics. - This carries a risk of kernel heap overflows being able to - overwrite objects from merged caches (and more easily control - cache layout), which makes such heap attacks easier to exploit - by attackers. By keeping caches unmerged, these kinds of exploits - can usually only damage objects in the same cache. To disable - merging at runtime, "slab_nomerge" can be passed on the kernel - command line. - -config SLAB_FREELIST_RANDOM - default n - depends on SLAB || SLUB - bool "SLAB freelist randomization" - help - Randomizes the freelist order used on creating new pages. This - security feature reduces the predictability of the kernel slab - allocator against heap overflows. - -config SLAB_FREELIST_HARDENED - bool "Harden slab freelist metadata" - depends on SLUB - help - Many kernel heap attacks try to target slab cache metadata and - other infrastructure. This options makes minor performance - sacrifices to harden the kernel slab allocator against common - freelist exploit methods. - -config SHUFFLE_PAGE_ALLOCATOR - bool "Page allocator randomization" - default SLAB_FREELIST_RANDOM && ACPI_NUMA - help - Randomization of the page allocator improves the average - utilization of a direct-mapped memory-side-cache. See section - 5.2.27 Heterogeneous Memory Attribute Table (HMAT) in the ACPI - 6.2a specification for an example of how a platform advertises - the presence of a memory-side-cache. There are also incidental - security benefits as it reduces the predictability of page - allocations to compliment SLAB_FREELIST_RANDOM, but the - default granularity of shuffling on the "MAX_ORDER - 1" i.e, - 10th order of pages is selected based on cache utilization - benefits on x86. - - While the randomization improves cache utilization it may - negatively impact workloads on platforms without a cache. For - this reason, by default, the randomization is enabled only - after runtime detection of a direct-mapped memory-side-cache. - Otherwise, the randomization may be force enabled with the - 'page_alloc.shuffle' kernel command line parameter. - - Say Y if unsure. - -config SLUB_CPU_PARTIAL - default y - depends on SLUB && SMP - bool "SLUB per cpu partial cache" - help - Per cpu partial caches accelerate objects allocation and freeing - that is local to a processor at the price of more indeterminism - in the latency of the free. On overflow these caches will be cleared - which requires the taking of locks that may cause latency spikes. - Typically one would choose no for a realtime system. - -config MMAP_ALLOW_UNINITIALIZED - bool "Allow mmapped anonymous memory to be uninitialized" - depends on EXPERT && !MMU - default n - help - Normally, and according to the Linux spec, anonymous memory obtained - from mmap() has its contents cleared before it is passed to - userspace. Enabling this config option allows you to request that - mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus - providing a huge performance boost. If this option is not enabled, - then the flag will be ignored. - - This is taken advantage of by uClibc's malloc(), and also by - ELF-FDPIC binfmt's brk and stack allocator. - - Because of the obvious security issues, this option should only be - enabled on embedded devices where you control what is run in - userspace. Since that isn't generally a problem on no-MMU systems, - it is normally safe to say Y here. - - See Documentation/nommu-mmap.txt for more information. - config SYSTEM_DATA_VERIFICATION def_bool n select SYSTEM_TRUSTED_KEYRING @@ -1937,7 +1904,39 @@ config PROFILING bool "Profiling support" help Say Y here to enable the extended profiling support mechanisms used - by profilers such as OProfile. + by profilers. + +config RUST + bool "Rust support" + depends on HAVE_RUST + depends on RUST_IS_AVAILABLE + depends on !MODVERSIONS + depends on !GCC_PLUGINS + depends on !RANDSTRUCT + depends on !DEBUG_INFO_BTF + select CONSTRUCTORS + help + Enables Rust support in the kernel. + + This allows other Rust-related options, like drivers written in Rust, + to be selected. + + It is also required to be able to load external kernel modules + written in Rust. + + See Documentation/rust/ for more information. + + If unsure, say N. + +config RUSTC_VERSION_TEXT + string + depends on RUST + default $(shell,command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version || echo n) + +config BINDGEN_VERSION_TEXT + string + depends on RUST + default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version || echo n) # # Place an empty function call at each tracepoint site. Can be @@ -1952,6 +1951,7 @@ source "arch/Kconfig" config RT_MUTEXES bool + default y if PREEMPT_RT config BASE_SMALL int @@ -1962,256 +1962,7 @@ config MODULE_SIG_FORMAT def_bool n select SYSTEM_DATA_VERIFICATION -menuconfig MODULES - bool "Enable loadable module support" - option modules - help - Kernel modules are small pieces of compiled code which can - be inserted in the running kernel, rather than being - permanently built into the kernel. You use the "modprobe" - tool to add (and sometimes remove) them. If you say Y here, - many parts of the kernel can be built as modules (by - answering M instead of Y where indicated): this is most - useful for infrequently used options which are not required - for booting. For more information, see the man pages for - modprobe, lsmod, modinfo, insmod and rmmod. - - If you say Y here, you will need to run "make - modules_install" to put the modules under /lib/modules/ - where modprobe can find them (you may need to be root to do - this). - - If unsure, say Y. - -if MODULES - -config MODULE_FORCE_LOAD - bool "Forced module loading" - default n - help - Allow loading of modules without version information (ie. modprobe - --force). Forced module loading sets the 'F' (forced) taint flag and - is usually a really bad idea. - -config MODULE_UNLOAD - bool "Module unloading" - help - Without this option you will not be able to unload any - modules (note that some modules may not be unloadable - anyway), which makes your kernel smaller, faster - and simpler. If unsure, say Y. - -config MODULE_FORCE_UNLOAD - bool "Forced module unloading" - depends on MODULE_UNLOAD - help - This option allows you to force a module to unload, even if the - kernel believes it is unsafe: the kernel will remove the module - without waiting for anyone to stop using it (using the -f option to - rmmod). This is mainly for kernel developers and desperate users. - If unsure, say N. - -config MODVERSIONS - bool "Module versioning support" - help - Usually, you have to use modules compiled with your kernel. - Saying Y here makes it sometimes possible to use modules - compiled for different kernels, by adding enough information - to the modules to (hopefully) spot any changes which would - make them incompatible with the kernel you are running. If - unsure, say N. - -config ASM_MODVERSIONS - bool - default HAVE_ASM_MODVERSIONS && MODVERSIONS - help - This enables module versioning for exported symbols also from - assembly. This can be enabled only when the target architecture - supports it. - -config MODULE_REL_CRCS - bool - depends on MODVERSIONS - -config MODULE_SRCVERSION_ALL - bool "Source checksum for all modules" - help - Modules which contain a MODULE_VERSION get an extra "srcversion" - field inserted into their modinfo section, which contains a - sum of the source files which made it. This helps maintainers - see exactly which source was used to build a module (since - others sometimes change the module source without updating - the version). With this option, such a "srcversion" field - will be created for all modules. If unsure, say N. - -config MODULE_SIG - bool "Module signature verification" - select MODULE_SIG_FORMAT - help - Check modules for valid signatures upon load: the signature - is simply appended to the module. For more information see - <file:Documentation/admin-guide/module-signing.rst>. - - Note that this option adds the OpenSSL development packages as a - kernel build dependency so that the signing tool can use its crypto - library. - - You should enable this option if you wish to use either - CONFIG_SECURITY_LOCKDOWN_LSM or lockdown functionality imposed via - another LSM - otherwise unsigned modules will be loadable regardless - of the lockdown policy. - - !!!WARNING!!! If you enable this option, you MUST make sure that the - module DOES NOT get stripped after being signed. This includes the - debuginfo strip done by some packagers (such as rpmbuild) and - inclusion into an initramfs that wants the module size reduced. - -config MODULE_SIG_FORCE - bool "Require modules to be validly signed" - depends on MODULE_SIG - help - Reject unsigned modules or signed modules for which we don't have a - key. Without this, such modules will simply taint the kernel. - -config MODULE_SIG_ALL - bool "Automatically sign all modules" - default y - depends on MODULE_SIG - help - Sign all modules during make modules_install. Without this option, - modules must be signed manually, using the scripts/sign-file tool. - -comment "Do not forget to sign required modules with scripts/sign-file" - depends on MODULE_SIG_FORCE && !MODULE_SIG_ALL - -choice - prompt "Which hash algorithm should modules be signed with?" - depends on MODULE_SIG - help - This determines which sort of hashing algorithm will be used during - signature generation. This algorithm _must_ be built into the kernel - directly so that signature verification can take place. It is not - possible to load a signed module containing the algorithm to check - the signature on that module. - -config MODULE_SIG_SHA1 - bool "Sign modules with SHA-1" - select CRYPTO_SHA1 - -config MODULE_SIG_SHA224 - bool "Sign modules with SHA-224" - select CRYPTO_SHA256 - -config MODULE_SIG_SHA256 - bool "Sign modules with SHA-256" - select CRYPTO_SHA256 - -config MODULE_SIG_SHA384 - bool "Sign modules with SHA-384" - select CRYPTO_SHA512 - -config MODULE_SIG_SHA512 - bool "Sign modules with SHA-512" - select CRYPTO_SHA512 - -endchoice - -config MODULE_SIG_HASH - string - depends on MODULE_SIG - default "sha1" if MODULE_SIG_SHA1 - default "sha224" if MODULE_SIG_SHA224 - default "sha256" if MODULE_SIG_SHA256 - default "sha384" if MODULE_SIG_SHA384 - default "sha512" if MODULE_SIG_SHA512 - -config MODULE_COMPRESS - bool "Compress modules on installation" - help - - Compresses kernel modules when 'make modules_install' is run; gzip or - xz depending on "Compression algorithm" below. - - module-init-tools MAY support gzip, and kmod MAY support gzip and xz. - - Out-of-tree kernel modules installed using Kbuild will also be - compressed upon installation. - - Note: for modules inside an initrd or initramfs, it's more efficient - to compress the whole initrd or initramfs instead. - - Note: This is fully compatible with signed modules. - - If in doubt, say N. - -choice - prompt "Compression algorithm" - depends on MODULE_COMPRESS - default MODULE_COMPRESS_GZIP - help - This determines which sort of compression will be used during - 'make modules_install'. - - GZIP (default) and XZ are supported. - -config MODULE_COMPRESS_GZIP - bool "GZIP" - -config MODULE_COMPRESS_XZ - bool "XZ" - -endchoice - -config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS - bool "Allow loading of modules with missing namespace imports" - help - Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in - a namespace. A module that makes use of a symbol exported with such a - namespace is required to import the namespace via MODULE_IMPORT_NS(). - There is no technical reason to enforce correct namespace imports, - but it creates consistency between symbols defining namespaces and - users importing namespaces they make use of. This option relaxes this - requirement and lifts the enforcement when loading a module. - - If unsure, say N. - -config UNUSED_SYMBOLS - bool "Enable unused/obsolete exported symbols" - default y if X86 - help - Unused but exported symbols make the kernel needlessly bigger. For - that reason most of these unused exports will soon be removed. This - option is provided temporarily to provide a transition period in case - some external kernel module needs one of these symbols anyway. If you - encounter such a case in your module, consider if you are actually - using the right API. (rationale: since nobody in the kernel is using - this in a module, there is a pretty good chance it's actually the - wrong interface to use). If you really need the symbol, please send a - mail to the linux kernel mailing list mentioning the symbol and why - you really need it, and what the merge plan to the mainline kernel for - your module is. - -config TRIM_UNUSED_KSYMS - bool "Trim unused exported kernel symbols" - depends on !UNUSED_SYMBOLS - help - The kernel and some modules make many symbols available for - other modules to use via EXPORT_SYMBOL() and variants. Depending - on the set of modules being selected in your kernel configuration, - many of those exported symbols might never be used. - - This option allows for unused exported symbols to be dropped from - the build. In turn, this provides the compiler more opportunities - (especially when using LTO) for optimizing the code and reducing - binary size. This might have some security advantages as well. - - If unsure, or if you need to build out-of-tree modules, say N. - -endif # MODULES - -config MODULES_TREE_LOOKUP - def_bool y - depends on PERF_EVENTS || TRACING +source "kernel/module/Kconfig" config INIT_ALL_POSSIBLE bool @@ -2241,6 +1992,9 @@ config ASN1 source "kernel/Kconfig.locks" +config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + bool + config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE bool diff --git a/init/Makefile b/init/Makefile index 6246a06364d0..8316c23bead2 100644 --- a/init/Makefile +++ b/init/Makefile @@ -18,21 +18,44 @@ obj-y += init_task.o mounts-y := do_mounts.o mounts-$(CONFIG_BLK_DEV_RAM) += do_mounts_rd.o mounts-$(CONFIG_BLK_DEV_INITRD) += do_mounts_initrd.o -mounts-$(CONFIG_BLK_DEV_MD) += do_mounts_md.o - -# dependencies on generated files need to be listed explicitly -$(obj)/version.o: include/generated/compile.h - -# compile.h changes depending on hostname, generation number, etc, -# so we regenerate it always. -# mkcompile_h will make sure to only update the -# actual file if its content has changed. - - chk_compile.h = : - quiet_chk_compile.h = echo ' CHK $@' -silent_chk_compile.h = : -include/generated/compile.h: FORCE - @$($(quiet)chk_compile.h) - $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \ - "$(CONFIG_PREEMPT_RT)" "$(CC) $(KBUILD_CFLAGS)" + +# +# UTS_VERSION +# + +smp-flag-$(CONFIG_SMP) := SMP +preempt-flag-$(CONFIG_PREEMPT_BUILD) := PREEMPT +preempt-flag-$(CONFIG_PREEMPT_DYNAMIC) := PREEMPT_DYNAMIC +preempt-flag-$(CONFIG_PREEMPT_RT) := PREEMPT_RT + +build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto)) +build-timestamp = $(or $(KBUILD_BUILD_TIMESTAMP), $(build-timestamp-auto)) + +# Maximum length of UTS_VERSION is 64 chars +filechk_uts_version = \ + utsver=$$(echo '$(pound)'"$(build-version)" $(smp-flag-y) $(preempt-flag-y) "$(build-timestamp)" | cut -b -64); \ + echo '$(pound)'define UTS_VERSION \""$${utsver}"\" + +# +# Build version.c with temporary UTS_VERSION +# + +$(obj)/utsversion-tmp.h: FORCE + $(call filechk,uts_version) + +clean-files += utsversion-tmp.h + +$(obj)/version.o: $(obj)/utsversion-tmp.h +CFLAGS_version.o := -include $(obj)/utsversion-tmp.h + +# +# Build version-timestamp.c with final UTS_VERSION +# + +include/generated/utsversion.h: build-version-auto = $(shell $(srctree)/$(src)/build-version) +include/generated/utsversion.h: build-timestamp-auto = $(shell LC_ALL=C date) +include/generated/utsversion.h: FORCE + $(call filechk,uts_version) + +$(obj)/version-timestamp.o: include/generated/utsversion.h +CFLAGS_version-timestamp.o := -include include/generated/utsversion.h diff --git a/init/build-version b/init/build-version new file mode 100755 index 000000000000..537d45815083 --- /dev/null +++ b/init/build-version @@ -0,0 +1,10 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only + +prev_ver=$(cat .version 2>/dev/null) && +ver=$(expr ${prev_ver} + 1 2>/dev/null) || +ver=1 + +echo ${ver} > .version + +echo ${ver} diff --git a/init/do_mounts.c b/init/do_mounts.c index 0ae9cc22f2ae..811e94daf0a8 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -8,7 +8,6 @@ #include <linux/root_dev.h> #include <linux/security.h> #include <linux/delay.h> -#include <linux/genhd.h> #include <linux/mount.h> #include <linux/device.h> #include <linux/init.h> @@ -23,12 +22,11 @@ #include <linux/nfs_fs.h> #include <linux/nfs_fs_sb.h> #include <linux/nfs_mount.h> +#include <linux/raid/detect.h> #include <uapi/linux/mount.h> #include "do_mounts.h" -int __initdata rd_doload; /* 1 = load RAM disk, 0 = don't load */ - int root_mountflags = MS_RDONLY | MS_SILENT; static char * __initdata root_device_name; static char __initdata saved_root_name[64]; @@ -38,7 +36,7 @@ dev_t ROOT_DEV; static int __init load_ramdisk(char *str) { - rd_doload = simple_strtol(str,NULL,0) & 3; + pr_warn("ignoring the deprecated load_ramdisk= option\n"); return 1; } __setup("load_ramdisk=", load_ramdisk); @@ -77,21 +75,15 @@ struct uuidcmp { */ static int match_dev_by_uuid(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const struct uuidcmp *cmp = data; - struct hd_struct *part = dev_to_part(dev); - - if (!part->info) - goto no_match; - - if (strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) - goto no_match; + if (!bdev->bd_meta_info || + strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) + return 0; return 1; -no_match: - return 0; } - /** * devt_from_partuuid - looks up the dev_t of a partition by its UUID * @uuid_str: char array containing ascii UUID @@ -107,13 +99,10 @@ no_match: */ static dev_t devt_from_partuuid(const char *uuid_str) { - dev_t res = 0; struct uuidcmp cmp; struct device *dev = NULL; - struct gendisk *disk; - struct hd_struct *part; + dev_t devt = 0; int offset = 0; - bool clear_root_wait = false; char *slash; cmp.uuid = uuid_str; @@ -122,52 +111,43 @@ static dev_t devt_from_partuuid(const char *uuid_str) /* Check for optional partition number offset attributes. */ if (slash) { char c = 0; + /* Explicitly fail on poor PARTUUID syntax. */ - if (sscanf(slash + 1, - "PARTNROFF=%d%c", &offset, &c) != 1) { - clear_root_wait = true; - goto done; - } + if (sscanf(slash + 1, "PARTNROFF=%d%c", &offset, &c) != 1) + goto clear_root_wait; cmp.len = slash - uuid_str; } else { cmp.len = strlen(uuid_str); } - if (!cmp.len) { - clear_root_wait = true; - goto done; - } + if (!cmp.len) + goto clear_root_wait; - dev = class_find_device(&block_class, NULL, &cmp, - &match_dev_by_uuid); + dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid); if (!dev) - goto done; - - res = dev->devt; - - /* Attempt to find the partition by offset. */ - if (!offset) - goto no_offset; + return 0; - res = 0; - disk = part_to_disk(dev_to_part(dev)); - part = disk_get_part(disk, dev_to_part(dev)->partno + offset); - if (part) { - res = part_devt(part); - put_device(part_to_dev(part)); + if (offset) { + /* + * Attempt to find the requested partition by adding an offset + * to the partition number found by UUID. + */ + devt = part_devt(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); + } else { + devt = dev->devt; } -no_offset: put_device(dev); -done: - if (clear_root_wait) { - pr_err("VFS: PARTUUID= is invalid.\n" - "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); - if (root_wait) - pr_err("Disabling rootwait; root= is invalid.\n"); - root_wait = 0; - } - return res; + return devt; + +clear_root_wait: + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); + if (root_wait) + pr_err("Disabling rootwait; root= is invalid.\n"); + root_wait = 0; + return 0; } /** @@ -179,15 +159,90 @@ done: */ static int match_dev_by_label(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const char *label = data; - struct hd_struct *part = dev_to_part(dev); - if (part->info && !strcmp(label, part->info->volname)) - return 1; + if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) + return 0; + return 1; +} - return 0; +static dev_t devt_from_partlabel(const char *label) +{ + struct device *dev; + dev_t devt = 0; + + dev = class_find_device(&block_class, NULL, label, &match_dev_by_label); + if (dev) { + devt = dev->devt; + put_device(dev); + } + + return devt; +} + +static dev_t devt_from_devname(const char *name) +{ + dev_t devt = 0; + int part; + char s[32]; + char *p; + + if (strlen(name) > 31) + return 0; + strcpy(s, name); + for (p = s; *p; p++) { + if (*p == '/') + *p = '!'; + } + + devt = blk_lookup_devt(s, 0); + if (devt) + return devt; + + /* + * Try non-existent, but valid partition, which may only exist after + * opening the device, like partitioned md devices. + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + return 0; + + /* try disk name without <part number> */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + devt = blk_lookup_devt(s, part); + if (devt) + return devt; + + /* try disk name without p<part number> */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + return 0; + p[-1] = '\0'; + return blk_lookup_devt(s, part); +} +#endif /* CONFIG_BLOCK */ + +static dev_t devt_from_devnum(const char *name) +{ + unsigned maj, min, offset; + dev_t devt = 0; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { + devt = MKDEV(maj, min); + if (maj != MAJOR(devt) || min != MINOR(devt)) + return 0; + } else { + devt = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + return 0; + } + + return devt; } -#endif /* * Convert a name into device number. We accept the following variants: @@ -219,107 +274,29 @@ static int match_dev_by_label(struct device *dev, const void *data) * name contains slashes, the device name has them replaced with * bangs. */ - dev_t name_to_dev_t(const char *name) { - char s[32]; - char *p; - dev_t res = 0; - int part; - + if (strcmp(name, "/dev/nfs") == 0) + return Root_NFS; + if (strcmp(name, "/dev/cifs") == 0) + return Root_CIFS; + if (strcmp(name, "/dev/ram") == 0) + return Root_RAM0; #ifdef CONFIG_BLOCK - if (strncmp(name, "PARTUUID=", 9) == 0) { - name += 9; - res = devt_from_partuuid(name); - if (!res) - goto fail; - goto done; - } else if (strncmp(name, "PARTLABEL=", 10) == 0) { - struct device *dev; - - dev = class_find_device(&block_class, NULL, name + 10, - &match_dev_by_label); - if (!dev) - goto fail; - - res = dev->devt; - put_device(dev); - goto done; - } + if (strncmp(name, "PARTUUID=", 9) == 0) + return devt_from_partuuid(name + 9); + if (strncmp(name, "PARTLABEL=", 10) == 0) + return devt_from_partlabel(name + 10); + if (strncmp(name, "/dev/", 5) == 0) + return devt_from_devname(name + 5); #endif - - if (strncmp(name, "/dev/", 5) != 0) { - unsigned maj, min, offset; - char dummy; - - if ((sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2) || - (sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3)) { - res = MKDEV(maj, min); - if (maj != MAJOR(res) || min != MINOR(res)) - goto fail; - } else { - res = new_decode_dev(simple_strtoul(name, &p, 16)); - if (*p) - goto fail; - } - goto done; - } - - name += 5; - res = Root_NFS; - if (strcmp(name, "nfs") == 0) - goto done; - res = Root_CIFS; - if (strcmp(name, "cifs") == 0) - goto done; - res = Root_RAM0; - if (strcmp(name, "ram") == 0) - goto done; - - if (strlen(name) > 31) - goto fail; - strcpy(s, name); - for (p = s; *p; p++) - if (*p == '/') - *p = '!'; - res = blk_lookup_devt(s, 0); - if (res) - goto done; - - /* - * try non-existent, but valid partition, which may only exist - * after revalidating the disk, like partitioned md devices - */ - while (p > s && isdigit(p[-1])) - p--; - if (p == s || !*p || *p == '0') - goto fail; - - /* try disk name without <part number> */ - part = simple_strtoul(p, NULL, 10); - *p = '\0'; - res = blk_lookup_devt(s, part); - if (res) - goto done; - - /* try disk name without p<part number> */ - if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') - goto fail; - p[-1] = '\0'; - res = blk_lookup_devt(s, part); - if (res) - goto done; - -fail: - return 0; -done: - return res; + return devt_from_devnum(name); } EXPORT_SYMBOL_GPL(name_to_dev_t); static int __init root_dev_setup(char *line) { - strlcpy(saved_root_name, line, sizeof(saved_root_name)); + strscpy(saved_root_name, line, sizeof(saved_root_name)); return 1; } @@ -360,31 +337,21 @@ __setup("rootflags=", root_data_setup); __setup("rootfstype=", fs_names_setup); __setup("rootdelay=", root_delay_setup); -static void __init get_fs_names(char *page) +/* This can return zero length strings. Caller should check */ +static int __init split_fs_names(char *page, size_t size, char *names) { - char *s = page; - - if (root_fs_names) { - strcpy(page, root_fs_names); - while (*s++) { - if (s[-1] == ',') - s[-1] = '\0'; - } - } else { - int len = get_filesystem_list(page); - char *p, *next; - - page[len] = '\0'; - for (p = page-1; p; p = next) { - next = strchr(++p, '\n'); - if (*p++ != '\t') - continue; - while ((*s++ = *p++) != '\n') - ; - s[-1] = '\0'; + int count = 1; + char *p = page; + + strscpy(p, root_fs_names, size); + while (*p++) { + if (p[-1] == ',') { + p[-1] = '\0'; + count++; } } - *s = '\0'; + + return count; } static int __init do_mount_root(const char *name, const char *fs, @@ -396,20 +363,20 @@ static int __init do_mount_root(const char *name, const char *fs, int ret; if (data) { - /* do_mount() requires a full page as fifth argument */ + /* init_mount() requires a full page as fifth argument */ p = alloc_page(GFP_KERNEL); if (!p) return -ENOMEM; data_page = page_address(p); - /* zero-pad. do_mount() will make sure it's terminated */ + /* zero-pad. init_mount() will make sure it's terminated */ strncpy(data_page, data, PAGE_SIZE); } - ret = do_mount(name, "/root", fs, flags, data_page); + ret = init_mount(name, "/root", fs, flags, data_page); if (ret) goto out; - ksys_chdir("/root"); + init_chdir("/root"); s = current->fs->pwd.dentry->d_sb; ROOT_DEV = s->s_dev; printk(KERN_INFO @@ -429,16 +396,22 @@ void __init mount_block_root(char *name, int flags) struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); char *p; -#ifdef CONFIG_BLOCK char b[BDEVNAME_SIZE]; -#else - const char *b = name; -#endif - - get_fs_names(fs_names); + int num_fs, i; + + scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); + if (root_fs_names) + num_fs = split_fs_names(fs_names, PAGE_SIZE, root_fs_names); + else + num_fs = list_bdev_fs_names(fs_names, PAGE_SIZE); retry: - for (p = fs_names; *p; p += strlen(p)+1) { - int err = do_mount_root(name, p, flags, root_mount_data); + for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) { + int err; + + if (!*p) + continue; + err = do_mount_root(name, p, flags, root_mount_data); switch (err) { case 0: goto out; @@ -451,18 +424,11 @@ retry: * and bad superblock on root device. * and give them a list of the available devices */ -#ifdef CONFIG_BLOCK - __bdevname(ROOT_DEV, b); -#endif printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", root_device_name, b, err); printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); printk_all_partitions(); -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " - "explicit textual name for \"root=\" boot option.\n"); -#endif panic("VFS: Unable to mount root fs on %s", b); } if (!(flags & SB_RDONLY)) { @@ -473,12 +439,9 @@ retry: printk("List of all partitions:\n"); printk_all_partitions(); printk("No filesystem could mount root, tried: "); - for (p = fs_names; *p; p += strlen(p)+1) + for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1) printk(" %s", p); printk("\n"); -#ifdef CONFIG_BLOCK - __bdevname(ROOT_DEV, b); -#endif panic("VFS: Unable to mount root fs on %s", b); out: put_page(page); @@ -560,68 +523,66 @@ static int __init mount_cifs_root(void) } #endif -#if defined(CONFIG_BLK_DEV_RAM) || defined(CONFIG_BLK_DEV_FD) -void __init change_floppy(char *fmt, ...) +static bool __init fs_is_nodev(char *fstype) { - struct termios termios; - char buf[80]; - char c; - int fd; - va_list args; - va_start(args, fmt); - vsprintf(buf, fmt, args); - va_end(args); - fd = ksys_open("/dev/root", O_RDWR | O_NDELAY, 0); - if (fd >= 0) { - ksys_ioctl(fd, FDEJECT, 0); - ksys_close(fd); + struct file_system_type *fs = get_fs_type(fstype); + bool ret = false; + + if (fs) { + ret = !(fs->fs_flags & FS_REQUIRES_DEV); + put_filesystem(fs); } - printk(KERN_NOTICE "VFS: Insert %s and press ENTER\n", buf); - fd = ksys_open("/dev/console", O_RDWR, 0); - if (fd >= 0) { - ksys_ioctl(fd, TCGETS, (long)&termios); - termios.c_lflag &= ~ICANON; - ksys_ioctl(fd, TCSETSF, (long)&termios); - ksys_read(fd, &c, 1); - termios.c_lflag |= ICANON; - ksys_ioctl(fd, TCSETSF, (long)&termios); - ksys_close(fd); + + return ret; +} + +static int __init mount_nodev_root(void) +{ + char *fs_names, *fstype; + int err = -EINVAL; + int num_fs, i; + + fs_names = (void *)__get_free_page(GFP_KERNEL); + if (!fs_names) + return -EINVAL; + num_fs = split_fs_names(fs_names, PAGE_SIZE, root_fs_names); + + for (i = 0, fstype = fs_names; i < num_fs; + i++, fstype += strlen(fstype) + 1) { + if (!*fstype) + continue; + if (!fs_is_nodev(fstype)) + continue; + err = do_mount_root(root_device_name, fstype, root_mountflags, + root_mount_data); + if (!err) + break; } + + free_page((unsigned long)fs_names); + return err; } -#endif void __init mount_root(void) { #ifdef CONFIG_ROOT_NFS if (ROOT_DEV == Root_NFS) { - if (mount_nfs_root()) - return; - - printk(KERN_ERR "VFS: Unable to mount root fs via NFS, trying floppy.\n"); - ROOT_DEV = Root_FD0; + if (!mount_nfs_root()) + printk(KERN_ERR "VFS: Unable to mount root fs via NFS.\n"); + return; } #endif #ifdef CONFIG_CIFS_ROOT if (ROOT_DEV == Root_CIFS) { - if (mount_cifs_root()) - return; - - printk(KERN_ERR "VFS: Unable to mount root fs via SMB, trying floppy.\n"); - ROOT_DEV = Root_FD0; + if (!mount_cifs_root()) + printk(KERN_ERR "VFS: Unable to mount root fs via SMB.\n"); + return; } #endif -#ifdef CONFIG_BLK_DEV_FD - if (MAJOR(ROOT_DEV) == FLOPPY_MAJOR) { - /* rd_doload is 2 for a dual initrd/ramload setup */ - if (rd_doload==2) { - if (rd_load_disk(1)) { - ROOT_DEV = Root_RAM1; - root_device_name = NULL; - } - } else - change_floppy("root floppy"); + if (ROOT_DEV == 0 && root_device_name && root_fs_names) { + if (mount_nodev_root() == 0) + return; } -#endif #ifdef CONFIG_BLOCK { int err = create_dev("/dev/root", ROOT_DEV); @@ -638,8 +599,6 @@ void __init mount_root(void) */ void __init prepare_namespace(void) { - int is_floppy; - if (root_delay) { printk(KERN_INFO "Waiting %d sec before mounting root device...\n", root_delay); @@ -682,16 +641,11 @@ void __init prepare_namespace(void) async_synchronize_full(); } - is_floppy = MAJOR(ROOT_DEV) == FLOPPY_MAJOR; - - if (is_floppy && rd_doload && rd_load_disk(0)) - ROOT_DEV = Root_RAM0; - mount_root(); out: devtmpfs_mount(); - do_mount(".", "/", NULL, MS_MOVE, NULL); - ksys_chroot("."); + init_mount(".", "/", NULL, MS_MOVE, NULL); + init_chroot("."); } static bool is_tmpfs; diff --git a/init/do_mounts.h b/init/do_mounts.h index 0bb0806de4ce..7a29ac3e427b 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -8,26 +8,16 @@ #include <linux/mount.h> #include <linux/major.h> #include <linux/root_dev.h> +#include <linux/init_syscalls.h> -void change_floppy(char *fmt, ...); void mount_block_root(char *name, int flags); void mount_root(void); extern int root_mountflags; -static inline int create_dev(char *name, dev_t dev) +static inline __init int create_dev(char *name, dev_t dev) { - ksys_unlink(name); - return ksys_mknod(name, S_IFBLK|0600, new_encode_dev(dev)); -} - -static inline u32 bstat(char *name) -{ - struct kstat stat; - if (vfs_stat(name, &stat) != 0) - return 0; - if (!S_ISBLK(stat.mode)) - return 0; - return stat.rdev; + init_unlink(name); + return init_mknod(name, S_IFBLK | 0600, new_encode_dev(dev)); } #ifdef CONFIG_BLK_DEV_RAM @@ -51,13 +41,3 @@ bool __init initrd_load(void); static inline bool initrd_load(void) { return false; } #endif - -#ifdef CONFIG_BLK_DEV_MD - -void md_run_setup(void); - -#else - -static inline void md_run_setup(void) {} - -#endif diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index dab8b1151b56..34731241377d 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -14,12 +14,32 @@ unsigned long initrd_start, initrd_end; int initrd_below_start_ok; -unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ +static unsigned int real_root_dev; /* do_proc_dointvec cannot handle kdev_t */ static int __initdata mount_initrd = 1; phys_addr_t phys_initrd_start __initdata; unsigned long phys_initrd_size __initdata; +#ifdef CONFIG_SYSCTL +static struct ctl_table kern_do_mounts_initrd_table[] = { + { + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + +static __init int kernel_do_mounts_initrd_sysctls_init(void) +{ + register_sysctl_init("kernel", kern_do_mounts_initrd_table); + return 0; +} +late_initcall(kernel_do_mounts_initrd_sysctls_init); +#endif /* CONFIG_SYSCTL */ + static int __init no_initrd(char *str) { mount_initrd = 0; @@ -28,7 +48,7 @@ static int __init no_initrd(char *str) __setup("noinitrd", no_initrd); -static int __init early_initrd(char *p) +static int __init early_initrdmem(char *p) { phys_addr_t start; unsigned long size; @@ -43,16 +63,22 @@ static int __init early_initrd(char *p) } return 0; } +early_param("initrdmem", early_initrdmem); + +static int __init early_initrd(char *p) +{ + return early_initrdmem(p); +} early_param("initrd", early_initrd); -static int init_linuxrc(struct subprocess_info *info, struct cred *new) +static int __init init_linuxrc(struct subprocess_info *info, struct cred *new) { ksys_unshare(CLONE_FS | CLONE_FILES); console_on_rootfs(); /* move initrd over / and chdir/chroot in initrd root */ - ksys_chdir("/root"); - do_mount(".", "/", NULL, MS_MOVE, NULL); - ksys_chroot("."); + init_chdir("/root"); + init_mount(".", "/", NULL, MS_MOVE, NULL); + init_chroot("."); ksys_setsid(); return 0; } @@ -64,61 +90,46 @@ static void __init handle_initrd(void) extern char *envp_init[]; int error; + pr_warn("using deprecated initrd support, will be removed in 2021.\n"); + real_root_dev = new_encode_dev(ROOT_DEV); create_dev("/dev/root.old", Root_RAM0); /* mount initrd on rootfs' /root */ mount_block_root("/dev/root.old", root_mountflags & ~MS_RDONLY); - ksys_mkdir("/old", 0700); - ksys_chdir("/old"); - - /* - * In case that a resume from disk is carried out by linuxrc or one of - * its children, we need to tell the freezer not to wait for us. - */ - current->flags |= PF_FREEZER_SKIP; + init_mkdir("/old", 0700); + init_chdir("/old"); info = call_usermodehelper_setup("/linuxrc", argv, envp_init, GFP_KERNEL, init_linuxrc, NULL, NULL); if (!info) return; - call_usermodehelper_exec(info, UMH_WAIT_PROC); - - current->flags &= ~PF_FREEZER_SKIP; + call_usermodehelper_exec(info, UMH_WAIT_PROC|UMH_FREEZABLE); /* move initrd to rootfs' /old */ - do_mount("..", ".", NULL, MS_MOVE, NULL); + init_mount("..", ".", NULL, MS_MOVE, NULL); /* switch root and cwd back to / of rootfs */ - ksys_chroot(".."); + init_chroot(".."); if (new_decode_dev(real_root_dev) == Root_RAM0) { - ksys_chdir("/old"); + init_chdir("/old"); return; } - ksys_chdir("/"); + init_chdir("/"); ROOT_DEV = new_decode_dev(real_root_dev); mount_root(); printk(KERN_NOTICE "Trying to move old root to /initrd ... "); - error = do_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL); + error = init_mount("/old", "/root/initrd", NULL, MS_MOVE, NULL); if (!error) printk("okay\n"); else { - int fd = ksys_open("/dev/root.old", O_RDWR, 0); if (error == -ENOENT) printk("/initrd does not exist. Ignored.\n"); else printk("failed\n"); printk(KERN_NOTICE "Unmounting old root\n"); - ksys_umount("/old", MNT_DETACH); - printk(KERN_NOTICE "Trying to free ramdisk memory ... "); - if (fd < 0) { - error = fd; - } else { - error = ksys_ioctl(fd, BLKFLSBUF, 0); - ksys_close(fd); - } - printk(!error ? "okay\n" : "failed\n"); + init_umount("/old", MNT_DETACH); } } @@ -133,11 +144,11 @@ bool __init initrd_load(void) * mounted in the normal path. */ if (rd_load_image("/initrd.image") && ROOT_DEV != Root_RAM0) { - ksys_unlink("/initrd.image"); + init_unlink("/initrd.image"); handle_initrd(); return true; } } - ksys_unlink("/initrd.image"); + init_unlink("/initrd.image"); return false; } diff --git a/init/do_mounts_md.c b/init/do_mounts_md.c deleted file mode 100644 index b84031528dd4..000000000000 --- a/init/do_mounts_md.c +++ /dev/null @@ -1,304 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/delay.h> -#include <linux/raid/md_u.h> -#include <linux/raid/md_p.h> - -#include "do_mounts.h" - -/* - * When md (and any require personalities) are compiled into the kernel - * (not a module), arrays can be assembles are boot time using with AUTODETECT - * where specially marked partitions are registered with md_autodetect_dev(), - * and with MD_BOOT where devices to be collected are given on the boot line - * with md=..... - * The code for that is here. - */ - -#ifdef CONFIG_MD_AUTODETECT -static int __initdata raid_noautodetect; -#else -static int __initdata raid_noautodetect=1; -#endif -static int __initdata raid_autopart; - -static struct { - int minor; - int partitioned; - int level; - int chunk; - char *device_names; -} md_setup_args[256] __initdata; - -static int md_setup_ents __initdata; - -/* - * Parse the command-line parameters given our kernel, but do not - * actually try to invoke the MD device now; that is handled by - * md_setup_drive after the low-level disk drivers have initialised. - * - * 27/11/1999: Fixed to work correctly with the 2.3 kernel (which - * assigns the task of parsing integer arguments to the - * invoked program now). Added ability to initialise all - * the MD devices (by specifying multiple "md=" lines) - * instead of just one. -- KTK - * 18May2000: Added support for persistent-superblock arrays: - * md=n,0,factor,fault,device-list uses RAID0 for device n - * md=n,-1,factor,fault,device-list uses LINEAR for device n - * md=n,device-list reads a RAID superblock from the devices - * elements in device-list are read by name_to_kdev_t so can be - * a hex number or something like /dev/hda1 /dev/sdb - * 2001-06-03: Dave Cinege <dcinege@psychosis.com> - * Shifted name_to_kdev_t() and related operations to md_set_drive() - * for later execution. Rewrote section to make devfs compatible. - */ -static int __init md_setup(char *str) -{ - int minor, level, factor, fault, partitioned = 0; - char *pername = ""; - char *str1; - int ent; - - if (*str == 'd') { - partitioned = 1; - str++; - } - if (get_option(&str, &minor) != 2) { /* MD Number */ - printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); - return 0; - } - str1 = str; - for (ent=0 ; ent< md_setup_ents ; ent++) - if (md_setup_args[ent].minor == minor && - md_setup_args[ent].partitioned == partitioned) { - printk(KERN_WARNING "md: md=%s%d, Specified more than once. " - "Replacing previous definition.\n", partitioned?"d":"", minor); - break; - } - if (ent >= ARRAY_SIZE(md_setup_args)) { - printk(KERN_WARNING "md: md=%s%d - too many md initialisations\n", partitioned?"d":"", minor); - return 0; - } - if (ent >= md_setup_ents) - md_setup_ents++; - switch (get_option(&str, &level)) { /* RAID level */ - case 2: /* could be 0 or -1.. */ - if (level == 0 || level == LEVEL_LINEAR) { - if (get_option(&str, &factor) != 2 || /* Chunk Size */ - get_option(&str, &fault) != 2) { - printk(KERN_WARNING "md: Too few arguments supplied to md=.\n"); - return 0; - } - md_setup_args[ent].level = level; - md_setup_args[ent].chunk = 1 << (factor+12); - if (level == LEVEL_LINEAR) - pername = "linear"; - else - pername = "raid0"; - break; - } - /* FALL THROUGH */ - case 1: /* the first device is numeric */ - str = str1; - /* FALL THROUGH */ - case 0: - md_setup_args[ent].level = LEVEL_NONE; - pername="super-block"; - } - - printk(KERN_INFO "md: Will configure md%d (%s) from %s, below.\n", - minor, pername, str); - md_setup_args[ent].device_names = str; - md_setup_args[ent].partitioned = partitioned; - md_setup_args[ent].minor = minor; - - return 1; -} - -static void __init md_setup_drive(void) -{ - int minor, i, ent, partitioned; - dev_t dev; - dev_t devices[MD_SB_DISKS+1]; - - for (ent = 0; ent < md_setup_ents ; ent++) { - int fd; - int err = 0; - char *devname; - mdu_disk_info_t dinfo; - char name[16]; - - minor = md_setup_args[ent].minor; - partitioned = md_setup_args[ent].partitioned; - devname = md_setup_args[ent].device_names; - - sprintf(name, "/dev/md%s%d", partitioned?"_d":"", minor); - if (partitioned) - dev = MKDEV(mdp_major, minor << MdpMinorShift); - else - dev = MKDEV(MD_MAJOR, minor); - create_dev(name, dev); - for (i = 0; i < MD_SB_DISKS && devname != NULL; i++) { - char *p; - char comp_name[64]; - u32 rdev; - - p = strchr(devname, ','); - if (p) - *p++ = 0; - - dev = name_to_dev_t(devname); - if (strncmp(devname, "/dev/", 5) == 0) - devname += 5; - snprintf(comp_name, 63, "/dev/%s", devname); - rdev = bstat(comp_name); - if (rdev) - dev = new_decode_dev(rdev); - if (!dev) { - printk(KERN_WARNING "md: Unknown device name: %s\n", devname); - break; - } - - devices[i] = dev; - - devname = p; - } - devices[i] = 0; - - if (!i) - continue; - - printk(KERN_INFO "md: Loading md%s%d: %s\n", - partitioned ? "_d" : "", minor, - md_setup_args[ent].device_names); - - fd = ksys_open(name, 0, 0); - if (fd < 0) { - printk(KERN_ERR "md: open failed - cannot start " - "array %s\n", name); - continue; - } - if (ksys_ioctl(fd, SET_ARRAY_INFO, 0) == -EBUSY) { - printk(KERN_WARNING - "md: Ignoring md=%d, already autodetected. (Use raid=noautodetect)\n", - minor); - ksys_close(fd); - continue; - } - - if (md_setup_args[ent].level != LEVEL_NONE) { - /* non-persistent */ - mdu_array_info_t ainfo; - ainfo.level = md_setup_args[ent].level; - ainfo.size = 0; - ainfo.nr_disks =0; - ainfo.raid_disks =0; - while (devices[ainfo.raid_disks]) - ainfo.raid_disks++; - ainfo.md_minor =minor; - ainfo.not_persistent = 1; - - ainfo.state = (1 << MD_SB_CLEAN); - ainfo.layout = 0; - ainfo.chunk_size = md_setup_args[ent].chunk; - err = ksys_ioctl(fd, SET_ARRAY_INFO, (long)&ainfo); - for (i = 0; !err && i <= MD_SB_DISKS; i++) { - dev = devices[i]; - if (!dev) - break; - dinfo.number = i; - dinfo.raid_disk = i; - dinfo.state = (1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC); - dinfo.major = MAJOR(dev); - dinfo.minor = MINOR(dev); - err = ksys_ioctl(fd, ADD_NEW_DISK, - (long)&dinfo); - } - } else { - /* persistent */ - for (i = 0; i <= MD_SB_DISKS; i++) { - dev = devices[i]; - if (!dev) - break; - dinfo.major = MAJOR(dev); - dinfo.minor = MINOR(dev); - ksys_ioctl(fd, ADD_NEW_DISK, (long)&dinfo); - } - } - if (!err) - err = ksys_ioctl(fd, RUN_ARRAY, 0); - if (err) - printk(KERN_WARNING "md: starting md%d failed\n", minor); - else { - /* reread the partition table. - * I (neilb) and not sure why this is needed, but I cannot - * boot a kernel with devfs compiled in from partitioned md - * array without it - */ - ksys_close(fd); - fd = ksys_open(name, 0, 0); - ksys_ioctl(fd, BLKRRPART, 0); - } - ksys_close(fd); - } -} - -static int __init raid_setup(char *str) -{ - int len, pos; - - len = strlen(str) + 1; - pos = 0; - - while (pos < len) { - char *comma = strchr(str+pos, ','); - int wlen; - if (comma) - wlen = (comma-str)-pos; - else wlen = (len-1)-pos; - - if (!strncmp(str, "noautodetect", wlen)) - raid_noautodetect = 1; - if (!strncmp(str, "autodetect", wlen)) - raid_noautodetect = 0; - if (strncmp(str, "partitionable", wlen)==0) - raid_autopart = 1; - if (strncmp(str, "part", wlen)==0) - raid_autopart = 1; - pos += wlen+1; - } - return 1; -} - -__setup("raid=", raid_setup); -__setup("md=", md_setup); - -static void __init autodetect_raid(void) -{ - int fd; - - /* - * Since we don't want to detect and use half a raid array, we need to - * wait for the known devices to complete their probing - */ - printk(KERN_INFO "md: Waiting for all devices to be available before autodetect\n"); - printk(KERN_INFO "md: If you don't use raid, use raid=noautodetect\n"); - - wait_for_device_probe(); - - fd = ksys_open("/dev/md0", 0, 0); - if (fd >= 0) { - ksys_ioctl(fd, RAID_AUTORUN, raid_autopart); - ksys_close(fd); - } -} - -void __init md_run_setup(void) -{ - create_dev("/dev/md0", MKDEV(MD_MAJOR, 0)); - - if (raid_noautodetect) - printk(KERN_INFO "md: Skipping autodetection of RAID arrays. (raid=autodetect will force)\n"); - else - autodetect_raid(); - md_setup_drive(); -} diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index 32fb049d18f9..ac021ae6e6fa 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -14,12 +14,12 @@ #include <linux/decompress/generic.h> - -int __initdata rd_prompt = 1;/* 1 = prompt for RAM disk, 0 = don't prompt */ +static struct file *in_file, *out_file; +static loff_t in_pos, out_pos; static int __init prompt_ramdisk(char *str) { - rd_prompt = simple_strtol(str,NULL,0) & 1; + pr_warn("ignoring the deprecated prompt_ramdisk= option\n"); return 1; } __setup("prompt_ramdisk=", prompt_ramdisk); @@ -33,7 +33,7 @@ static int __init ramdisk_start_setup(char *str) } __setup("ramdisk_start=", ramdisk_start_setup); -static int __init crd_load(int in_fd, int out_fd, decompress_fn deco); +static int __init crd_load(decompress_fn deco); /* * This routine tries to find a RAM disk image to load, and returns the @@ -55,7 +55,8 @@ static int __init crd_load(int in_fd, int out_fd, decompress_fn deco); * lz4 */ static int __init -identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) +identify_ramdisk_image(struct file *file, loff_t pos, + decompress_fn *decompressor) { const int size = 512; struct minix_super_block *minixsb; @@ -66,6 +67,7 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) unsigned char *buf; const char *compress_name; unsigned long n; + int start_block = rd_image_start; buf = kmalloc(size, GFP_KERNEL); if (!buf) @@ -80,8 +82,8 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) /* * Read block 0 to test for compressed kernel */ - ksys_lseek(fd, start_block * BLOCK_SIZE, 0); - ksys_read(fd, buf, size); + pos = start_block * BLOCK_SIZE; + kernel_read(file, buf, size, &pos); *decompressor = decompress_method(buf, size, &compress_name); if (compress_name) { @@ -126,8 +128,8 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) /* * Read 512 bytes further to check if cramfs is padded */ - ksys_lseek(fd, start_block * BLOCK_SIZE + 0x200, 0); - ksys_read(fd, buf, size); + pos = start_block * BLOCK_SIZE + 0x200; + kernel_read(file, buf, size, &pos); if (cramfsb->magic == CRAMFS_MAGIC) { printk(KERN_NOTICE @@ -140,8 +142,8 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) /* * Read block 1 to test for minix and ext2 superblock */ - ksys_lseek(fd, (start_block+1) * BLOCK_SIZE, 0); - ksys_read(fd, buf, size); + pos = (start_block + 1) * BLOCK_SIZE; + kernel_read(file, buf, size, &pos); /* Try minix */ if (minixsb->s_magic == MINIX_SUPER_MAGIC || @@ -168,17 +170,24 @@ identify_ramdisk_image(int fd, int start_block, decompress_fn *decompressor) start_block); done: - ksys_lseek(fd, start_block * BLOCK_SIZE, 0); kfree(buf); return nblocks; } +static unsigned long nr_blocks(struct file *file) +{ + struct inode *inode = file->f_mapping->host; + + if (!S_ISBLK(inode->i_mode)) + return 0; + return i_size_read(inode) >> 10; +} + int __init rd_load_image(char *from) { int res = 0; - int in_fd, out_fd; unsigned long rd_blocks, devblocks; - int nblocks, i, disk; + int nblocks, i; char *buf = NULL; unsigned short rotate = 0; decompress_fn decompressor = NULL; @@ -186,20 +195,21 @@ int __init rd_load_image(char *from) char rotator[4] = { '|' , '/' , '-' , '\\' }; #endif - out_fd = ksys_open("/dev/ram", O_RDWR, 0); - if (out_fd < 0) + out_file = filp_open("/dev/ram", O_RDWR, 0); + if (IS_ERR(out_file)) goto out; - in_fd = ksys_open(from, O_RDONLY, 0); - if (in_fd < 0) + in_file = filp_open(from, O_RDONLY, 0); + if (IS_ERR(in_file)) goto noclose_input; - nblocks = identify_ramdisk_image(in_fd, rd_image_start, &decompressor); + in_pos = rd_image_start * BLOCK_SIZE; + nblocks = identify_ramdisk_image(in_file, in_pos, &decompressor); if (nblocks < 0) goto done; if (nblocks == 0) { - if (crd_load(in_fd, out_fd, decompressor) == 0) + if (crd_load(decompressor) == 0) goto successful_load; goto done; } @@ -208,11 +218,7 @@ int __init rd_load_image(char *from) * NOTE NOTE: nblocks is not actually blocks but * the number of kibibytes of data to load into a ramdisk. */ - if (ksys_ioctl(out_fd, BLKGETSIZE, (unsigned long)&rd_blocks) < 0) - rd_blocks = 0; - else - rd_blocks >>= 1; - + rd_blocks = nr_blocks(out_file); if (nblocks > rd_blocks) { printk("RAMDISK: image too big! (%dKiB/%ldKiB)\n", nblocks, rd_blocks); @@ -222,13 +228,10 @@ int __init rd_load_image(char *from) /* * OK, time to copy in the data */ - if (ksys_ioctl(in_fd, BLKGETSIZE, (unsigned long)&devblocks) < 0) - devblocks = 0; - else - devblocks >>= 1; - if (strcmp(from, "/initrd.image") == 0) devblocks = nblocks; + else + devblocks = nr_blocks(in_file); if (devblocks == 0) { printk(KERN_ERR "RAMDISK: could not determine device size\n"); @@ -243,24 +246,15 @@ int __init rd_load_image(char *from) printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : ""); - for (i = 0, disk = 1; i < nblocks; i++) { + for (i = 0; i < nblocks; i++) { if (i && (i % devblocks == 0)) { - pr_cont("done disk #%d.\n", disk++); + pr_cont("done disk #1.\n"); rotate = 0; - if (ksys_close(in_fd)) { - printk("Error closing the disk.\n"); - goto noclose_input; - } - change_floppy("disk #%d", disk); - in_fd = ksys_open(from, O_RDONLY, 0); - if (in_fd < 0) { - printk("Error opening disk.\n"); - goto noclose_input; - } - printk("Loading disk #%d... ", disk); + fput(in_file); + break; } - ksys_read(in_fd, buf, BLOCK_SIZE); - ksys_write(out_fd, buf, BLOCK_SIZE); + kernel_read(in_file, buf, BLOCK_SIZE, &in_pos); + kernel_write(out_file, buf, BLOCK_SIZE, &out_pos); #if !defined(CONFIG_S390) if (!(i % 16)) { pr_cont("%c\b", rotator[rotate & 0x3]); @@ -273,19 +267,17 @@ int __init rd_load_image(char *from) successful_load: res = 1; done: - ksys_close(in_fd); + fput(in_file); noclose_input: - ksys_close(out_fd); + fput(out_file); out: kfree(buf); - ksys_unlink("/dev/ram"); + init_unlink("/dev/ram"); return res; } int __init rd_load_disk(int n) { - if (rd_prompt) - change_floppy("root floppy disk to be loaded into RAM disk"); create_dev("/dev/root", ROOT_DEV); create_dev("/dev/ram", MKDEV(RAMDISK_MAJOR, n)); return rd_load_image("/dev/root"); @@ -293,11 +285,10 @@ int __init rd_load_disk(int n) static int exit_code; static int decompress_error; -static int crd_infd, crd_outfd; static long __init compr_fill(void *buf, unsigned long len) { - long r = ksys_read(crd_infd, buf, len); + long r = kernel_read(in_file, buf, len, &in_pos); if (r < 0) printk(KERN_ERR "RAMDISK: error while reading compressed data"); else if (r == 0) @@ -307,7 +298,7 @@ static long __init compr_fill(void *buf, unsigned long len) static long __init compr_flush(void *window, unsigned long outcnt) { - long written = ksys_write(crd_outfd, window, outcnt); + long written = kernel_write(out_file, window, outcnt, &out_pos); if (written != outcnt) { if (decompress_error == 0) printk(KERN_ERR @@ -326,11 +317,9 @@ static void __init error(char *x) decompress_error = 1; } -static int __init crd_load(int in_fd, int out_fd, decompress_fn deco) +static int __init crd_load(decompress_fn deco) { int result; - crd_infd = in_fd; - crd_outfd = out_fd; if (!deco) { pr_emerg("Invalid ramdisk decompression routine. " diff --git a/init/init_task.c b/init/init_task.c index 9e5cbe5eab7b..ff6c4b9bfe6b 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -11,8 +11,8 @@ #include <linux/mm.h> #include <linux/audit.h> #include <linux/numa.h> +#include <linux/scs.h> -#include <asm/pgtable.h> #include <linux/uaccess.h> static struct signal_struct init_signals = { @@ -26,6 +26,7 @@ static struct signal_struct init_signals = { .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), + .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), .cputimer = { @@ -49,6 +50,13 @@ static struct sighand_struct init_sighand = { .signalfd_wqh = __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh), }; +#ifdef CONFIG_SHADOW_CALL_STACK +unsigned long init_shadow_call_stack[SCS_SIZE / sizeof(long)] + __init_task_data = { + [(SCS_SIZE / sizeof(long)) - 1] = SCS_END_MAGIC +}; +#endif + /* * Set up the first task table, touch at your own risk!. Base=0, * limit=0x1fffff (=2MB) @@ -57,12 +65,13 @@ struct task_struct init_task #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK __init_task_data #endif + __aligned(L1_CACHE_BYTES) = { #ifdef CONFIG_THREAD_INFO_IN_TASK .thread_info = INIT_THREAD_INFO(init_task), .stack_refcount = REFCOUNT_INIT(1), #endif - .state = 0, + .__state = 0, .stack = init_stack, .usage = REFCOUNT_INIT(2), .flags = PF_KTHREAD, @@ -71,6 +80,7 @@ struct task_struct init_task .normal_prio = MAX_PRIO - 20, .policy = SCHED_NORMAL, .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, .cpus_mask = CPU_MASK_ALL, .nr_cpus_allowed= NR_CPUS, .mm = NULL, @@ -105,6 +115,9 @@ struct task_struct init_task .thread = INIT_THREAD, .fs = &init_fs, .files = &init_files, +#ifdef CONFIG_IO_URING + .io_uring = NULL, +#endif .signal = &init_signals, .sighand = &init_sighand, .nsproxy = &init_nsproxy, @@ -140,8 +153,15 @@ struct task_struct init_task .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), .rcu_tasks_idle_cpu = -1, #endif +#ifdef CONFIG_TASKS_TRACE_RCU + .trc_reader_nesting = 0, + .trc_reader_special.s = 0, + .trc_holdout_list = LIST_HEAD_INIT(init_task.trc_holdout_list), + .trc_blkd_node = LIST_HEAD_INIT(init_task.trc_blkd_node), +#endif #ifdef CONFIG_CPUSETS - .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), + .mems_allowed_seq = SEQCNT_SPINLOCK_ZERO(init_task.mems_allowed_seq, + &init_task.alloc_lock), #endif #ifdef CONFIG_RT_MUTEXES .pi_waiters = RB_ROOT_CACHED, @@ -158,9 +178,14 @@ struct task_struct init_task .numa_group = NULL, .numa_faults = NULL, #endif -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) .kasan_depth = 1, #endif +#ifdef CONFIG_KCSAN + .kcsan_ctx = { + .scoped_accesses = {LIST_POISON1, NULL}, + }, +#endif #ifdef CONFIG_TRACE_IRQFLAGS .softirqs_enabled = 1, #endif @@ -170,7 +195,8 @@ struct task_struct init_task .lockdep_recursion = 0, #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER - .ret_stack = NULL, + .ret_stack = NULL, + .tracing_graph_pause = ATOMIC_INIT(0), #endif #if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION) .trace_recursion = 0, @@ -181,6 +207,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_SECCOMP_FILTER + .seccomp = { .filter_count = ATOMIC_INIT(0) }, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/init/initramfs.c b/init/initramfs.c index 8ec1be4d7d51..2f5bfb7d7652 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> +#include <linux/async.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/types.h> @@ -11,14 +12,23 @@ #include <linux/utime.h> #include <linux/file.h> #include <linux/memblock.h> +#include <linux/mm.h> +#include <linux/namei.h> +#include <linux/init_syscalls.h> +#include <linux/task_work.h> +#include <linux/umh.h> -static ssize_t __init xwrite(int fd, const char *p, size_t count) +static __initdata bool csum_present; +static __initdata u32 io_csum; + +static ssize_t __init xwrite(struct file *file, const unsigned char *p, + size_t count, loff_t *pos) { ssize_t out = 0; /* sys_write only can write MAX_RW_COUNT aka 2G-4K bytes at most */ while (count) { - ssize_t rv = ksys_write(fd, p, count); + ssize_t rv = kernel_write(file, p, count, pos); if (rv < 0) { if (rv == -EINTR || rv == -EAGAIN) @@ -27,6 +37,13 @@ static ssize_t __init xwrite(int fd, const char *p, size_t count) } else if (rv == 0) break; + if (csum_present) { + ssize_t i; + + for (i = 0; i < rv; i++) + io_csum += p[i]; + } + p += rv; out += rv; count -= rv; @@ -42,6 +59,16 @@ static void __init error(char *x) message = x; } +static void panic_show_mem(const char *fmt, ...) +{ + va_list args; + + show_mem(0, NULL); + va_start(args, fmt); + panic(fmt, args); + va_end(args); +} + /* link hash */ #define N_ALIGN(len) ((((len) + 1) & ~3) + 2) @@ -77,7 +104,7 @@ static char __init *find_link(int major, int minor, int ino, } q = kmalloc(sizeof(struct hash), GFP_KERNEL); if (!q) - panic("can't allocate link hash entry"); + panic_show_mem("can't allocate link hash entry"); q->major = major; q->minor = minor; q->ino = ino; @@ -100,32 +127,36 @@ static void __init free_hash(void) } } -static long __init do_utime(char *filename, time64_t mtime) +#ifdef CONFIG_INITRAMFS_PRESERVE_MTIME +static void __init do_utime(char *filename, time64_t mtime) { - struct timespec64 t[2]; - - t[0].tv_sec = mtime; - t[0].tv_nsec = 0; - t[1].tv_sec = mtime; - t[1].tv_nsec = 0; + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + init_utimes(filename, t); +} - return do_utimes(AT_FDCWD, filename, t, AT_SYMLINK_NOFOLLOW); +static void __init do_utime_path(const struct path *path, time64_t mtime) +{ + struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; + vfs_utimes(path, t); } static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; - char *name; time64_t mtime; + char name[]; }; static void __init dir_add(const char *name, time64_t mtime) { - struct dir_entry *de = kmalloc(sizeof(struct dir_entry), GFP_KERNEL); + size_t nlen = strlen(name) + 1; + struct dir_entry *de; + + de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); if (!de) - panic("can't allocate dir_entry buffer"); + panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); - de->name = kstrdup(name, GFP_KERNEL); + strscpy(de->name, name, nlen); de->mtime = mtime; list_add(&de->list, &dir_list); } @@ -136,10 +167,15 @@ static void __init dir_utime(void) list_for_each_entry_safe(de, tmp, &dir_list, list) { list_del(&de->list); do_utime(de->name, de->mtime); - kfree(de->name); kfree(de); } } +#else +static void __init do_utime(char *filename, time64_t mtime) {} +static void __init do_utime_path(const struct path *path, time64_t mtime) {} +static void __init dir_add(const char *name, time64_t mtime) {} +static void __init dir_utime(void) {} +#endif static __initdata time64_t mtime; @@ -151,15 +187,16 @@ static __initdata unsigned long body_len, name_len; static __initdata uid_t uid; static __initdata gid_t gid; static __initdata unsigned rdev; +static __initdata u32 hdr_csum; static void __init parse_header(char *s) { - unsigned long parsed[12]; + unsigned long parsed[13]; char buf[9]; int i; buf[8] = '\0'; - for (i = 0, s += 6; i < 12; i++, s += 8) { + for (i = 0, s += 6; i < 13; i++, s += 8) { memcpy(buf, s, 8); parsed[i] = simple_strtoul(buf, NULL, 16); } @@ -174,6 +211,7 @@ static void __init parse_header(char *s) minor = parsed[8]; rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); name_len = parsed[11]; + hdr_csum = parsed[12]; } /* FSM */ @@ -200,7 +238,6 @@ static inline void __init eat(unsigned n) byte_count -= n; } -static __initdata char *vcollected; static __initdata char *collected; static long remains __initdata; static __initdata char *collect; @@ -243,12 +280,15 @@ static int __init do_collect(void) static int __init do_header(void) { - if (memcmp(collected, "070707", 6)==0) { - error("incorrect cpio method used: use -H newc option"); - return 1; - } - if (memcmp(collected, "070701", 6)) { - error("no cpio magic"); + if (!memcmp(collected, "070701", 6)) { + csum_present = false; + } else if (!memcmp(collected, "070702", 6)) { + csum_present = true; + } else { + if (memcmp(collected, "070707", 6) == 0) + error("incorrect cpio method used: use -H newc option"); + else + error("no cpio magic"); return 1; } parse_header(collected); @@ -296,11 +336,12 @@ static void __init clean_path(char *path, umode_t fmode) { struct kstat st; - if (!vfs_lstat(path, &st) && (st.mode ^ fmode) & S_IFMT) { + if (!init_stat(path, &st, AT_SYMLINK_NOFOLLOW) && + (st.mode ^ fmode) & S_IFMT) { if (S_ISDIR(st.mode)) - ksys_rmdir(path); + init_rmdir(path); else - ksys_unlink(path); + init_unlink(path); } } @@ -310,13 +351,14 @@ static int __init maybe_link(void) char *old = find_link(major, minor, ino, mode, collected); if (old) { clean_path(collected, 0); - return (ksys_link(old, collected) < 0) ? -1 : 1; + return (init_link(old, collected) < 0) ? -1 : 1; } } return 0; } -static __initdata int wfd; +static __initdata struct file *wfile; +static __initdata loff_t wfile_pos; static int __init do_name(void) { @@ -333,28 +375,29 @@ static int __init do_name(void) int openflags = O_WRONLY|O_CREAT; if (ml != 1) openflags |= O_TRUNC; - wfd = ksys_open(collected, openflags, mode); - - if (wfd >= 0) { - ksys_fchown(wfd, uid, gid); - ksys_fchmod(wfd, mode); - if (body_len) - ksys_ftruncate(wfd, body_len); - vcollected = kstrdup(collected, GFP_KERNEL); - state = CopyFile; - } + wfile = filp_open(collected, openflags, mode); + if (IS_ERR(wfile)) + return 0; + wfile_pos = 0; + io_csum = 0; + + vfs_fchown(wfile, uid, gid); + vfs_fchmod(wfile, mode); + if (body_len) + vfs_truncate(&wfile->f_path, body_len); + state = CopyFile; } } else if (S_ISDIR(mode)) { - ksys_mkdir(collected, mode); - ksys_chown(collected, uid, gid); - ksys_chmod(collected, mode); + init_mkdir(collected, mode); + init_chown(collected, uid, gid, 0); + init_chmod(collected, mode); dir_add(collected, mtime); } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { if (maybe_link() == 0) { - ksys_mknod(collected, mode, rdev); - ksys_chown(collected, uid, gid); - ksys_chmod(collected, mode); + init_mknod(collected, mode, rdev); + init_chown(collected, uid, gid, 0); + init_chmod(collected, mode); do_utime(collected, mtime); } } @@ -364,16 +407,18 @@ static int __init do_name(void) static int __init do_copy(void) { if (byte_count >= body_len) { - if (xwrite(wfd, victim, body_len) != body_len) + if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) error("write error"); - ksys_close(wfd); - do_utime(vcollected, mtime); - kfree(vcollected); + + do_utime_path(&wfile->f_path, mtime); + fput(wfile); + if (csum_present && io_csum != hdr_csum) + error("bad data checksum"); eat(body_len); state = SkipIt; return 0; } else { - if (xwrite(wfd, victim, byte_count) != byte_count) + if (xwrite(wfile, victim, byte_count, &wfile_pos) != byte_count) error("write error"); body_len -= byte_count; eat(byte_count); @@ -385,8 +430,8 @@ static int __init do_symlink(void) { collected[N_ALIGN(name_len) + body_len] = '\0'; clean_path(collected, 0); - ksys_symlink(collected + N_ALIGN(name_len), collected); - ksys_lchown(collected, uid, gid); + init_symlink(collected + N_ALIGN(name_len), collected); + init_chown(collected, uid, gid, AT_SYMLINK_NOFOLLOW); do_utime(collected, mtime); state = SkipIt; next_state = Reset; @@ -437,7 +482,7 @@ static long __init flush_buffer(void *bufv, unsigned long len) return origLen; } -static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ +static unsigned long my_inptr __initdata; /* index of next byte to be processed in inbuf */ #include <linux/decompress/generic.h> @@ -453,7 +498,7 @@ static char * __init unpack_to_rootfs(char *buf, unsigned long len) name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); if (!header_buf || !symlink_buf || !name_buf) - panic("can't allocate buffers"); + panic_show_mem("can't allocate buffers"); state = Start; this_header = 0; @@ -523,18 +568,71 @@ static int __init keepinitrd_setup(char *__unused) __setup("keepinitrd", keepinitrd_setup); #endif +static bool __initdata initramfs_async = true; +static int __init initramfs_async_setup(char *str) +{ + strtobool(str, &initramfs_async); + return 1; +} +__setup("initramfs_async=", initramfs_async_setup); + extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include <linux/initrd.h> #include <linux/kexec.h> -void __weak free_initrd_mem(unsigned long start, unsigned long end) +void __init reserve_initrd_mem(void) +{ + phys_addr_t start; + unsigned long size; + + /* Ignore the virtul address computed during device tree parsing */ + initrd_start = initrd_end = 0; + + if (!phys_initrd_size) + return; + /* + * Round the memory region to page boundaries as per free_initrd_mem() + * This allows us to detect whether the pages overlapping the initrd + * are in use, but more importantly, reserves the entire set of pages + * as we don't want these pages allocated for other purposes. + */ + start = round_down(phys_initrd_start, PAGE_SIZE); + size = phys_initrd_size + (phys_initrd_start - start); + size = round_up(size, PAGE_SIZE); + + if (!memblock_is_region_memory(start, size)) { + pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region", + (u64)start, size); + goto disable; + } + + if (memblock_is_region_reserved(start, size)) { + pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n", + (u64)start, size); + goto disable; + } + + memblock_reserve(start, size); + /* Now convert initrd to virtual addresses */ + initrd_start = (unsigned long)__va(phys_initrd_start); + initrd_end = initrd_start + phys_initrd_size; + initrd_below_start_ok = 1; + + return; +disable: + pr_cont(" - disabling initrd\n"); + initrd_start = 0; + initrd_end = 0; +} + +void __weak __init free_initrd_mem(unsigned long start, unsigned long end) { #ifdef CONFIG_ARCH_KEEP_MEMBLOCK unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE); unsigned long aligned_end = ALIGN(end, PAGE_SIZE); - memblock_free(__pa(aligned_start), aligned_end - aligned_start); + memblock_free((void *)aligned_start, aligned_end - aligned_start); #endif free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, @@ -542,7 +640,7 @@ void __weak free_initrd_mem(unsigned long start, unsigned long end) } #ifdef CONFIG_KEXEC_CORE -static bool kexec_free_initrd(void) +static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); unsigned long crashk_end = (unsigned long)__va(crashk_res.end); @@ -572,91 +670,35 @@ static inline bool kexec_free_initrd(void) #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_BLK_DEV_RAM -#define BUF_SIZE 1024 -static void __init clean_rootfs(void) -{ - int fd; - void *buf; - struct linux_dirent64 *dirp; - int num; - - fd = ksys_open("/", O_RDONLY, 0); - WARN_ON(fd < 0); - if (fd < 0) - return; - buf = kzalloc(BUF_SIZE, GFP_KERNEL); - WARN_ON(!buf); - if (!buf) { - ksys_close(fd); - return; - } - - dirp = buf; - num = ksys_getdents64(fd, dirp, BUF_SIZE); - while (num > 0) { - while (num > 0) { - struct kstat st; - int ret; - - ret = vfs_lstat(dirp->d_name, &st); - WARN_ON_ONCE(ret); - if (!ret) { - if (S_ISDIR(st.mode)) - ksys_rmdir(dirp->d_name); - else - ksys_unlink(dirp->d_name); - } - - num -= dirp->d_reclen; - dirp = (void *)dirp + dirp->d_reclen; - } - dirp = buf; - memset(buf, 0, BUF_SIZE); - num = ksys_getdents64(fd, dirp, BUF_SIZE); - } - - ksys_close(fd); - kfree(buf); -} -#else -static inline void clean_rootfs(void) -{ -} -#endif /* CONFIG_BLK_DEV_RAM */ - -#ifdef CONFIG_BLK_DEV_RAM static void __init populate_initrd_image(char *err) { ssize_t written; - int fd; + struct file *file; + loff_t pos = 0; unpack_to_rootfs(__initramfs_start, __initramfs_size); printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); - fd = ksys_open("/initrd.image", O_WRONLY | O_CREAT, 0700); - if (fd < 0) + file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700); + if (IS_ERR(file)) return; - written = xwrite(fd, (char *)initrd_start, initrd_end - initrd_start); + written = xwrite(file, (char *)initrd_start, initrd_end - initrd_start, + &pos); if (written != initrd_end - initrd_start) pr_err("/initrd.image: incomplete write (%zd != %ld)\n", written, initrd_end - initrd_start); - ksys_close(fd); -} -#else -static void __init populate_initrd_image(char *err) -{ - printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); + fput(file); } #endif /* CONFIG_BLK_DEV_RAM */ -static int __init populate_rootfs(void) +static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) { /* Load the built in initramfs */ char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) - panic("%s", err); /* Failed to decompress INTERNAL initramfs */ + panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) goto done; @@ -668,8 +710,11 @@ static int __init populate_rootfs(void) err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); if (err) { - clean_rootfs(); +#ifdef CONFIG_BLK_DEV_RAM populate_initrd_image(err); +#else + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); +#endif } done: @@ -683,6 +728,35 @@ done: initrd_end = 0; flush_delayed_fput(); + task_work_run(); +} + +static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain); +static async_cookie_t initramfs_cookie; + +void wait_for_initramfs(void) +{ + if (!initramfs_cookie) { + /* + * Something before rootfs_initcall wants to access + * the filesystem/initramfs. Probably a bug. Make a + * note, avoid deadlocking the machine, and let the + * caller's access fail as it used to. + */ + pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n"); + return; + } + async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain); +} +EXPORT_SYMBOL_GPL(wait_for_initramfs); + +static int __init populate_rootfs(void) +{ + initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL, + &initramfs_domain); + usermodehelper_enable(); + if (!initramfs_async) + wait_for_initramfs(); return 0; } rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c index ee4947af823f..aa21add5f7c5 100644 --- a/init/main.c +++ b/init/main.c @@ -33,15 +33,20 @@ #include <linux/nmi.h> #include <linux/percpu.h> #include <linux/kmod.h> +#include <linux/kprobes.h> +#include <linux/kmsan.h> #include <linux/vmalloc.h> #include <linux/kernel_stat.h> #include <linux/start_kernel.h> #include <linux/security.h> #include <linux/smp.h> #include <linux/profile.h> +#include <linux/kfence.h> #include <linux/rcupdate.h> +#include <linux/srcu.h> #include <linux/moduleparam.h> #include <linux/kallsyms.h> +#include <linux/buildid.h> #include <linux/writeback.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -57,12 +62,12 @@ #include <linux/rmap.h> #include <linux/mempolicy.h> #include <linux/key.h> -#include <linux/buffer_head.h> #include <linux/page_ext.h> #include <linux/debug_locks.h> #include <linux/debugobjects.h> #include <linux/lockdep.h> #include <linux/kmemleak.h> +#include <linux/padata.h> #include <linux/pid_namespace.h> #include <linux/device/driver.h> #include <linux/kthread.h> @@ -73,14 +78,12 @@ #include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/async.h> -#include <linux/sfi.h> #include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/pti.h> #include <linux/blkdev.h> -#include <linux/elevator.h> #include <linux/sched/clock.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> @@ -94,6 +97,11 @@ #include <linux/rodata_test.h> #include <linux/jump_label.h> #include <linux/mem_encrypt.h> +#include <linux/kcsan.h> +#include <linux/init_syscalls.h> +#include <linux/stackdepot.h> +#include <linux/randomize_kstack.h> +#include <net/net_namespace.h> #include <asm/io.h> #include <asm/bugs.h> @@ -104,10 +112,13 @@ #define CREATE_TRACE_POINTS #include <trace/events/initcall.h> +#include <kunit/test.h> + static int kernel_init(void *); extern void init_IRQ(void); extern void radix_tree_init(void); +extern void maple_tree_init(void); /* * Debug helper: via this flag we know that we are in 'early bootup code' @@ -145,14 +156,14 @@ static char *extra_init_args; #ifdef CONFIG_BOOT_CONFIG /* Is bootconfig on command line? */ static bool bootconfig_found; -static bool initargs_found; +static size_t initargs_offs; #else # define bootconfig_found false -# define initargs_found false +# define initargs_offs 0 #endif static char *execute_command; -static char *ramdisk_execute_command; +static char *ramdisk_execute_command = "/init"; /* * Used to generate warnings if static_key manipulation functions are used @@ -257,9 +268,63 @@ static int __init loglevel(char *str) early_param("loglevel", loglevel); +#ifdef CONFIG_BLK_DEV_INITRD +static void * __init get_boot_config_from_initrd(size_t *_size) +{ + u32 size, csum; + char *data; + u32 *hdr; + int i; + + if (!initrd_end) + return NULL; + + data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN; + /* + * Since Grub may align the size of initrd to 4, we must + * check the preceding 3 bytes as well. + */ + for (i = 0; i < 4; i++) { + if (!memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) + goto found; + data--; + } + return NULL; + +found: + hdr = (u32 *)(data - 8); + size = le32_to_cpu(hdr[0]); + csum = le32_to_cpu(hdr[1]); + + data = ((void *)hdr) - size; + if ((unsigned long)data < initrd_start) { + pr_err("bootconfig size %d is greater than initrd size %ld\n", + size, initrd_end - initrd_start); + return NULL; + } + + if (xbc_calc_checksum(data, size) != csum) { + pr_err("bootconfig checksum failed\n"); + return NULL; + } + + /* Remove bootconfig from initramfs/initrd */ + initrd_end = (unsigned long)data; + if (_size) + *_size = size; + + return data; +} +#else +static void * __init get_boot_config_from_initrd(size_t *_size) +{ + return NULL; +} +#endif + #ifdef CONFIG_BOOT_CONFIG -char xbc_namebuf[XBC_KEYLEN_MAX] __initdata; +static char xbc_namebuf[XBC_KEYLEN_MAX] __initdata; #define rest(dst, end) ((end) > (dst) ? (end) - (dst) : 0) @@ -323,108 +388,106 @@ static char * __init xbc_make_cmdline(const char *key) ret = xbc_snprint_cmdline(new_cmdline, len + 1, root); if (ret < 0 || ret > len) { pr_err("Failed to print extra kernel cmdline.\n"); + memblock_free(new_cmdline, len + 1); return NULL; } return new_cmdline; } -static u32 boot_config_checksum(unsigned char *p, u32 size) -{ - u32 ret = 0; - - while (size--) - ret += *p++; - - return ret; -} - static int __init bootconfig_params(char *param, char *val, const char *unused, void *arg) { if (strcmp(param, "bootconfig") == 0) { bootconfig_found = true; - } else if (strcmp(param, "--") == 0) { - initargs_found = true; } return 0; } -static void __init setup_boot_config(const char *cmdline) +static int __init warn_bootconfig(char *str) { - static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; - u32 size, csum; - char *data, *copy; - u32 *hdr; - int ret; - - strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, - bootconfig_params); + /* The 'bootconfig' has been handled by bootconfig_params(). */ + return 0; +} - if (!bootconfig_found) +static void __init setup_boot_config(void) +{ + static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; + const char *msg, *data; + int pos, ret; + size_t size; + char *err; + + /* Cut out the bootconfig data even if we have no bootconfig option */ + data = get_boot_config_from_initrd(&size); + /* If there is no bootconfig in initrd, try embedded one. */ + if (!data) + data = xbc_get_embedded_bootconfig(&size); + + strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL, + bootconfig_params); + + if (IS_ERR(err) || !bootconfig_found) return; - if (!initrd_end) - goto not_found; + /* parse_args() stops at the next param of '--' and returns an address */ + if (err) + initargs_offs = err - tmp_cmdline; - data = (char *)initrd_end - BOOTCONFIG_MAGIC_LEN; - if (memcmp(data, BOOTCONFIG_MAGIC, BOOTCONFIG_MAGIC_LEN)) - goto not_found; - - hdr = (u32 *)(data - 8); - size = hdr[0]; - csum = hdr[1]; - - if (size >= XBC_DATA_MAX) { - pr_err("bootconfig size %d greater than max size %d\n", - size, XBC_DATA_MAX); + if (!data) { + pr_err("'bootconfig' found on command line, but no bootconfig found\n"); return; } - data = ((void *)hdr) - size; - if ((unsigned long)data < initrd_start) - goto not_found; - - if (boot_config_checksum((unsigned char *)data, size) != csum) { - pr_err("bootconfig checksum failed\n"); - return; - } - - copy = memblock_alloc(size + 1, SMP_CACHE_BYTES); - if (!copy) { - pr_err("Failed to allocate memory for bootconfig\n"); + if (size >= XBC_DATA_MAX) { + pr_err("bootconfig size %ld greater than max size %d\n", + (long)size, XBC_DATA_MAX); return; } - memcpy(copy, data, size); - copy[size] = '\0'; - - ret = xbc_init(copy); - if (ret < 0) - pr_err("Failed to parse bootconfig\n"); - else { - pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret); + ret = xbc_init(data, size, &msg, &pos); + if (ret < 0) { + if (pos < 0) + pr_err("Failed to init bootconfig: %s.\n", msg); + else + pr_err("Failed to parse bootconfig: %s at %d.\n", + msg, pos); + } else { + xbc_get_info(&ret, NULL); + pr_info("Load bootconfig: %ld bytes %d nodes\n", (long)size, ret); /* keys starting with "kernel." are passed via cmdline */ extra_command_line = xbc_make_cmdline("kernel"); /* Also, "init." keys are init arguments */ extra_init_args = xbc_make_cmdline("init"); } return; -not_found: - pr_err("'bootconfig' found on command line, but no bootconfig found\n"); } -#else -#define setup_boot_config(cmdline) do { } while (0) + +static void __init exit_boot_config(void) +{ + xbc_exit(); +} + +#else /* !CONFIG_BOOT_CONFIG */ + +static void __init setup_boot_config(void) +{ + /* Remove bootconfig data from initrd */ + get_boot_config_from_initrd(NULL); +} static int __init warn_bootconfig(char *str) { - pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOTCONFIG is not set.\n"); + pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; } -early_param("bootconfig", warn_bootconfig); -#endif +#define exit_boot_config() do {} while (0) + +#endif /* CONFIG_BOOT_CONFIG */ + +early_param("bootconfig", warn_bootconfig); /* Change NUL term back to "=", to make "param" the whole string. */ static void __init repair_env_string(char *param, char *val) @@ -588,16 +651,21 @@ static void __init setup_command_line(char *command_line) * Append supplemental init boot args to saved_command_line * so that user can check what command line options passed * to init. + * The order should always be + * " -- "[bootconfig init-param][cmdline init-param] */ - len = strlen(saved_command_line); - if (initargs_found) { - saved_command_line[len++] = ' '; + if (initargs_offs) { + len = xlen + initargs_offs; + strcpy(saved_command_line + len, extra_init_args); + len += ilen - 4; /* strlen(extra_init_args) */ + strcpy(saved_command_line + len, + boot_command_line + initargs_offs - 1); } else { + len = strlen(saved_command_line); strcpy(saved_command_line + len, " -- "); len += 4; + strcpy(saved_command_line + len, extra_init_args); } - - strcpy(saved_command_line + len, extra_init_args); } } @@ -623,7 +691,7 @@ noinline void __ref rest_init(void) * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - pid = kernel_thread(kernel_init, NULL, CLONE_FS); + pid = user_mode_thread(kernel_init, NULL, CLONE_FS); /* * Pin init on the boot CPU. Task migration is not properly working * until sched_init_smp() has been run. It will set the allowed @@ -631,6 +699,7 @@ noinline void __ref rest_init(void) */ rcu_read_lock(); tsk = find_task_by_pid_ns(pid, &init_pid_ns); + tsk->flags |= PF_NO_SETAFFINITY; set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id())); rcu_read_unlock(); @@ -695,7 +764,7 @@ void __init parse_early_param(void) return; /* All fall through to do_early_param. */ - strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strscpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); parse_early_options(tmp_cmdline); done = 1; } @@ -718,6 +787,8 @@ void __init __weak poking_init(void) { } void __init __weak pgtable_cache_init(void) { } +void __init __weak trap_init(void) { } + bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); @@ -734,14 +805,16 @@ static void __init report_meminit(void) { const char *stack; - if (IS_ENABLED(CONFIG_INIT_STACK_ALL)) - stack = "all"; + if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN)) + stack = "all(pattern)"; + else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO)) + stack = "all(zero)"; else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL)) - stack = "byref_all"; + stack = "byref_all(zero)"; else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF)) - stack = "byref"; + stack = "byref(zero)"; else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER)) - stack = "__user"; + stack = "__user(zero)"; else stack = "off"; @@ -762,27 +835,105 @@ static void __init mm_init(void) * bigger than MAX_ORDER unless SPARSEMEM. */ page_ext_init_flatmem(); - init_debug_pagealloc(); + init_mem_debugging_and_hardening(); + kfence_alloc_pool(); report_meminit(); + kmsan_init_shadow(); + stack_depot_early_init(); mem_init(); + mem_init_print_info(); kmem_cache_init(); + /* + * page_owner must be initialized after buddy is ready, and also after + * slab is ready so that stack_depot_init() works properly + */ + page_ext_init_flatmem_late(); kmemleak_init(); pgtable_init(); debug_objects_mem_init(); vmalloc_init(); - ioremap_huge_init(); + /* Should be run after vmap initialization */ + if (early_page_ext_enabled()) + page_ext_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); /* Should be run after espfix64 is set up. */ pti_init(); + kmsan_init_runtime(); } +#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET +DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, + randomize_kstack_offset); +DEFINE_PER_CPU(u32, kstack_offset); + +static int __init early_randomize_kstack_offset(char *buf) +{ + int ret; + bool bool_result; + + ret = kstrtobool(buf, &bool_result); + if (ret) + return ret; + + if (bool_result) + static_branch_enable(&randomize_kstack_offset); + else + static_branch_disable(&randomize_kstack_offset); + return 0; +} +early_param("randomize_kstack_offset", early_randomize_kstack_offset); +#endif + void __init __weak arch_call_rest_init(void) { rest_init(); } -asmlinkage __visible void __init start_kernel(void) +static void __init print_unknown_bootoptions(void) +{ + char *unknown_options; + char *end; + const char *const *p; + size_t len; + + if (panic_later || (!argv_init[1] && !envp_init[2])) + return; + + /* + * Determine how many options we have to print out, plus a space + * before each + */ + len = 1; /* null terminator */ + for (p = &argv_init[1]; *p; p++) { + len++; + len += strlen(*p); + } + for (p = &envp_init[2]; *p; p++) { + len++; + len += strlen(*p); + } + + unknown_options = memblock_alloc(len, SMP_CACHE_BYTES); + if (!unknown_options) { + pr_err("%s: Failed to allocate %zu bytes\n", + __func__, len); + return; + } + end = unknown_options; + + for (p = &argv_init[1]; *p; p++) + end += sprintf(end, " %s", *p); + for (p = &envp_init[2]; *p; p++) + end += sprintf(end, " %s", *p); + + /* Start at unknown_options[1] to skip the initial space */ + pr_notice("Unknown kernel command line parameters \"%s\", will be passed to user space.\n", + &unknown_options[1]); + memblock_free(unknown_options, len); +} + +asmlinkage __visible void __init __no_sanitize_address start_kernel(void) { char *command_line; char *after_dashes; @@ -790,6 +941,7 @@ asmlinkage __visible void __init start_kernel(void) set_task_stack_end_magic(&init_task); smp_setup_processor_id(); debug_objects_early_init(); + init_vmlinux_build_id(); cgroup_init_early(); @@ -805,7 +957,7 @@ asmlinkage __visible void __init start_kernel(void) pr_notice("%s", linux_banner); early_security_init(); setup_arch(&command_line); - setup_boot_config(command_line); + setup_boot_config(); setup_command_line(command_line); setup_nr_cpu_ids(); setup_per_cpu_areas(); @@ -823,6 +975,7 @@ asmlinkage __visible void __init start_kernel(void) static_command_line, __start___param, __stop___param - __start___param, -1, -1, NULL, &unknown_bootoption); + print_unknown_bootoptions(); if (!IS_ERR_OR_NULL(after_dashes)) parse_args("Setting init args", after_dashes, NULL, 0, -1, -1, NULL, set_init_arg); @@ -830,6 +983,9 @@ asmlinkage __visible void __init start_kernel(void) parse_args("Setting extra init args", extra_init_args, NULL, 0, -1, -1, NULL, set_init_arg); + /* Architectural and non-timekeeping rng init, before allocator init */ + random_init_early(command_line); + /* * These use large bootmem allocations and must precede * kmem_cache_init() @@ -851,15 +1007,12 @@ asmlinkage __visible void __init start_kernel(void) * time - but meanwhile we still have a functioning scheduler. */ sched_init(); - /* - * Disable preemption - early bootup scheduling is extremely - * fragile until we cpu_idle() for the first time. - */ - preempt_disable(); + if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); radix_tree_init(); + maple_tree_init(); /* * Set up housekeeping before setting up workqueues to allow the unbound @@ -889,25 +1042,19 @@ asmlinkage __visible void __init start_kernel(void) tick_init(); rcu_init_nohz(); init_timers(); + srcu_init(); hrtimers_init(); softirq_init(); timekeeping_init(); + time_init(); - /* - * For best initial stack canary entropy, prepare it after: - * - setup_arch() for any UEFI RNG entropy and boot cmdline access - * - timekeeping_init() for ktime entropy used in rand_initialize() - * - rand_initialize() to get any arch-specific entropy like RDRAND - * - add_latent_entropy() to get any latent entropy - * - adding command line entropy - */ - rand_initialize(); - add_latent_entropy(); - add_device_randomness(command_line, strlen(command_line)); + /* This must be after timekeeping is initialized */ + random_init(); + + /* These make use of the fully initialized rng */ + kfence_init(); boot_init_stack_canary(); - time_init(); - printk_safe_init(); perf_event_init(); profile_init(); call_function_init(); @@ -972,10 +1119,10 @@ asmlinkage __visible void __init start_kernel(void) fork_init(); proc_caches_init(); uts_ns_init(); - buffer_init(); key_init(); security_init(); dbg_late_init(); + net_ns_init(); vfs_caches_init(); pagecache_init(); signals_init(); @@ -992,16 +1139,24 @@ asmlinkage __visible void __init start_kernel(void) acpi_subsystem_init(); arch_post_acpi_subsys_init(); - sfi_init_late(); + kcsan_init(); /* Do the rest non-__init'ed, we're now alive */ arch_call_rest_init(); + + prevent_tail_call_optimization(); } /* Call all constructor functions linked into the kernel. */ static void __init do_ctors(void) { -#ifdef CONFIG_CONSTRUCTORS +/* + * For UML, the constructors have already been called by the + * normal setup code as it's just a normal ELF binary, so we + * cannot do it again - but we do need CONFIG_CONSTRUCTORS + * even on UML for modules. + */ +#if defined(CONFIG_CONSTRUCTORS) && !defined(CONFIG_UML) ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; for (; fn < (ctor_fn_t *) __ctors_end; fn++) @@ -1042,7 +1197,7 @@ static int __init initcall_blacklist(char *str) } } while (str_entry); - return 0; + return 1; } static bool __init_or_module initcall_blacklisted(initcall_t fn) @@ -1089,7 +1244,7 @@ __setup("initcall_blacklist=", initcall_blacklist); static __init_or_module void trace_initcall_start_cb(void *data, initcall_t fn) { - ktime_t *calltime = (ktime_t *)data; + ktime_t *calltime = data; printk(KERN_DEBUG "calling %pS @ %i\n", fn, task_pid_nr(current)); *calltime = ktime_get(); @@ -1098,15 +1253,11 @@ trace_initcall_start_cb(void *data, initcall_t fn) static __init_or_module void trace_initcall_finish_cb(void *data, initcall_t fn, int ret) { - ktime_t *calltime = (ktime_t *)data; - ktime_t delta, rettime; - unsigned long long duration; + ktime_t rettime, *calltime = data; rettime = ktime_get(); - delta = ktime_sub(rettime, *calltime); - duration = (unsigned long long) ktime_to_ns(delta) >> 10; printk(KERN_DEBUG "initcall %pS returned %d after %lld usecs\n", - fn, ret, duration); + fn, ret, (unsigned long long)ktime_us_delta(rettime, *calltime)); } static ktime_t initcall_calltime; @@ -1257,7 +1408,6 @@ static void __init do_basic_setup(void) driver_init(); init_irq_proc(); do_ctors(); - usermodehelper_enable(); do_initcalls(); } @@ -1282,9 +1432,7 @@ static int run_init_process(const char *init_filename) pr_debug(" with environment:\n"); for (p = envp_init; *p; p++) pr_debug(" %s\n", *p); - return do_execve(getname_kernel(init_filename), - (const char __user *const __user *)argv_init, - (const char __user *const __user *)envp_init); + return kernel_execve(init_filename, argv_init, envp_init); } static int try_to_run_init_process(const char *init_filename) @@ -1305,11 +1453,25 @@ static noinline void __init kernel_init_freeable(void); #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX) bool rodata_enabled __ro_after_init = true; + +#ifndef arch_parse_debug_rodata +static inline bool arch_parse_debug_rodata(char *str) { return false; } +#endif + static int __init set_debug_rodata(char *str) { - return strtobool(str, &rodata_enabled); + if (arch_parse_debug_rodata(str)) + return 0; + + if (str && !strcmp(str, "on")) + rodata_enabled = true; + else if (str && !strcmp(str, "off")) + rodata_enabled = false; + else + pr_warn("Invalid option string for rodata: '%s'\n", str); + return 0; } -__setup("rodata=", set_debug_rodata); +early_param("rodata", set_debug_rodata); #endif #ifdef CONFIG_STRICT_KERNEL_RWX @@ -1349,10 +1511,20 @@ static int __ref kernel_init(void *unused) { int ret; + /* + * Wait until kthreadd is all set-up. + */ + wait_for_completion(&kthreadd_done); + kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + + system_state = SYSTEM_FREEING_INITMEM; + kprobe_free_init_mem(); ftrace_free_init_mem(); + kgdb_free_init_mem(); + exit_boot_config(); free_initmem(); mark_readonly(); @@ -1367,6 +1539,8 @@ static int __ref kernel_init(void *unused) rcu_end_inkernel_boot(); + do_sysctl_args(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) @@ -1388,6 +1562,16 @@ static int __ref kernel_init(void *unused) panic("Requested init %s failed (error %d).", execute_command, ret); } + + if (CONFIG_DEFAULT_INIT[0] != '\0') { + ret = run_init_process(CONFIG_DEFAULT_INIT); + if (ret) + pr_err("Default init %s failed (error %d)\n", + CONFIG_DEFAULT_INIT, ret); + else + return 0; + } + if (!try_to_run_init_process("/sbin/init") || !try_to_run_init_process("/etc/init") || !try_to_run_init_process("/bin/init") || @@ -1398,24 +1582,23 @@ static int __ref kernel_init(void *unused) "See Linux Documentation/admin-guide/init.rst for guidance."); } -void console_on_rootfs(void) +/* Open /dev/console, for stdin/stdout/stderr, this should never fail */ +void __init console_on_rootfs(void) { - /* Open the /dev/console as stdin, this should never fail */ - if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) - pr_err("Warning: unable to open an initial console.\n"); + struct file *file = filp_open("/dev/console", O_RDWR, 0); - /* create stdout/stderr */ - (void) ksys_dup(0); - (void) ksys_dup(0); + if (IS_ERR(file)) { + pr_err("Warning: unable to open an initial console.\n"); + return; + } + init_dup(file); + init_dup(file); + init_dup(file); + fput(file); } static noinline void __init kernel_init_freeable(void) { - /* - * Wait until kthreadd is all set-up. - */ - wait_for_completion(&kthreadd_done); - /* Now the scheduler is fully set up and can do blocking allocations */ gfp_allowed_mask = __GFP_BITS_MASK; @@ -1424,7 +1607,7 @@ static noinline void __init kernel_init_freeable(void) */ set_mems_allowed(node_states[N_MEMORY]); - cad_pid = task_pid(current); + cad_pid = get_pid(task_pid(current)); smp_prepare_cpus(setup_max_cpus); @@ -1432,30 +1615,31 @@ static noinline void __init kernel_init_freeable(void) init_mm_internals(); + rcu_init_tasks_generic(); do_pre_smp_initcalls(); lockup_detector_init(); smp_init(); sched_init_smp(); + padata_init(); page_alloc_init_late(); /* Initialize page ext after all struct pages are initialized. */ - page_ext_init(); + if (!early_page_ext_enabled()) + page_ext_init(); do_basic_setup(); + kunit_run_all_tests(); + + wait_for_initramfs(); console_on_rootfs(); /* * check if there is an early userspace init. If yes, let it do all * the work */ - - if (!ramdisk_execute_command) - ramdisk_execute_command = "/init"; - - if (ksys_access((const char __user *) - ramdisk_execute_command, 0) != 0) { + if (init_eaccess(ramdisk_execute_command) != 0) { ramdisk_execute_command = NULL; prepare_namespace(); } diff --git a/init/noinitramfs.c b/init/noinitramfs.c index fa9cdfa7101d..d1d26b93d25c 100644 --- a/init/noinitramfs.c +++ b/init/noinitramfs.c @@ -9,6 +9,8 @@ #include <linux/stat.h> #include <linux/kdev_t.h> #include <linux/syscalls.h> +#include <linux/init_syscalls.h> +#include <linux/umh.h> /* * Create a simple rootfs that is similar to the default initramfs @@ -17,17 +19,17 @@ static int __init default_rootfs(void) { int err; - err = ksys_mkdir((const char __user __force *) "/dev", 0755); + usermodehelper_enable(); + err = init_mkdir("/dev", 0755); if (err < 0) goto out; - err = ksys_mknod((const char __user __force *) "/dev/console", - S_IFCHR | S_IRUSR | S_IWUSR, + err = init_mknod("/dev/console", S_IFCHR | S_IRUSR | S_IWUSR, new_encode_dev(MKDEV(5, 1))); if (err < 0) goto out; - err = ksys_mkdir((const char __user __force *) "/root", 0700); + err = init_mkdir("/root", 0700); if (err < 0) goto out; diff --git a/init/version-timestamp.c b/init/version-timestamp.c new file mode 100644 index 000000000000..179e93bae539 --- /dev/null +++ b/init/version-timestamp.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <generated/compile.h> +#include <generated/utsrelease.h> +#include <linux/version.h> +#include <linux/proc_ns.h> +#include <linux/refcount.h> +#include <linux/uts.h> +#include <linux/utsname.h> + +struct uts_namespace init_uts_ns = { + .ns.count = REFCOUNT_INIT(2), + .name = { + .sysname = UTS_SYSNAME, + .nodename = UTS_NODENAME, + .release = UTS_RELEASE, + .version = UTS_VERSION, + .machine = UTS_MACHINE, + .domainname = UTS_DOMAINNAME, + }, + .user_ns = &init_user_ns, + .ns.inum = PROC_UTS_INIT_INO, +#ifdef CONFIG_UTS_NS + .ns.ops = &utsns_operations, +#endif +}; + +/* FIXED STRINGS! Don't touch! */ +const char linux_banner[] = + "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" + LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; diff --git a/init/version.c b/init/version.c index cba341161b58..01d4ab05f0ba 100644 --- a/init/version.c +++ b/init/version.c @@ -9,43 +9,29 @@ #include <generated/compile.h> #include <linux/build-salt.h> +#include <linux/elfnote-lto.h> #include <linux/export.h> +#include <linux/init.h> +#include <linux/printk.h> #include <linux/uts.h> #include <linux/utsname.h> #include <generated/utsrelease.h> -#include <linux/version.h> #include <linux/proc_ns.h> -#ifndef CONFIG_KALLSYMS -#define version(a) Version_ ## a -#define version_string(a) version(a) - -extern int version_string(LINUX_VERSION_CODE); -int version_string(LINUX_VERSION_CODE); -#endif - -struct uts_namespace init_uts_ns = { - .kref = KREF_INIT(2), - .name = { - .sysname = UTS_SYSNAME, - .nodename = UTS_NODENAME, - .release = UTS_RELEASE, - .version = UTS_VERSION, - .machine = UTS_MACHINE, - .domainname = UTS_DOMAINNAME, - }, - .user_ns = &init_user_ns, - .ns.inum = PROC_UTS_INIT_INO, -#ifdef CONFIG_UTS_NS - .ns.ops = &utsns_operations, -#endif -}; -EXPORT_SYMBOL_GPL(init_uts_ns); +static int __init early_hostname(char *arg) +{ + size_t bufsize = sizeof(init_uts_ns.name.nodename); + size_t maxlen = bufsize - 1; + size_t arglen; -/* FIXED STRINGS! Don't touch! */ -const char linux_banner[] = - "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" - LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION "\n"; + arglen = strlcpy(init_uts_ns.name.nodename, arg, bufsize); + if (arglen > maxlen) { + pr_warn("hostname parameter exceeds %zd characters and will be truncated", + maxlen); + } + return 0; +} +early_param("hostname", early_hostname); const char linux_proc_banner[] = "%s version %s" @@ -53,3 +39,17 @@ const char linux_proc_banner[] = " (" LINUX_COMPILER ") %s\n"; BUILD_SALT; +BUILD_LTO_INFO; + +/* + * init_uts_ns and linux_banner contain the build version and timestamp, + * which are really fixed at the very last step of build process. + * They are compiled with __weak first, and without __weak later. + */ + +struct uts_namespace init_uts_ns __weak; +const char linux_banner[] __weak; + +#include "version-timestamp.c" + +EXPORT_SYMBOL_GPL(init_uts_ns); |