From 2a86f6612164a10a3cdb6a499bddf329c4d69f84 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Fri, 28 Feb 2020 12:46:40 +0900 Subject: kbuild: use KBUILD_DEFCONFIG as the fallback for DEFCONFIG_LIST Most of the Kconfig commands (except defconfig and all*config) read the .config file as a base set of CONFIG options. When it does not exist, the files in DEFCONFIG_LIST are searched in this order and loaded if found. I do not see much sense in the last two lines in DEFCONFIG_LIST. [1] ARCH_DEFCONFIG The entry for DEFCONFIG_LIST is guarded by 'depends on !UML'. So, the ARCH_DEFCONFIG definition in arch/x86/um/Kconfig is meaningless. arch/{sh,sparc,x86}/Kconfig define ARCH_DEFCONFIG depending on 32 or 64 bit variant symbols. This is a little bit strange; ARCH_DEFCONFIG should be a fixed string because the base config file is loaded before the symbol evaluation stage. Using KBUILD_DEFCONFIG makes more sense because it is fixed before Kconfig is invoked. Fortunately, arch/{sh,sparc,x86}/Makefile define it in the same way, and it works as expected. Hence, replace ARCH_DEFCONFIG with "arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG)". [2] arch/$(ARCH)/defconfig This file path is no longer valid. The defconfig files are always located in the arch configs/ directories. $ find arch -name defconfig | sort arch/alpha/configs/defconfig arch/arm64/configs/defconfig arch/csky/configs/defconfig arch/nds32/configs/defconfig arch/riscv/configs/defconfig arch/s390/configs/defconfig arch/unicore32/configs/defconfig The path arch/*/configs/defconfig is already covered by "arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG)". So, this file path is not necessary. I moved the default KBUILD_DEFCONFIG to the top Makefile. Otherwise, the 7 architectures listed above would end up with endless loop of syncconfig. Signed-off-by: Masahiro Yamada --- init/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 20a6ac33761c..240c1ed15c69 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -6,8 +6,7 @@ config DEFCONFIG_LIST default "/lib/modules/$(shell,uname -r)/.config" default "/etc/kernel-config" default "/boot/config-$(shell,uname -r)" - default ARCH_DEFCONFIG - default "arch/$(ARCH)/defconfig" + default "arch/$(SRCARCH)/configs/$(KBUILD_DEFCONFIG)" config CC_IS_GCC def_bool $(success,$(CC) --version | head -n 1 | grep -q gcc) -- cgit v1.2.3-59-g8ed1b From 1518c633df78185f5cbd7322b52ef06358febac3 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Fri, 28 Feb 2020 17:20:13 +0000 Subject: kbuild: allow symbol whitelisting with TRIM_UNUSED_KSYMS CONFIG_TRIM_UNUSED_KSYMS currently removes all unused exported symbols from ksymtab. This works really well when using in-tree drivers, but cannot be used in its current form if some of them are out-of-tree. Indeed, even if the list of symbols required by out-of-tree drivers is known at compile time, the only solution today to guarantee these don't get trimmed is to set CONFIG_TRIM_UNUSED_KSYMS=n. This not only wastes space, but also makes it difficult to control the ABI usable by vendor modules in distribution kernels such as Android. Being able to control the kernel ABI surface is particularly useful to ship a unique Generic Kernel Image (GKI) for all vendors, which is a first step in the direction of getting all vendors to contribute their code upstream. As such, attempt to improve the situation by enabling users to specify a symbol 'whitelist' at compile time. Any symbol specified in this whitelist will be kept exported when CONFIG_TRIM_UNUSED_KSYMS is set, even if it has no in-tree user. The whitelist is defined as a simple text file, listing symbols, one per line. Acked-by: Jessica Yu Acked-by: Nicolas Pitre Tested-by: Matthias Maennich Reviewed-by: Matthias Maennich Signed-off-by: Quentin Perret Signed-off-by: Masahiro Yamada --- init/Kconfig | 13 +++++++++++++ scripts/adjust_autoksyms.sh | 12 ++++++++++++ 2 files changed, 25 insertions(+) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 240c1ed15c69..9f2f10cfb097 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2207,6 +2207,19 @@ config TRIM_UNUSED_KSYMS If unsure, or if you need to build out-of-tree modules, say N. +config UNUSED_KSYMS_WHITELIST + string "Whitelist of symbols to keep in ksymtab" + depends on TRIM_UNUSED_KSYMS + help + By default, all unused exported symbols will be un-exported from the + build when TRIM_UNUSED_KSYMS is selected. + + UNUSED_KSYMS_WHITELIST allows to whitelist symbols that must be kept + exported at all times, even in absence of in-tree users. The value to + set here is the path to a text file containing the list of symbols, + one per line. The path can be absolute, or relative to the kernel + source tree. + endif # MODULES config MODULES_TREE_LOOKUP diff --git a/scripts/adjust_autoksyms.sh b/scripts/adjust_autoksyms.sh index a904bf1f5e67..212ba45595d3 100755 --- a/scripts/adjust_autoksyms.sh +++ b/scripts/adjust_autoksyms.sh @@ -38,6 +38,17 @@ esac # We need access to CONFIG_ symbols . include/config/auto.conf +ksym_wl=/dev/null +if [ -n "$CONFIG_UNUSED_KSYMS_WHITELIST" ]; then + # Use 'eval' to expand the whitelist path and check if it is relative + eval ksym_wl="$CONFIG_UNUSED_KSYMS_WHITELIST" + [ "${ksym_wl}" != "${ksym_wl#/}" ] || ksym_wl="$abs_srctree/$ksym_wl" + if [ ! -f "$ksym_wl" ] || [ ! -r "$ksym_wl" ]; then + echo "ERROR: '$ksym_wl' whitelist file not found" >&2 + exit 1 + fi +fi + # Generate a new ksym list file with symbols needed by the current # set of modules. cat > "$new_ksyms_file" << EOT @@ -48,6 +59,7 @@ cat > "$new_ksyms_file" << EOT EOT sed 's/ko$/mod/' modules.order | xargs -n1 sed -n -e '2{s/ /\n/g;/^$/!p;}' -- | +cat - "$ksym_wl" | sort -u | sed -e 's/\(.*\)/#define __KSYM_\1 1/' >> "$new_ksyms_file" -- cgit v1.2.3-59-g8ed1b From 89b74cac7834734d6b2733204c639917d3826083 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 3 Mar 2020 20:24:50 +0900 Subject: tools/bootconfig: Show line and column in parse error Show line and column when we got a parse error in bootconfig tool. Current lib/bootconfig shows the parse error with byte offset, but that is not human readable. This makes xbc_init() not showing error message itself but able to pass the error message and position to caller, so that the caller can decode it and show the error message with line number and columns. With this patch, bootconfig tool shows an error with line:column as below. $ cat samples/bad-dotword.bconf # do not start keyword with . key { .word = 1 } $ ./bootconfig -a samples/bad-dotword.bconf initrd Parse Error: Invalid keyword at 3:3 Link: http://lkml.kernel.org/r/158323469002.10560.4023923847704522760.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/bootconfig.h | 3 ++- init/main.c | 14 ++++++++++---- lib/bootconfig.c | 35 ++++++++++++++++++++++++++--------- tools/bootconfig/main.c | 35 +++++++++++++++++++++++++++++++---- 4 files changed, 69 insertions(+), 18 deletions(-) (limited to 'init') diff --git a/include/linux/bootconfig.h b/include/linux/bootconfig.h index d11e183fcb54..9903088891fa 100644 --- a/include/linux/bootconfig.h +++ b/include/linux/bootconfig.h @@ -216,7 +216,8 @@ static inline int __init xbc_node_compose_key(struct xbc_node *node, } /* XBC node initializer */ -int __init xbc_init(char *buf); +int __init xbc_init(char *buf, const char **emsg, int *epos); + /* XBC cleanup data structures */ void __init xbc_destroy_all(void); diff --git a/init/main.c b/init/main.c index ee4947af823f..e488213857e2 100644 --- a/init/main.c +++ b/init/main.c @@ -353,6 +353,8 @@ static int __init bootconfig_params(char *param, char *val, static void __init setup_boot_config(const char *cmdline) { static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata; + const char *msg; + int pos; u32 size, csum; char *data, *copy; u32 *hdr; @@ -400,10 +402,14 @@ static void __init setup_boot_config(const char *cmdline) memcpy(copy, data, size); copy[size] = '\0'; - ret = xbc_init(copy); - if (ret < 0) - pr_err("Failed to parse bootconfig\n"); - else { + ret = xbc_init(copy, &msg, &pos); + if (ret < 0) { + if (pos < 0) + pr_err("Failed to init bootconfig: %s.\n", msg); + else + pr_err("Failed to parse bootconfig: %s at %d.\n", + msg, pos); + } else { pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret); /* keys starting with "kernel." are passed via cmdline */ extra_command_line = xbc_make_cmdline("kernel"); diff --git a/lib/bootconfig.c b/lib/bootconfig.c index ec3ce7fd299f..912ef4921398 100644 --- a/lib/bootconfig.c +++ b/lib/bootconfig.c @@ -29,12 +29,14 @@ static int xbc_node_num __initdata; static char *xbc_data __initdata; static size_t xbc_data_size __initdata; static struct xbc_node *last_parent __initdata; +static const char *xbc_err_msg __initdata; +static int xbc_err_pos __initdata; static int __init xbc_parse_error(const char *msg, const char *p) { - int pos = p - xbc_data; + xbc_err_msg = msg; + xbc_err_pos = (int)(p - xbc_data); - pr_err("Parse error at pos %d: %s\n", pos, msg); return -EINVAL; } @@ -738,33 +740,44 @@ void __init xbc_destroy_all(void) /** * xbc_init() - Parse given XBC file and build XBC internal tree * @buf: boot config text + * @emsg: A pointer of const char * to store the error message + * @epos: A pointer of int to store the error position * * This parses the boot config text in @buf. @buf must be a * null terminated string and smaller than XBC_DATA_MAX. * Return the number of stored nodes (>0) if succeeded, or -errno * if there is any error. + * In error cases, @emsg will be updated with an error message and + * @epos will be updated with the error position which is the byte offset + * of @buf. If the error is not a parser error, @epos will be -1. */ -int __init xbc_init(char *buf) +int __init xbc_init(char *buf, const char **emsg, int *epos) { char *p, *q; int ret, c; + if (epos) + *epos = -1; + if (xbc_data) { - pr_err("Error: bootconfig is already initialized.\n"); + if (emsg) + *emsg = "Bootconfig is already initialized"; return -EBUSY; } ret = strlen(buf); if (ret > XBC_DATA_MAX - 1 || ret == 0) { - pr_err("Error: Config data is %s.\n", - ret ? "too big" : "empty"); + if (emsg) + *emsg = ret ? "Config data is too big" : + "Config data is empty"; return -ERANGE; } xbc_nodes = memblock_alloc(sizeof(struct xbc_node) * XBC_NODE_MAX, SMP_CACHE_BYTES); if (!xbc_nodes) { - pr_err("Failed to allocate memory for bootconfig nodes.\n"); + if (emsg) + *emsg = "Failed to allocate bootconfig nodes"; return -ENOMEM; } memset(xbc_nodes, 0, sizeof(struct xbc_node) * XBC_NODE_MAX); @@ -814,9 +827,13 @@ int __init xbc_init(char *buf) if (!ret) ret = xbc_verify_tree(); - if (ret < 0) + if (ret < 0) { + if (epos) + *epos = xbc_err_pos; + if (emsg) + *emsg = xbc_err_msg; xbc_destroy_all(); - else + } else ret = xbc_node_num; return ret; diff --git a/tools/bootconfig/main.c b/tools/bootconfig/main.c index a9b97814d1a9..16b9a420e6fd 100644 --- a/tools/bootconfig/main.c +++ b/tools/bootconfig/main.c @@ -130,6 +130,7 @@ int load_xbc_from_initrd(int fd, char **buf) int ret; u32 size = 0, csum = 0, rcsum; char magic[BOOTCONFIG_MAGIC_LEN]; + const char *msg; ret = fstat(fd, &stat); if (ret < 0) @@ -182,10 +183,12 @@ int load_xbc_from_initrd(int fd, char **buf) return -EINVAL; } - ret = xbc_init(*buf); + ret = xbc_init(*buf, &msg, NULL); /* Wrong data */ - if (ret < 0) + if (ret < 0) { + pr_err("parse error: %s.\n", msg); return ret; + } return size; } @@ -244,11 +247,34 @@ int delete_xbc(const char *path) return ret; } +static void show_xbc_error(const char *data, const char *msg, int pos) +{ + int lin = 1, col, i; + + if (pos < 0) { + pr_err("Error: %s.\n", msg); + return; + } + + /* Note that pos starts from 0 but lin and col should start from 1. */ + col = pos + 1; + for (i = 0; i < pos; i++) { + if (data[i] == '\n') { + lin++; + col = pos - i; + } + } + pr_err("Parse Error: %s at %d:%d\n", msg, lin, col); + +} + int apply_xbc(const char *path, const char *xbc_path) { u32 size, csum; char *buf, *data; int ret, fd; + const char *msg; + int pos; ret = load_xbc_file(xbc_path, &buf); if (ret < 0) { @@ -267,11 +293,12 @@ int apply_xbc(const char *path, const char *xbc_path) *(u32 *)(data + size + 4) = csum; /* Check the data format */ - ret = xbc_init(buf); + ret = xbc_init(buf, &msg, &pos); if (ret < 0) { - pr_err("Failed to parse %s: %d\n", xbc_path, ret); + show_xbc_error(data, msg, pos); free(data); free(buf); + return ret; } printf("Apply %s to %s\n", xbc_path, path); -- cgit v1.2.3-59-g8ed1b From 765047932f153265db6ef15be208d6cbfc03dc62 Mon Sep 17 00:00:00 2001 From: Thara Gopinath Date: Fri, 21 Feb 2020 19:52:05 -0500 Subject: sched/pelt: Add support to track thermal pressure Extrapolating on the existing framework to track rt/dl utilization using pelt signals, add a similar mechanism to track thermal pressure. The difference here from rt/dl utilization tracking is that, instead of tracking time spent by a CPU running a RT/DL task through util_avg, the average thermal pressure is tracked through load_avg. This is because thermal pressure signal is weighted time "delta" capacity unlike util_avg which is binary. "delta capacity" here means delta between the actual capacity of a CPU and the decreased capacity a CPU due to a thermal event. In order to track average thermal pressure, a new sched_avg variable avg_thermal is introduced. Function update_thermal_load_avg can be called to do the periodic bookkeeping (accumulate, decay and average) of the thermal pressure. Reviewed-by: Vincent Guittot Signed-off-by: Thara Gopinath Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200222005213.3873-2-thara.gopinath@linaro.org --- include/trace/events/sched.h | 4 ++++ init/Kconfig | 4 ++++ kernel/sched/pelt.c | 31 +++++++++++++++++++++++++++++++ kernel/sched/pelt.h | 31 +++++++++++++++++++++++++++++++ kernel/sched/sched.h | 3 +++ 5 files changed, 73 insertions(+) (limited to 'init') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9c3ebb7c83a5..ed168b0e2c53 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -618,6 +618,10 @@ DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); +DECLARE_TRACE(pelt_thermal_tp, + TP_PROTO(struct rq *rq), + TP_ARGS(rq)); + DECLARE_TRACE(pelt_irq_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); diff --git a/init/Kconfig b/init/Kconfig index 20a6ac33761c..275c848b02c6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -451,6 +451,10 @@ config HAVE_SCHED_AVG_IRQ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING depends on SMP +config SCHED_THERMAL_PRESSURE + bool "Enable periodic averaging of thermal pressure" + depends on SMP + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index c40d57a2a248..b647d04d9c8b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -368,6 +368,37 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +/* + * thermal: + * + * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked + * + * util_avg and runnable_load_avg are not supported and meaningless. + * + * Unlike rt/dl utilization tracking that track time spent by a cpu + * running a rt/dl task through util_avg, the average thermal pressure is + * tracked through load_avg. This is because thermal pressure signal is + * time weighted "delta" capacity unlike util_avg which is binary. + * "delta capacity" = actual capacity - + * capped capacity a cpu due to a thermal event. + */ + +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + if (___update_load_sum(now, &rq->avg_thermal, + capacity, + capacity, + capacity)) { + ___update_load_avg(&rq->avg_thermal, 1); + trace_pelt_thermal_tp(rq); + return 1; + } + + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ /* * irq: diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da065..eb034d9f024d 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -7,6 +7,26 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); +#ifdef CONFIG_SCHED_THERMAL_PRESSURE +int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return READ_ONCE(rq->avg_thermal.load_avg); +} +#else +static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} +#endif + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ int update_irq_load_avg(struct rq *rq, u64 running); #else @@ -158,6 +178,17 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +static inline int +update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +{ + return 0; +} + +static inline u64 thermal_load_avg(struct rq *rq) +{ + return 0; +} + static inline int update_irq_load_avg(struct rq *rq, u64 running) { diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2a0caf394dd4..6c839f829a25 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -960,6 +960,9 @@ struct rq { struct sched_avg avg_dl; #ifdef CONFIG_HAVE_SCHED_AVG_IRQ struct sched_avg avg_irq; +#endif +#ifdef CONFIG_SCHED_THERMAL_PRESSURE + struct sched_avg avg_thermal; #endif u64 idle_stamp; u64 avg_idle; -- cgit v1.2.3-59-g8ed1b From ea3edd4dc23027083fbb4a73b65114d08fe73a76 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Mar 2020 08:25:11 +0100 Subject: block: remove __bdevname There is no good reason for __bdevname to exist. Just open code printing the string in the callers. For three of them the format string can be trivially merged into existing printk statements, and in init/do_mounts.c we can at least do the scnprintf once at the start of the function, and unconditional of CONFIG_BLOCK to make the output for tiny configfs a little more helpful. Acked-by: Theodore Ts'o # for ext4 Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partition-generic.c | 14 -------------- drivers/md/md.c | 4 ++-- fs/ext4/super.c | 6 +++--- fs/reiserfs/journal.c | 5 ++--- include/linux/fs.h | 1 - init/do_mounts.c | 12 ++---------- 6 files changed, 9 insertions(+), 33 deletions(-) (limited to 'init') diff --git a/block/partition-generic.c b/block/partition-generic.c index 564fae77711d..98256e6beabb 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -57,20 +57,6 @@ const char *bio_devname(struct bio *bio, char *buf) } EXPORT_SYMBOL(bio_devname); -/* - * There's very little reason to use this, you should really - * have a struct block_device just about everywhere and use - * bdevname() instead. - */ -const char *__bdevname(dev_t dev, char *buffer) -{ - scnprintf(buffer, BDEVNAME_SIZE, "unknown-block(%u,%u)", - MAJOR(dev), MINOR(dev)); - return buffer; -} - -EXPORT_SYMBOL(__bdevname); - static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 469f551863be..8238fb64f87f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2491,12 +2491,12 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) { int err = 0; struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, shared ? (struct md_rdev *)lock_rdev : rdev); if (IS_ERR(bdev)) { - pr_warn("md: could not open %s.\n", __bdevname(dev, b)); + pr_warn("md: could not open device unknown-block(%u,%u).\n", + MAJOR(dev), MINOR(dev)); return PTR_ERR(bdev); } rdev->bdev = bdev; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0c7c4adb664e..4cbd1cc34dfa 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -927,7 +927,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) { struct block_device *bdev; - char b[BDEVNAME_SIZE]; bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) @@ -935,8 +934,9 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) return bdev; fail: - ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld", - __bdevname(dev, b), PTR_ERR(bdev)); + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); return NULL; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 072156c4f895..5c766330e493 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2599,7 +2599,6 @@ static int journal_init_dev(struct super_block *super, int result; dev_t jdev; fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; - char b[BDEVNAME_SIZE]; result = 0; @@ -2621,8 +2620,8 @@ static int journal_init_dev(struct super_block *super, result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; reiserfs_warning(super, "sh-458", - "cannot init journal device '%s': %i", - __bdevname(jdev, b), result); + "cannot init journal device unknown-block(%u,%u): %i", + MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); diff --git a/include/linux/fs.h b/include/linux/fs.h index 3cd4fe6b845e..561b35e3b95b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2699,7 +2699,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name) #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_MAX 512 -extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); extern void blkdev_show(struct seq_file *,off_t); diff --git a/init/do_mounts.c b/init/do_mounts.c index 0ae9cc22f2ae..29d326b6c29d 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -429,12 +429,10 @@ void __init mount_block_root(char *name, int flags) struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); char *p; -#ifdef CONFIG_BLOCK char b[BDEVNAME_SIZE]; -#else - const char *b = name; -#endif + scnprintf(b, BDEVNAME_SIZE, "unknown-block(%u,%u)", + MAJOR(ROOT_DEV), MINOR(ROOT_DEV)); get_fs_names(fs_names); retry: for (p = fs_names; *p; p += strlen(p)+1) { @@ -451,9 +449,6 @@ retry: * and bad superblock on root device. * and give them a list of the available devices */ -#ifdef CONFIG_BLOCK - __bdevname(ROOT_DEV, b); -#endif printk("VFS: Cannot open root device \"%s\" or %s: error %d\n", root_device_name, b, err); printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); @@ -476,9 +471,6 @@ retry: for (p = fs_names; *p; p += strlen(p)+1) printk(" %s", p); printk("\n"); -#ifdef CONFIG_BLOCK - __bdevname(ROOT_DEV, b); -#endif panic("VFS: Unable to mount root fs on %s", b); out: put_page(page); -- cgit v1.2.3-59-g8ed1b From eea9673250db4e854e9998ef9da6d4584857f0ea Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:03:36 -0500 Subject: exec: Add exec_update_mutex to replace cred_guard_mutex The cred_guard_mutex is problematic as it is held over possibly indefinite waits for userspace. The possible indefinite waits for userspace that I have identified are: The cred_guard_mutex is held in PTRACE_EVENT_EXIT waiting for the tracer. The cred_guard_mutex is held over "put_user(0, tsk->clear_child_tid)" in exit_mm(). The cred_guard_mutex is held over "get_user(futex_offset, ...") in exit_robust_list. The cred_guard_mutex held over copy_strings. The functions get_user and put_user can trigger a page fault which can potentially wait indefinitely in the case of userfaultfd or if userspace implements part of the page fault path. In any of those cases the userspace process that the kernel is waiting for might make a different system call that winds up taking the cred_guard_mutex and result in deadlock. Holding a mutex over any of those possibly indefinite waits for userspace does not appear necessary. Add exec_update_mutex that will just cover updating the process during exec where the permissions and the objects pointed to by the task struct may be out of sync. The plan is to switch the users of cred_guard_mutex to exec_update_mutex one by one. This lets us move forward while still being careful and not introducing any regressions. Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/ Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/ Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/ Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/ Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/ Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.") Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2") Reviewed-by: Kirill Tkhai Signed-off-by: "Eric W. Biederman" Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 22 +++++++++++++++++++--- include/linux/binfmts.h | 8 +++++++- include/linux/sched/signal.h | 9 ++++++++- init/init_task.c | 1 + kernel/fork.c | 1 + 5 files changed, 36 insertions(+), 5 deletions(-) (limited to 'init') diff --git a/fs/exec.c b/fs/exec.c index d820a7272a76..0e46ec57fe0a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1010,16 +1010,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) } EXPORT_SYMBOL(read_code); +/* + * Maps the mm_struct mm into the current task struct. + * On success, this function returns with the mutex + * exec_update_mutex locked. + */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + if (ret) + return ret; + if (old_mm) { sync_mm_rss(old_mm); /* @@ -1031,9 +1041,11 @@ static int exec_mmap(struct mm_struct *mm) down_read(&old_mm->mmap_sem); if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); + mutex_unlock(&tsk->signal->exec_update_mutex); return -EINTR; } } + task_lock(tsk); active_mm = tsk->active_mm; membarrier_exec_mmap(mm); @@ -1288,11 +1300,12 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; /* - * After clearing bprm->mm (to mark that current is using the - * prepared mm now), we have nothing left of the original + * After setting bprm->called_exec_mmap (to mark that current is + * using the prepared mm now), we have nothing left of the original * process. If anything from here on returns an error, the check * in search_binary_handler() will SEGV current. */ + bprm->called_exec_mmap = 1; bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS @@ -1438,6 +1451,8 @@ static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { + if (bprm->called_exec_mmap) + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1487,6 +1502,7 @@ void install_exec_creds(struct linux_binprm *bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1678,7 +1694,7 @@ int search_binary_handler(struct linux_binprm *bprm) read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval < 0 && !bprm->mm) { + if (retval < 0 && bprm->called_exec_mmap) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); force_sigsegv(SIGSEGV); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b40fc633f3be..a345d9fed3d8 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -44,7 +44,13 @@ struct linux_binprm { * exec has happened. Used to sanitize execution environment * and to set AT_SECURE auxv for glibc. */ - secureexec:1; + secureexec:1, + /* + * Set by flush_old_exec, when exec_mmap has been called. + * This is past the point of no return, when the + * exec_update_mutex has been taken. + */ + called_exec_mmap:1; #ifdef __alpha__ unsigned int taso:1; #endif diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 88050259c466..a29df79540ce 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -224,7 +224,14 @@ struct signal_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations - * (notably. ptrace) */ + * (notably. ptrace) + * Deprecated do not use in new code. + * Use exec_update_mutex instead. + */ + struct mutex exec_update_mutex; /* Held while task_struct is being + * updated during exec, and may have + * inconsistent permissions. + */ } __randomize_layout; /* diff --git a/init/init_task.c b/init/init_task.c index 9e5cbe5eab7b..bd403ed3e418 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -26,6 +26,7 @@ static struct signal_struct init_signals = { .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), + .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), #ifdef CONFIG_POSIX_TIMERS .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), .cputimer = { diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..12896a6ecee6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); + mutex_init(&sig->exec_update_mutex); return 0; } -- cgit v1.2.3-59-g8ed1b From 6546b19f95acc986807de981402bbac6b3a94b0f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 25 Mar 2020 21:45:29 +0900 Subject: perf/core: Add PERF_SAMPLE_CGROUP feature The PERF_SAMPLE_CGROUP bit is to save (perf_event) cgroup information in the sample. It will add a 64-bit id to identify current cgroup and it's the file handle in the cgroup file system. Userspace should use this information with PERF_RECORD_CGROUP event to match which cgroup it belongs. I put it before PERF_SAMPLE_AUX for simplicity since it just needs a 64-bit word. But if we want bigger samples, I can work on that direction too. Committer testing: $ pahole perf_sample_data | grep -w cgroup -B5 -A5 /* --- cacheline 4 boundary (256 bytes) was 56 bytes ago --- */ struct perf_regs regs_intr; /* 312 16 */ /* --- cacheline 5 boundary (320 bytes) was 8 bytes ago --- */ u64 stack_user_size; /* 328 8 */ u64 phys_addr; /* 336 8 */ u64 cgroup; /* 344 8 */ /* size: 384, cachelines: 6, members: 22 */ /* padding: 32 */ }; $ Signed-off-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Johannes Weiner Cc: Mark Rutland Cc: Zefan Li Link: http://lore.kernel.org/lkml/20200325124536.2800725-3-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 3 ++- init/Kconfig | 3 ++- kernel/events/core.c | 22 ++++++++++++++++++++++ 4 files changed, 27 insertions(+), 2 deletions(-) (limited to 'init') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8768a39b5258..9c3e7619c929 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1020,6 +1020,7 @@ struct perf_sample_data { u64 stack_user_size; u64 phys_addr; + u64 cgroup; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index de95f6c7b273..7b2d6fc9e6ed 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -142,8 +142,9 @@ enum perf_event_sample_format { PERF_SAMPLE_REGS_INTR = 1U << 18, PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, - PERF_SAMPLE_MAX = 1U << 21, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; diff --git a/init/Kconfig b/init/Kconfig index 20a6ac33761c..7766b06a0038 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1027,7 +1027,8 @@ config CGROUP_PERF help This option extends the perf per-cpu mode to restrict monitoring to threads which belong to the cgroup specified and run on the - designated cpu. + designated cpu. Or this can be used to have cgroup ID in samples + so that it can monitor performance events among cgroups. Say N if unsure. diff --git a/kernel/events/core.c b/kernel/events/core.c index 994932d5e474..1569979c8912 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1862,6 +1862,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PHYS_ADDR) size += sizeof(data->phys_addr); + if (sample_type & PERF_SAMPLE_CGROUP) + size += sizeof(data->cgroup); + event->header_size = size; } @@ -6867,6 +6870,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_PHYS_ADDR) perf_output_put(handle, data->phys_addr); + if (sample_type & PERF_SAMPLE_CGROUP) + perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -7066,6 +7072,16 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); +#ifdef CONFIG_CGROUP_PERF + if (sample_type & PERF_SAMPLE_CGROUP) { + struct cgroup *cgrp; + + /* protected by RCU */ + cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup; + data->cgroup = cgroup_id(cgrp); + } +#endif + if (sample_type & PERF_SAMPLE_AUX) { u64 size; @@ -11264,6 +11280,12 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_REGS_INTR) ret = perf_reg_validate(attr->sample_regs_intr); + +#ifndef CONFIG_CGROUP_PERF + if (attr->sample_type & PERF_SAMPLE_CGROUP) + return -EINVAL; +#endif + out: return ret; -- cgit v1.2.3-59-g8ed1b From fc611f47f2188ade2b48ff6902d5cce8baac0c58 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Sun, 29 Mar 2020 01:43:49 +0100 Subject: bpf: Introduce BPF_PROG_TYPE_LSM Introduce types and configs for bpf programs that can be attached to LSM hooks. The programs can be enabled by the config option CONFIG_BPF_LSM. Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Reviewed-by: Brendan Jackman Reviewed-by: Florent Revest Reviewed-by: Thomas Garnier Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Acked-by: James Morris Link: https://lore.kernel.org/bpf/20200329004356.27286-2-kpsingh@chromium.org --- MAINTAINERS | 1 + include/linux/bpf.h | 3 +++ include/linux/bpf_types.h | 4 ++++ include/uapi/linux/bpf.h | 2 ++ init/Kconfig | 12 ++++++++++++ kernel/bpf/Makefile | 1 + kernel/bpf/bpf_lsm.c | 17 +++++++++++++++++ kernel/trace/bpf_trace.c | 12 ++++++------ tools/include/uapi/linux/bpf.h | 2 ++ tools/lib/bpf/libbpf_probes.c | 1 + 10 files changed, 49 insertions(+), 6 deletions(-) create mode 100644 kernel/bpf/bpf_lsm.c (limited to 'init') diff --git a/MAINTAINERS b/MAINTAINERS index 5dbee41045bc..3197fe9256b2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3147,6 +3147,7 @@ R: Martin KaFai Lau R: Song Liu R: Yonghong Song R: Andrii Nakryiko +R: KP Singh L: netdev@vger.kernel.org L: bpf@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 372708eeaecd..3bde59a8453b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1515,6 +1515,9 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +const struct bpf_func_proto *bpf_tracing_func_proto( + enum bpf_func_id func_id, const struct bpf_prog *prog); + /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c81d4ece79a4..ba0c2d56f8a3 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -70,6 +70,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_STRUCT_OPS, bpf_struct_ops, void *, void *) BPF_PROG_TYPE(BPF_PROG_TYPE_EXT, bpf_extension, void *, void *) +#ifdef CONFIG_BPF_LSM +BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm, + void *, void *) +#endif /* CONFIG_BPF_LSM */ #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 222ba11966e3..f1fbc36f58d3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -181,6 +181,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACING, BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, }; enum bpf_attach_type { @@ -211,6 +212,7 @@ enum bpf_attach_type { BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, + BPF_LSM_MAC, __MAX_BPF_ATTACH_TYPE }; diff --git a/init/Kconfig b/init/Kconfig index 20a6ac33761c..deae572d1927 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1616,6 +1616,18 @@ config KALLSYMS_BASE_RELATIVE # end of the "standard kernel features (expert users)" menu # syscall, maps, verifier + +config BPF_LSM + bool "LSM Instrumentation with BPF" + depends on BPF_SYSCALL + depends on SECURITY + depends on BPF_JIT + help + Enables instrumentation of the security hooks with eBPF programs for + implementing dynamic MAC and Audit Policies. + + If you are unsure how to answer this question, answer N. + config BPF_SYSCALL bool "Enable bpf() system call" select BPF diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 046ce5d98033..f2d7be596966 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -29,4 +29,5 @@ obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o endif ifeq ($(CONFIG_BPF_JIT),y) obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o +obj-${CONFIG_BPF_LSM} += bpf_lsm.o endif diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c new file mode 100644 index 000000000000..82875039ca90 --- /dev/null +++ b/kernel/bpf/bpf_lsm.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2020 Google LLC. + */ + +#include +#include +#include + +const struct bpf_prog_ops lsm_prog_ops = { +}; + +const struct bpf_verifier_ops lsm_verifier_ops = { + .get_func_proto = bpf_tracing_func_proto, + .is_valid_access = btf_ctx_access, +}; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e619eedb5919..37ffceab608f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -779,8 +779,8 @@ static const struct bpf_func_proto bpf_send_signal_thread_proto = { .arg1_type = ARG_ANYTHING, }; -static const struct bpf_func_proto * -tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +const struct bpf_func_proto * +bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -865,7 +865,7 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -975,7 +975,7 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1082,7 +1082,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_read_branch_records: return &bpf_read_branch_records_proto; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } @@ -1210,7 +1210,7 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_stack: return &bpf_get_stack_proto_raw_tp; default: - return tracing_func_proto(func_id, prog); + return bpf_tracing_func_proto(func_id, prog); } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 222ba11966e3..f1fbc36f58d3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -181,6 +181,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_TRACING, BPF_PROG_TYPE_STRUCT_OPS, BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, }; enum bpf_attach_type { @@ -211,6 +212,7 @@ enum bpf_attach_type { BPF_TRACE_FENTRY, BPF_TRACE_FEXIT, BPF_MODIFY_RETURN, + BPF_LSM_MAC, __MAX_BPF_ATTACH_TYPE }; diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index b782ebef6ac9..2c92059c0c90 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -108,6 +108,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: + case BPF_PROG_TYPE_LSM: default: break; } -- cgit v1.2.3-59-g8ed1b From 4edf16b72c57bb0faad5da143f812384df0c43f6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Mon, 30 Mar 2020 22:40:59 +0200 Subject: bpf, lsm: Make BPF_LSM depend on BPF_EVENTS LSM and tracing programs share their helpers with bpf_tracing_func_proto which is only defined (in bpf_trace.c) when BPF_EVENTS is enabled. Instead of adding __weak symbol, make BPF_LSM depend on BPF_EVENTS so that both tracing and LSM programs can actually share helpers. Fixes: fc611f47f218 ("bpf: Introduce BPF_PROG_TYPE_LSM") Reported-by: Randy Dunlap Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200330204059.13024-1-kpsingh@chromium.org --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index deae572d1927..7b7ea70e64ac 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1619,6 +1619,7 @@ config KALLSYMS_BASE_RELATIVE config BPF_LSM bool "LSM Instrumentation with BPF" + depends on BPF_EVENTS depends on BPF_SYSCALL depends on SECURITY depends on BPF_JIT -- cgit v1.2.3-59-g8ed1b From 9553d16fa671b9621c5e2847d08bd90d3be3349c Mon Sep 17 00:00:00 2001 From: Amit Daniel Kachhap Date: Mon, 30 Mar 2020 17:11:38 +0530 Subject: init/kconfig: Add LD_VERSION Kconfig This option can be used in Kconfig files to compare the ld version and enable/disable incompatible config options if required. This option is used in the subsequent patch along with GCC_VERSION to filter out an incompatible feature. Signed-off-by: Amit Daniel Kachhap Signed-off-by: Catalin Marinas --- init/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 452bc1835cd4..68ddbcd974c7 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -17,6 +17,10 @@ config GCC_VERSION default $(shell,$(srctree)/scripts/gcc-version.sh $(CC)) if CC_IS_GCC default 0 +config LD_VERSION + int + default $(shell,$(LD) --version | $(srctree)/scripts/ld-version.sh) + config CC_IS_CLANG def_bool $(success,$(CC) --version | head -n 1 | grep -q clang) -- cgit v1.2.3-59-g8ed1b From 5a281062af1d43d3f3956a6b429c2d727bc92603 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 6 Apr 2020 20:05:33 -0700 Subject: userfaultfd: wp: add WP pagetable tracking to x86 Accurate userfaultfd WP tracking is possible by tracking exactly which virtual memory ranges were writeprotected by userland. We can't relay only on the RW bit of the mapped pagetable because that information is destroyed by fork() or KSM or swap. If we were to relay on that, we'd need to stay on the safe side and generate false positive wp faults for every swapped out page. [peterx@redhat.com: append _PAGE_UFD_WP to _PAGE_CHG_MASK] Signed-off-by: Andrea Arcangeli Signed-off-by: Peter Xu Signed-off-by: Andrew Morton Reviewed-by: Jerome Glisse Reviewed-by: Mike Rapoport Cc: Bobby Powers Cc: Brian Geffon Cc: David Hildenbrand Cc: Denis Plotnikov Cc: "Dr . David Alan Gilbert" Cc: Hugh Dickins Cc: Johannes Weiner Cc: "Kirill A . Shutemov" Cc: Martin Cracauer Cc: Marty McFadden Cc: Maya Gokhale Cc: Mel Gorman Cc: Mike Kravetz Cc: Pavel Emelyanov Cc: Rik van Riel Cc: Shaohua Li Link: http://lkml.kernel.org/r/20200220163112.11409-4-peterx@redhat.com Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable.h | 52 ++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 8 +++++- arch/x86/include/asm/pgtable_types.h | 12 ++++++++- include/asm-generic/pgtable.h | 1 + include/asm-generic/pgtable_uffd.h | 51 +++++++++++++++++++++++++++++++++++ init/Kconfig | 5 ++++ 7 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 include/asm-generic/pgtable_uffd.h (limited to 'init') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1edf788d301c..8d078642b4be 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -149,6 +149,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64 + select HAVE_ARCH_USERFAULTFD_WP if USERFAULTFD select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_ASM_MODVERSIONS diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index afda66a6d325..c37e1649fb7e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -25,6 +25,7 @@ #include #include #include +#include extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -313,6 +314,23 @@ static inline pte_t pte_clear_flags(pte_t pte, pteval_t clear) return native_make_pte(v & ~clear); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pte_uffd_wp(pte_t pte) +{ + return pte_flags(pte) & _PAGE_UFFD_WP; +} + +static inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_UFFD_WP); +} + +static inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + static inline pte_t pte_mkclean(pte_t pte) { return pte_clear_flags(pte, _PAGE_DIRTY); @@ -392,6 +410,23 @@ static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) return native_make_pmd(v & ~clear); } +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline int pmd_uffd_wp(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_UFFD_WP; +} + +static inline pmd_t pmd_mkuffd_wp(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_UFFD_WP); +} + +static inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + static inline pmd_t pmd_mkold(pmd_t pmd) { return pmd_clear_flags(pmd, _PAGE_ACCESSED); @@ -1374,6 +1409,23 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #endif #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return pte_set_flags(pte, _PAGE_SWP_UFFD_WP); +} + +static inline int pte_swp_uffd_wp(pte_t pte) +{ + return pte_flags(pte) & _PAGE_SWP_UFFD_WP; +} + +static inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return pte_clear_flags(pte, _PAGE_SWP_UFFD_WP); +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + #define PKRU_AD_BIT 0x1 #define PKRU_WD_BIT 0x2 #define PKRU_BITS_PER_PKEY 2 diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 0b6c4042942a..df1373415f11 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -189,7 +189,7 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * * | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names - * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|X|SD|0| <- swp entry + * | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry * * G (8) is aliased and used as a PROT_NONE indicator for * !present ptes. We need to start storing swap entries above @@ -197,9 +197,15 @@ extern void sync_global_pgds(unsigned long start, unsigned long end); * erratum where they can be incorrectly set by hardware on * non-present PTEs. * + * SD Bits 1-4 are not used in non-present format and available for + * special use described below: + * * SD (1) in swp entry is used to store soft dirty bit, which helps us * remember soft dirty over page migration * + * F (2) in swp entry is used to record when a pagetable is + * writeprotected by userfaultfd WP support. + * * Bit 7 in swp entry should be 0 because pmd_present checks not only P, * but also L and G. * diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 65c2ecd730c5..b6606fe6cfdf 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -32,6 +32,7 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 @@ -100,6 +101,14 @@ #define _PAGE_SWP_SOFT_DIRTY (_AT(pteval_t, 0)) #endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP +#define _PAGE_UFFD_WP (_AT(pteval_t, 1) << _PAGE_BIT_UFFD_WP) +#define _PAGE_SWP_UFFD_WP _PAGE_USER +#else +#define _PAGE_UFFD_WP (_AT(pteval_t, 0)) +#define _PAGE_SWP_UFFD_WP (_AT(pteval_t, 0)) +#endif + #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) @@ -118,7 +127,8 @@ */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC) + _PAGE_SOFT_DIRTY | _PAGE_DEVMAP | _PAGE_ENC | \ + _PAGE_UFFD_WP) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) /* diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index e2e2bef07dd2..329b8c8ca703 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -10,6 +10,7 @@ #include #include #include +#include #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS diff --git a/include/asm-generic/pgtable_uffd.h b/include/asm-generic/pgtable_uffd.h new file mode 100644 index 000000000000..643d1bf559c2 --- /dev/null +++ b/include/asm-generic/pgtable_uffd.h @@ -0,0 +1,51 @@ +#ifndef _ASM_GENERIC_PGTABLE_UFFD_H +#define _ASM_GENERIC_PGTABLE_UFFD_H + +#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP +static __always_inline int pte_uffd_wp(pte_t pte) +{ + return 0; +} + +static __always_inline int pmd_uffd_wp(pmd_t pmd) +{ + return 0; +} + +static __always_inline pte_t pte_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline pmd_t pmd_mkuffd_wp(pmd_t pmd) +{ + return pmd; +} + +static __always_inline pte_t pte_clear_uffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline pmd_t pmd_clear_uffd_wp(pmd_t pmd) +{ + return pmd; +} + +static __always_inline pte_t pte_swp_mkuffd_wp(pte_t pte) +{ + return pte; +} + +static __always_inline int pte_swp_uffd_wp(pte_t pte) +{ + return 0; +} + +static __always_inline pte_t pte_swp_clear_uffd_wp(pte_t pte) +{ + return pte; +} +#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ + +#endif /* _ASM_GENERIC_PGTABLE_UFFD_H */ diff --git a/init/Kconfig b/init/Kconfig index 1c12059e0f7e..2bc0e47689d8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1556,6 +1556,11 @@ config ADVISE_SYSCALLS applications use these syscalls, you can disable this option to save space. +config HAVE_ARCH_USERFAULTFD_WP + bool + help + Arch has userfaultfd write protection support + config MEMBARRIER bool "Enable membarrier() system call" if EXPERT default y -- cgit v1.2.3-59-g8ed1b From 7baf219982281034f8c3a806869873bf098fb611 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 6 Apr 2020 20:12:02 -0700 Subject: init/Kconfig: clean up ANON_INODES and old IO schedulers options CONFIG_ANON_INODES is gone since commit 5dd50aaeb185 ("Make anon_inodes unconditional"). CONFIG_CFQ_GROUP_IOSCHED was replaced with CONFIG_BFQ_GROUP_IOSCHED in commit f382fb0bcef4 ("block: remove legacy IO schedulers"). Signed-off-by: Krzysztof Kozlowski Signed-off-by: Andrew Morton Cc: Masahiro Yamada Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Link: http://lkml.kernel.org/r/20200130192419.3026-1-krzk@kernel.org Signed-off-by: Linus Torvalds --- init/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'init') diff --git a/init/Kconfig b/init/Kconfig index 2bc0e47689d8..574b7215aa6a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -872,7 +872,7 @@ config BLK_CGROUP This option only enables generic Block IO controller infrastructure. One needs to also enable actual IO controlling logic/policy. For enabling proportional weight division of disk bandwidth in CFQ, set - CONFIG_CFQ_GROUP_IOSCHED=y; for enabling throttling policy, set + CONFIG_BFQ_GROUP_IOSCHED=y; for enabling throttling policy, set CONFIG_BLK_DEV_THROTTLING=y. See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. @@ -1538,7 +1538,6 @@ config AIO config IO_URING bool "Enable IO uring support" if EXPERT - select ANON_INODES select IO_WQ default y help -- cgit v1.2.3-59-g8ed1b From 4dcc9a88448a65a1e855228917cfbb92ac4b4f45 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Apr 2020 01:18:37 -0700 Subject: kbuild: mkcompile_h: Include $LD version in /proc/version When doing Clang builds of the kernel, it is possible to link with either ld.bfd (binutils) or ld.lld (LLVM), but it is not possible to discover this from a running kernel. Add the "$LD -v" output to /proc/version. Signed-off-by: Kees Cook Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Reviewed-by: Fangrui Song Reviewed-by: Sedat Dilek Tested-by: Sedat Dilek Signed-off-by: Masahiro Yamada --- init/Makefile | 2 +- scripts/mkcompile_h | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'init') diff --git a/init/Makefile b/init/Makefile index 6246a06364d0..30aa8ab11120 100644 --- a/init/Makefile +++ b/init/Makefile @@ -35,4 +35,4 @@ include/generated/compile.h: FORCE @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \ - "$(CONFIG_PREEMPT_RT)" "$(CC) $(KBUILD_CFLAGS)" + "$(CONFIG_PREEMPT_RT)" "$(CC) $(KBUILD_CFLAGS)" "$(LD)" diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h index 3ff26e5b2eac..5b80a4699740 100755 --- a/scripts/mkcompile_h +++ b/scripts/mkcompile_h @@ -7,6 +7,7 @@ SMP=$3 PREEMPT=$4 PREEMPT_RT=$5 CC=$6 +LD=$7 vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; } @@ -61,7 +62,10 @@ UTS_VERSION="$(echo $UTS_VERSION $CONFIG_FLAGS $TIMESTAMP | cut -b -$UTS_LEN)" printf '#define LINUX_COMPILE_BY "%s"\n' "$LINUX_COMPILE_BY" echo \#define LINUX_COMPILE_HOST \"$LINUX_COMPILE_HOST\" - echo \#define LINUX_COMPILER \"`$CC -v 2>&1 | grep ' version ' | sed 's/[[:space:]]*$//'`\" + CC_VERSION=$($CC -v 2>&1 | grep ' version ' | sed 's/[[:space:]]*$//') + LD_VERSION=$($LD -v | head -n1 | sed 's/(compatible with [^)]*)//' \ + | sed 's/[[:space:]]*$//') + printf '#define LINUX_COMPILER "%s"\n' "$CC_VERSION, $LD_VERSION" } > .tmpcompile # Only replace the real compile.h if the new one is different, -- cgit v1.2.3-59-g8ed1b From 01a6126b5f7efdf75480a2b970377f5724cb885a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 4 Apr 2020 06:24:59 +0900 Subject: kbuild: do not pass $(KBUILD_CFLAGS) to scripts/mkcompile_h scripts/mkcompile_h uses $(CC) only for getting the version string. I suspected there was a specific reason why the additional flags were needed, and dug the commit history. This code dates back to at least 2002 [1], but I could not get any more clue. Just get rid of it. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/history/history.git/commit/?id=29f3df7eba8ddf91a55183f9967f76fbcc3ab742 Signed-off-by: Masahiro Yamada Reviewed-by: Kees Cook --- init/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'init') diff --git a/init/Makefile b/init/Makefile index 30aa8ab11120..d45e967483b2 100644 --- a/init/Makefile +++ b/init/Makefile @@ -35,4 +35,4 @@ include/generated/compile.h: FORCE @$($(quiet)chk_compile.h) $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \ - "$(CONFIG_PREEMPT_RT)" "$(CC) $(KBUILD_CFLAGS)" "$(LD)" + "$(CONFIG_PREEMPT_RT)" "$(CC)" "$(LD)" -- cgit v1.2.3-59-g8ed1b From ab6f762f0f53162d41497708b33c9a3236d3609e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 3 Mar 2020 20:30:02 +0900 Subject: printk: queue wake_up_klogd irq_work only if per-CPU areas are ready printk_deferred(), similarly to printk_safe/printk_nmi, does not immediately attempt to print a new message on the consoles, avoiding calls into non-reentrant kernel paths, e.g. scheduler or timekeeping, which potentially can deadlock the system. Those printk() flavors, instead, rely on per-CPU flush irq_work to print messages from safer contexts. For same reasons (recursive scheduler or timekeeping calls) printk() uses per-CPU irq_work in order to wake up user space syslog/kmsg readers. However, only printk_safe/printk_nmi do make sure that per-CPU areas have been initialised and that it's safe to modify per-CPU irq_work. This means that, for instance, should printk_deferred() be invoked "too early", that is before per-CPU areas are initialised, printk_deferred() will perform illegal per-CPU access. Lech Perczak [0] reports that after commit 1b710b1b10ef ("char/random: silence a lockdep splat with printk()") user-space syslog/kmsg readers are not able to read new kernel messages. The reason is printk_deferred() being called too early (as was pointed out by Petr and John). Fix printk_deferred() and do not queue per-CPU irq_work before per-CPU areas are initialized. Link: https://lore.kernel.org/lkml/aa0732c6-5c4e-8a8b-a1c1-75ebe3dca05b@camlintechnologies.com/ Reported-by: Lech Perczak Signed-off-by: Sergey Senozhatsky Tested-by: Jann Horn Reviewed-by: Petr Mladek Cc: Greg Kroah-Hartman Cc: Theodore Ts'o Cc: John Ogness Signed-off-by: Linus Torvalds --- include/linux/printk.h | 5 ----- init/main.c | 1 - kernel/printk/internal.h | 5 +++++ kernel/printk/printk.c | 34 ++++++++++++++++++++++++++++++++++ kernel/printk/printk_safe.c | 11 +---------- 5 files changed, 40 insertions(+), 16 deletions(-) (limited to 'init') diff --git a/include/linux/printk.h b/include/linux/printk.h index 1e6108b8d15f..e061635e0409 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -202,7 +202,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack(void) __cold; -extern void printk_safe_init(void); extern void printk_safe_flush(void); extern void printk_safe_flush_on_panic(void); #else @@ -269,10 +268,6 @@ static inline void dump_stack(void) { } -static inline void printk_safe_init(void) -{ -} - static inline void printk_safe_flush(void) { } diff --git a/init/main.c b/init/main.c index e488213857e2..a48617f2e5e5 100644 --- a/init/main.c +++ b/init/main.c @@ -913,7 +913,6 @@ asmlinkage __visible void __init start_kernel(void) boot_init_stack_canary(); time_init(); - printk_safe_init(); perf_event_init(); profile_init(); call_function_init(); diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index c8e6ab689d42..b2b0f526f249 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -23,6 +23,9 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args); void __printk_safe_enter(void); void __printk_safe_exit(void); +void printk_safe_init(void); +bool printk_percpu_data_ready(void); + #define printk_safe_enter_irqsave(flags) \ do { \ local_irq_save(flags); \ @@ -64,4 +67,6 @@ __printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } #define printk_safe_enter_irq() local_irq_disable() #define printk_safe_exit_irq() local_irq_enable() +static inline void printk_safe_init(void) { } +static inline bool printk_percpu_data_ready(void) { return false; } #endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 633f41a11d75..9a9b6156270b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -460,6 +460,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; +/* + * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before + * per_cpu_areas are initialised. This variable is set to true when + * it's safe to access per-CPU data. + */ +static bool __printk_percpu_data_ready __read_mostly; + +bool printk_percpu_data_ready(void) +{ + return __printk_percpu_data_ready; +} + /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -1146,12 +1158,28 @@ static void __init log_buf_add_cpu(void) static inline void log_buf_add_cpu(void) {} #endif /* CONFIG_SMP */ +static void __init set_percpu_data_ready(void) +{ + printk_safe_init(); + /* Make sure we set this flag only after printk_safe() init is done */ + barrier(); + __printk_percpu_data_ready = true; +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; unsigned int free; + /* + * Some archs call setup_log_buf() multiple times - first is very + * early, e.g. from setup_arch(), and second - when percpu_areas + * are initialised. + */ + if (!early) + set_percpu_data_ready(); + if (log_buf != __log_buf) return; @@ -2975,6 +3003,9 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { void wake_up_klogd(void) { + if (!printk_percpu_data_ready()) + return; + preempt_disable(); if (waitqueue_active(&log_wait)) { this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); @@ -2985,6 +3016,9 @@ void wake_up_klogd(void) void defer_console_output(void) { + if (!printk_percpu_data_ready()) + return; + preempt_disable(); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index b4045e782743..d9a659a686f3 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -27,7 +27,6 @@ * There are situations when we want to make sure that all buffers * were handled or when IRQs are blocked. */ -static int printk_safe_irq_ready __read_mostly; #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ sizeof(atomic_t) - \ @@ -51,7 +50,7 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); /* Get flushed in a more safe context. */ static void queue_flush_work(struct printk_safe_seq_buf *s) { - if (printk_safe_irq_ready) + if (printk_percpu_data_ready()) irq_work_queue(&s->work); } @@ -402,14 +401,6 @@ void __init printk_safe_init(void) #endif } - /* - * In the highly unlikely event that a NMI were to trigger at - * this moment. Make sure IRQ work is set up before this - * variable is set. - */ - barrier(); - printk_safe_irq_ready = 1; - /* Flush pending messages that did not have scheduled IRQ works. */ printk_safe_flush(); } -- cgit v1.2.3-59-g8ed1b