aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorJason Gunthorpe <jgg@nvidia.com>2020-11-23 16:50:59 -0400
committerJason Gunthorpe <jgg@nvidia.com>2020-11-23 16:50:59 -0400
commited92f6a52b84c0c03ae9d829cf118c6e38e456fb (patch)
tree7b85d805aa96e8b89e8be80bd03c6e43b1e6db62 /kernel
parentIB/qib: Use dma_set_mask_and_coherent to simplify code (diff)
parentLinux 5.10-rc5 (diff)
downloadlinux-dev-ed92f6a52b84c0c03ae9d829cf118c6e38e456fb.tar.xz
linux-dev-ed92f6a52b84c0c03ae9d829cf118c6e38e456fb.zip
Merge tag 'v5.10-rc5' into rdma.git for-next
For dependencies in following patches Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/Makefile6
-rw-r--r--kernel/bpf/bpf_lsm.c10
-rw-r--r--kernel/bpf/core.c2
-rw-r--r--kernel/bpf/hashtab.c30
-rw-r--r--kernel/bpf/preload/Kconfig1
-rw-r--r--kernel/bpf/verifier.c18
-rw-r--r--kernel/dma/swiotlb.c22
-rw-r--r--kernel/events/core.c57
-rw-r--r--kernel/events/internal.h16
-rw-r--r--kernel/events/ring_buffer.c20
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fail_function.c5
-rw-r--r--kernel/futex.c5
-rw-r--r--kernel/locking/lockdep.c25
-rw-r--r--kernel/panic.c3
-rw-r--r--kernel/ptrace.c16
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/rcu/tree_stall.h22
-rw-r--r--kernel/reboot.c28
-rw-r--r--kernel/sched/core.c26
-rw-r--r--kernel/sched/cpufreq_schedutil.c2
-rw-r--r--kernel/sched/deadline.c97
-rw-r--r--kernel/sched/debug.c12
-rw-r--r--kernel/sched/fair.c73
-rw-r--r--kernel/seccomp.c5
-rw-r--r--kernel/trace/bpf_trace.c12
-rw-r--r--kernel/watchdog.c4
27 files changed, 310 insertions, 214 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index bdc8cd1b6767..c1b9f71ee6aa 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,6 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
obj-y := core.o
-CFLAGS_core.o += $(call cc-disable-warning, override-init)
+ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y)
+# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details
+cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse
+endif
+CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index 78ea8a7bd27f..56cc5a915f67 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -13,6 +13,7 @@
#include <linux/bpf_verifier.h>
#include <net/bpf_sk_storage.h>
#include <linux/bpf_local_storage.h>
+#include <linux/btf_ids.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
@@ -26,7 +27,11 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
-#define BPF_LSM_SYM_PREFX "bpf_lsm_"
+#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME)
+BTF_SET_START(bpf_lsm_hooks)
+#include <linux/lsm_hook_defs.h>
+#undef LSM_HOOK
+BTF_SET_END(bpf_lsm_hooks)
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
@@ -37,8 +42,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
return -EINVAL;
}
- if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name,
- sizeof(BPF_LSM_SYM_PREFX) - 1)) {
+ if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) {
bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n",
prog->aux->attach_btf_id, prog->aux->attach_func_name);
return -EINVAL;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9268d77898b7..55454d2278b1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr)
*
* Decode and execute eBPF instructions.
*/
-static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
+static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack)
{
#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y
#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 1815e97d4c9c..1fccba6e88c4 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -821,6 +821,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
}
}
+static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr,
+ void *value, bool onallcpus)
+{
+ /* When using prealloc and not setting the initial value on all cpus,
+ * zero-fill element values for other cpus (just as what happens when
+ * not using prealloc). Otherwise, bpf program has no way to ensure
+ * known initial values for cpus other than current one
+ * (onallcpus=false always when coming from bpf prog).
+ */
+ if (htab_is_prealloc(htab) && !onallcpus) {
+ u32 size = round_up(htab->map.value_size, 8);
+ int current_cpu = raw_smp_processor_id();
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu == current_cpu)
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value,
+ size);
+ else
+ memset(per_cpu_ptr(pptr, cpu), 0, size);
+ }
+ } else {
+ pcpu_copy_value(htab, pptr, value, onallcpus);
+ }
+}
+
static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
{
return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
@@ -891,7 +917,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
}
}
- pcpu_copy_value(htab, pptr, value, onallcpus);
+ pcpu_init_value(htab, pptr, value, onallcpus);
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr);
@@ -1183,7 +1209,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
value, onallcpus);
} else {
- pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
+ pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size),
value, onallcpus);
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
l_new = NULL;
diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig
index ace49111d3a3..26bced262473 100644
--- a/kernel/bpf/preload/Kconfig
+++ b/kernel/bpf/preload/Kconfig
@@ -6,6 +6,7 @@ config USERMODE_DRIVER
menuconfig BPF_PRELOAD
bool "Preload BPF file system with kernel specific program and map iterators"
depends on BPF
+ depends on BPF_SYSCALL
# The dependency on !COMPILE_TEST prevents it from being enabled
# in allmodconfig or allyesconfig configurations
depends on !COMPILE_TEST
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6200519582a6..1388bf733071 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -7786,9 +7786,11 @@ static int check_return_code(struct bpf_verifier_env *env)
struct tnum range = tnum_range(0, 1);
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
int err;
+ const bool is_subprog = env->cur_state->frame[0]->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
+ if (!is_subprog &&
+ (prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
prog_type == BPF_PROG_TYPE_LSM) &&
!prog->aux->attach_func_proto->type)
return 0;
@@ -7808,6 +7810,16 @@ static int check_return_code(struct bpf_verifier_env *env)
return -EACCES;
}
+ reg = cur_regs(env) + BPF_REG_0;
+ if (is_subprog) {
+ if (reg->type != SCALAR_VALUE) {
+ verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+ return 0;
+ }
+
switch (prog_type) {
case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
@@ -7861,7 +7873,6 @@ static int check_return_code(struct bpf_verifier_env *env)
return 0;
}
- reg = cur_regs(env) + BPF_REG_0;
if (reg->type != SCALAR_VALUE) {
verbose(env, "At program exit the register R0 is not a known value (%s)\n",
reg_type_str[reg->type]);
@@ -9572,12 +9583,13 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_insn_aux_data *aux)
{
- u32 datasec_id, type, id = insn->imm;
const struct btf_var_secinfo *vsi;
const struct btf_type *datasec;
const struct btf_type *t;
const char *sym_name;
bool percpu = false;
+ u32 type, id = insn->imm;
+ s32 datasec_id;
u64 addr;
int i;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index b4eea0abc3f0..781b9dca197c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -229,6 +229,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
}
io_tlb_index = 0;
+ no_iotlb_memory = false;
if (verbose)
swiotlb_print_info();
@@ -260,9 +261,11 @@ swiotlb_init(int verbose)
if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose))
return;
- if (io_tlb_start)
+ if (io_tlb_start) {
memblock_free_early(io_tlb_start,
PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
+ io_tlb_start = 0;
+ }
pr_warn("Cannot allocate buffer");
no_iotlb_memory = true;
}
@@ -360,6 +363,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
io_tlb_orig_addr[i] = INVALID_PHYS_ADDR;
}
io_tlb_index = 0;
+ no_iotlb_memory = false;
swiotlb_print_info();
@@ -441,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
}
}
-phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
- dma_addr_t tbl_dma_addr,
- phys_addr_t orig_addr,
- size_t mapping_size,
- size_t alloc_size,
- enum dma_data_direction dir,
- unsigned long attrs)
+phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr,
+ size_t mapping_size, size_t alloc_size,
+ enum dma_data_direction dir, unsigned long attrs)
{
+ dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start);
unsigned long flags;
phys_addr_t tlb_addr;
unsigned int nslots, stride, index, wrap;
@@ -667,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
swiotlb_force);
- swiotlb_addr = swiotlb_tbl_map_single(dev,
- phys_to_dma_unencrypted(dev, io_tlb_start),
- paddr, size, size, dir, attrs);
+ swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir,
+ attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5a29ab09e72d..dc568ca295bd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2312,9 +2312,6 @@ group_sched_out(struct perf_event *group_event,
event_sched_out(event, cpuctx, ctx);
perf_pmu_enable(ctx->pmu);
-
- if (group_event->attr.exclusive)
- cpuctx->exclusive = 0;
}
#define DETACH_GROUP 0x01UL
@@ -2583,11 +2580,8 @@ group_sched_in(struct perf_event *group_event,
pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
- if (event_sched_in(group_event, cpuctx, ctx)) {
- pmu->cancel_txn(pmu);
- perf_mux_hrtimer_restart(cpuctx);
- return -EAGAIN;
- }
+ if (event_sched_in(group_event, cpuctx, ctx))
+ goto error;
/*
* Schedule in siblings as one group (if any):
@@ -2616,10 +2610,8 @@ group_error:
}
event_sched_out(group_event, cpuctx, ctx);
+error:
pmu->cancel_txn(pmu);
-
- perf_mux_hrtimer_restart(cpuctx);
-
return -EAGAIN;
}
@@ -2645,7 +2637,7 @@ static int group_can_go_on(struct perf_event *event,
* If this group is exclusive and there are already
* events on the CPU, it can't go on.
*/
- if (event->attr.exclusive && cpuctx->active_oncpu)
+ if (event->attr.exclusive && !list_empty(get_event_list(event)))
return 0;
/*
* Otherwise, try to add it if all previous groups were able
@@ -3679,6 +3671,7 @@ static int merge_sched_in(struct perf_event *event, void *data)
*can_add_hw = 0;
ctx->rotate_necessary = 1;
+ perf_mux_hrtimer_restart(cpuctx);
}
return 0;
@@ -6374,14 +6367,13 @@ perf_output_sample_regs(struct perf_output_handle *handle,
}
static void perf_sample_regs_user(struct perf_regs *regs_user,
- struct pt_regs *regs,
- struct pt_regs *regs_user_copy)
+ struct pt_regs *regs)
{
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
} else if (!(current->flags & PF_KTHREAD)) {
- perf_get_regs_user(regs_user, regs, regs_user_copy);
+ perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
regs_user->regs = NULL;
@@ -7083,8 +7075,7 @@ void perf_prepare_sample(struct perf_event_header *header,
}
if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
- perf_sample_regs_user(&data->regs_user, regs,
- &data->regs_user_copy);
+ perf_sample_regs_user(&data->regs_user, regs);
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
@@ -7186,6 +7177,7 @@ __perf_event_output(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs,
int (*output_begin)(struct perf_output_handle *,
+ struct perf_sample_data *,
struct perf_event *,
unsigned int))
{
@@ -7198,7 +7190,7 @@ __perf_event_output(struct perf_event *event,
perf_prepare_sample(&header, data, event, regs);
- err = output_begin(&handle, event, header.size);
+ err = output_begin(&handle, data, event, header.size);
if (err)
goto exit;
@@ -7264,7 +7256,7 @@ perf_event_read_event(struct perf_event *event,
int ret;
perf_event_header__init_id(&read_event.header, &sample, event);
- ret = perf_output_begin(&handle, event, read_event.header.size);
+ ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
if (ret)
return;
@@ -7533,7 +7525,7 @@ static void perf_event_task_output(struct perf_event *event,
perf_event_header__init_id(&task_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
task_event->event_id.header.size);
if (ret)
goto out;
@@ -7636,7 +7628,7 @@ static void perf_event_comm_output(struct perf_event *event,
return;
perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
comm_event->event_id.header.size);
if (ret)
@@ -7736,7 +7728,7 @@ static void perf_event_namespaces_output(struct perf_event *event,
perf_event_header__init_id(&namespaces_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
namespaces_event->event_id.header.size);
if (ret)
goto out;
@@ -7863,7 +7855,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data)
perf_event_header__init_id(&cgroup_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
cgroup_event->event_id.header.size);
if (ret)
goto out;
@@ -7989,7 +7981,7 @@ static void perf_event_mmap_output(struct perf_event *event,
}
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
mmap_event->event_id.header.size);
if (ret)
goto out;
@@ -8299,7 +8291,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
int ret;
perf_event_header__init_id(&rec.header, &sample, event);
- ret = perf_output_begin(&handle, event, rec.header.size);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
@@ -8333,7 +8325,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
perf_event_header__init_id(&lost_samples_event.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
lost_samples_event.header.size);
if (ret)
return;
@@ -8388,7 +8380,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data)
perf_event_header__init_id(&se->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event, se->event_id.header.size);
+ ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
if (ret)
return;
@@ -8463,7 +8455,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_event_header__init_id(&throttle_event.header, &sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
throttle_event.header.size);
if (ret)
return;
@@ -8506,7 +8498,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data)
perf_event_header__init_id(&ksymbol_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, &sample, event,
ksymbol_event->event_id.header.size);
if (ret)
return;
@@ -8596,7 +8588,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data)
perf_event_header__init_id(&bpf_event->event_id.header,
&sample, event);
- ret = perf_output_begin(&handle, event,
+ ret = perf_output_begin(&handle, data, event,
bpf_event->event_id.header.size);
if (ret)
return;
@@ -8705,7 +8697,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data)
perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
- ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size);
+ ret = perf_output_begin(&handle, &sample, event,
+ text_poke_event->event_id.header.size);
if (ret)
return;
@@ -8786,7 +8779,7 @@ static void perf_log_itrace_start(struct perf_event *event)
rec.tid = perf_event_tid(event, current);
perf_event_header__init_id(&rec.header, &sample, event);
- ret = perf_output_begin(&handle, event, rec.header.size);
+ ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index fcbf5616a441..228801e20788 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -205,16 +205,12 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
static inline int get_recursion_context(int *recursion)
{
- int rctx;
-
- if (unlikely(in_nmi()))
- rctx = 3;
- else if (in_irq())
- rctx = 2;
- else if (in_softirq())
- rctx = 1;
- else
- rctx = 0;
+ unsigned int pc = preempt_count();
+ unsigned char rctx = 0;
+
+ rctx += !!(pc & (NMI_MASK));
+ rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
+ rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
if (recursion[rctx])
return -1;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 192b8abc6330..ef91ae75ca56 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail,
static __always_inline int
__perf_output_begin(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
struct perf_event *event, unsigned int size,
bool backward)
{
@@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle,
handle->size = (1UL << page_shift) - offset;
if (unlikely(have_lost)) {
- struct perf_sample_data sample_data;
-
lost_event.header.size = sizeof(lost_event);
lost_event.header.type = PERF_RECORD_LOST;
lost_event.header.misc = 0;
lost_event.id = event->id;
lost_event.lost = local_xchg(&rb->lost, 0);
- perf_event_header__init_id(&lost_event.header,
- &sample_data, event);
+ /* XXX mostly redundant; @data is already fully initializes */
+ perf_event_header__init_id(&lost_event.header, data, event);
perf_output_put(handle, lost_event);
- perf_event__output_id_sample(event, handle, &sample_data);
+ perf_event__output_id_sample(event, handle, data);
}
return 0;
@@ -263,22 +262,25 @@ out:
}
int perf_output_begin_forward(struct perf_output_handle *handle,
- struct perf_event *event, unsigned int size)
+ struct perf_sample_data *data,
+ struct perf_event *event, unsigned int size)
{
- return __perf_output_begin(handle, event, size, false);
+ return __perf_output_begin(handle, data, event, size, false);
}
int perf_output_begin_backward(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
struct perf_event *event, unsigned int size)
{
- return __perf_output_begin(handle, event, size, true);
+ return __perf_output_begin(handle, data, event, size, true);
}
int perf_output_begin(struct perf_output_handle *handle,
+ struct perf_sample_data *data,
struct perf_event *event, unsigned int size)
{
- return __perf_output_begin(handle, event, size,
+ return __perf_output_begin(handle, data, event, size,
unlikely(is_write_backward(event)));
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 87a2d515de0d..1f236ed375f8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -454,7 +454,10 @@ static void exit_mm(void)
mmap_read_unlock(mm);
self.task = current;
- self.next = xchg(&core_state->dumper.next, &self);
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
index 63b349168da7..b0b1ad93fa95 100644
--- a/kernel/fail_function.c
+++ b/kernel/fail_function.c
@@ -253,7 +253,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
if (copy_from_user(buf, buffer, count)) {
ret = -EFAULT;
- goto out;
+ goto out_free;
}
buf[count] = '\0';
sym = strstrip(buf);
@@ -307,8 +307,9 @@ static ssize_t fei_write(struct file *file, const char __user *buffer,
ret = count;
}
out:
- kfree(buf);
mutex_unlock(&fei_lock);
+out_free:
+ kfree(buf);
return ret;
}
diff --git a/kernel/futex.c b/kernel/futex.c
index ac328874f6e5..00259c7e288e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -788,8 +788,9 @@ static void put_pi_state(struct futex_pi_state *pi_state)
*/
if (pi_state->owner) {
struct task_struct *owner;
+ unsigned long flags;
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
owner = pi_state->owner;
if (owner) {
raw_spin_lock(&owner->pi_lock);
@@ -797,7 +798,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
raw_spin_unlock(&owner->pi_lock);
}
rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
}
if (current->pi_state_cache) {
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b71ad8d9f1c9..c1418b47f625 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -108,19 +108,21 @@ static inline void lockdep_lock(void)
{
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+ __this_cpu_inc(lockdep_recursion);
arch_spin_lock(&__lock);
__owner = current;
- __this_cpu_inc(lockdep_recursion);
}
static inline void lockdep_unlock(void)
{
+ DEBUG_LOCKS_WARN_ON(!irqs_disabled());
+
if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current))
return;
- __this_cpu_dec(lockdep_recursion);
__owner = NULL;
arch_spin_unlock(&__lock);
+ __this_cpu_dec(lockdep_recursion);
}
static inline bool lockdep_assert_locked(void)
@@ -2765,7 +2767,9 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
* (Note that this has to be done separately, because the graph cannot
* detect such classes of deadlocks.)
*
- * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
+ * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same
+ * lock class is held but nest_lock is also held, i.e. we rely on the
+ * nest_lock to avoid the deadlock.
*/
static int
check_deadlock(struct task_struct *curr, struct held_lock *next)
@@ -2788,7 +2792,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
* lock class (i.e. read_lock(lock)+read_lock(lock)):
*/
if ((next->read == 2) && prev->read)
- return 2;
+ continue;
/*
* We're holding the nest_lock, which serializes this lock's
@@ -3593,15 +3597,12 @@ static int validate_chain(struct task_struct *curr,
if (!ret)
return 0;
/*
- * Mark recursive read, as we jump over it when
- * building dependencies (just like we jump over
- * trylock entries):
- */
- if (ret == 2)
- hlock->read = 2;
- /*
* Add dependency only if this lock is not the head
- * of the chain, and if it's not a secondary read-lock:
+ * of the chain, and if the new lock introduces no more
+ * lock dependency (because we already hold a lock with the
+ * same lock class) nor deadlock (because the nest_lock
+ * serializes nesting locks), see the comments for
+ * check_deadlock().
*/
if (!chain_head && ret != 2) {
if (!check_prevs_add(curr, hlock))
diff --git a/kernel/panic.c b/kernel/panic.c
index 396142ee43fd..332736a72a58 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
panic("panic_on_warn set ...\n");
}
- dump_stack();
+ if (!regs)
+ dump_stack();
print_irqtrace_events(current);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 43d6179508d6..79de1294f8eb 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -264,17 +264,11 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
-static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns,
- unsigned int mode)
+static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
{
- int ret;
-
if (mode & PTRACE_MODE_NOAUDIT)
- ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT);
- else
- ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE);
-
- return ret == 0;
+ return ns_capable_noaudit(ns, CAP_SYS_PTRACE);
+ return ns_capable(ns, CAP_SYS_PTRACE);
}
/* Returns 0 on success, -errno on denial. */
@@ -326,7 +320,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
gid_eq(caller_gid, tcred->sgid) &&
gid_eq(caller_gid, tcred->gid))
goto ok;
- if (ptrace_has_cap(cred, tcred->user_ns, mode))
+ if (ptrace_has_cap(tcred->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -345,7 +339,7 @@ ok:
mm = task->mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
- !ptrace_has_cap(cred, mm->user_ns, mode)))
+ !ptrace_has_cap(mm->user_ns, mode)))
return -EPERM;
return security_ptrace_access_check(task, mode);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2a52f42f64b6..bd04b09b84b3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4077,7 +4077,6 @@ void rcu_cpu_starting(unsigned int cpu)
smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
-#ifdef CONFIG_HOTPLUG_CPU
/*
* The outgoing function has no further need of RCU, so remove it from
* the rcu_node tree's ->qsmaskinitnext bit masks.
@@ -4117,6 +4116,7 @@ void rcu_report_dead(unsigned int cpu)
rdp->cpu_started = false;
}
+#ifdef CONFIG_HOTPLUG_CPU
/*
* The outgoing CPU has just passed through the dying-idle state, and we
* are being invoked from the CPU that was IPIed to continue the offline
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 0fde39b8daab..ca21d28a0f98 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -249,13 +249,16 @@ static bool check_slow_task(struct task_struct *t, void *arg)
/*
* Scan the current list of tasks blocked within RCU read-side critical
- * sections, printing out the tid of each.
+ * sections, printing out the tid of each of the first few of them.
*/
-static int rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
+ int i = 0;
int ndetected = 0;
struct rcu_stall_chk_rdr rscr;
struct task_struct *t;
+ struct task_struct *ts[8];
if (!rcu_preempt_blocked_readers_cgp(rnp))
return 0;
@@ -264,6 +267,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
+ get_task_struct(t);
+ ts[i++] = t;
+ if (i >= ARRAY_SIZE(ts))
+ break;
+ }
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ for (i--; i; i--) {
+ t = ts[i];
if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
@@ -273,6 +284,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
".q"[rscr.rs.b.need_qs],
".e"[rscr.rs.b.exp_hint],
".l"[rscr.on_blkd_list]);
+ put_task_struct(t);
ndetected++;
}
pr_cont("\n");
@@ -293,8 +305,9 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
-static int rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
{
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
}
#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
@@ -472,7 +485,6 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
rcu_for_each_leaf_node(rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- ndetected += rcu_print_task_stall(rnp);
if (rnp->qsmask != 0) {
for_each_leaf_node_possible_cpu(rnp, cpu)
if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) {
@@ -480,7 +492,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
ndetected++;
}
}
- raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock.
}
for_each_possible_cpu(cpu)
diff --git a/kernel/reboot.c b/kernel/reboot.c
index e7b78d5ae1ab..af6f23d8bea1 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -551,22 +551,22 @@ static int __init reboot_setup(char *str)
break;
case 's':
- {
- int rc;
-
- if (isdigit(*(str+1))) {
- rc = kstrtoint(str+1, 0, &reboot_cpu);
- if (rc)
- return rc;
- } else if (str[1] == 'm' && str[2] == 'p' &&
- isdigit(*(str+3))) {
- rc = kstrtoint(str+3, 0, &reboot_cpu);
- if (rc)
- return rc;
- } else
+ if (isdigit(*(str+1)))
+ reboot_cpu = simple_strtoul(str+1, NULL, 0);
+ else if (str[1] == 'm' && str[2] == 'p' &&
+ isdigit(*(str+3)))
+ reboot_cpu = simple_strtoul(str+3, NULL, 0);
+ else
*mode = REBOOT_SOFT;
+ if (reboot_cpu >= num_possible_cpus()) {
+ pr_err("Ignoring the CPU number in reboot= option. "
+ "CPU %d exceeds possible cpu number %d\n",
+ reboot_cpu, num_possible_cpus());
+ reboot_cpu = 0;
+ break;
+ }
break;
- }
+
case 'g':
*mode = REBOOT_GPIO;
break;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2003a7d5ab5..e7e453492cff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2501,7 +2501,12 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#ifdef CONFIG_SMP
if (wake_flags & WF_MIGRATED)
en_flags |= ENQUEUE_MIGRATED;
+ else
#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
activate_task(rq, p, en_flags);
ttwu_do_wakeup(rq, p, wake_flags, rf);
@@ -2888,11 +2893,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags))
goto unlock;
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
@@ -2963,6 +2963,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
@@ -4907,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_prio(pi_task->prio) &&
dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.dl_boosted = 1;
+ p->dl.pi_se = pi_task->dl.pi_se;
queue_flag |= ENQUEUE_REPLENISH;
- } else
- p->dl.dl_boosted = 0;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
p->sched_class = &dl_sched_class;
} else if (rt_prio(prio)) {
if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
+ p->dl.pi_se = &p->dl;
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
- p->dl.dl_boosted = 0;
+ p->dl.pi_se = &p->dl;
if (rt_prio(oldprio))
p->rt.timeout = 0;
p->sched_class = &fair_sched_class;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d73bccde2720..97d318b0cd0c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -881,7 +881,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
.owner = THIS_MODULE,
- .dynamic_switching = true,
+ .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING,
.init = sugov_init,
.exit = sugov_exit,
.start = sugov_start,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f232305dcefe..1d3c97268ec0 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+#ifdef CONFIG_RT_MUTEXES
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se->pi_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return pi_of(dl_se) != dl_se;
+}
+#else
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+ return dl_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+ return false;
+}
+#endif
+
#ifdef CONFIG_SMP
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- WARN_ON(dl_se->dl_boosted);
+ WARN_ON(is_dl_boosted(dl_se));
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
@@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
* could happen are, typically, a entity voluntarily trying to overcome its
* runtime, or it just underestimated it during sched_setattr().
*/
-static void replenish_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void replenish_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
- BUG_ON(pi_se->dl_runtime <= 0);
+ BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
/*
* This could be the case for a !-dl task that is boosted.
* Just go with full inherited parameters.
*/
if (dl_se->dl_deadline == 0) {
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
if (dl_se->dl_yielded && dl_se->runtime > 0)
@@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* arbitrary large.
*/
while (dl_se->runtime <= 0) {
- dl_se->deadline += pi_se->dl_period;
- dl_se->runtime += pi_se->dl_runtime;
+ dl_se->deadline += pi_of(dl_se)->dl_period;
+ dl_se->runtime += pi_of(dl_se)->dl_runtime;
}
/*
@@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
printk_deferred_once("sched: DL replenish lagged too much\n");
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
if (dl_se->dl_yielded)
@@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* task with deadline equal to period this is the same of using
* dl_period instead of dl_deadline in the equation above.
*/
-static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, u64 t)
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
{
u64 left, right;
@@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
- (pi_se->dl_runtime >> DL_SCALE);
+ (pi_of(dl_se)->dl_runtime >> DL_SCALE);
return dl_time_before(right, left);
}
@@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
* Please refer to the comments update_dl_revised_wakeup() function to find
* more about the Revised CBS rule.
*/
-static void update_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se)
+static void update_dl_entity(struct sched_dl_entity *dl_se)
{
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
- dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+ dl_entity_overflow(dl_se, rq_clock(rq))) {
if (unlikely(!dl_is_implicit(dl_se) &&
!dl_time_before(dl_se->deadline, rq_clock(rq)) &&
- !dl_se->dl_boosted)){
+ !is_dl_boosted(dl_se))) {
update_dl_revised_wakeup(dl_se, rq);
return;
}
- dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
- dl_se->runtime = pi_se->dl_runtime;
+ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+ dl_se->runtime = pi_of(dl_se)->dl_runtime;
}
}
@@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled.
*/
- if (dl_se->dl_boosted)
+ if (is_dl_boosted(dl_se))
goto unlock;
/*
@@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
* but do not enqueue -- wait for our wakeup to do that.
*/
if (!task_on_rq_queued(p)) {
- replenish_dl_entity(dl_se, dl_se);
+ replenish_dl_entity(dl_se);
goto unlock;
}
@@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
return;
dl_se->dl_throttled = 1;
if (dl_se->runtime > 0)
@@ -1287,7 +1306,7 @@ throttle:
dl_se->dl_overrun = 1;
__dequeue_task_dl(rq, curr, 0);
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+ if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
}
static void
-enqueue_dl_entity(struct sched_dl_entity *dl_se,
- struct sched_dl_entity *pi_se, int flags)
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
{
BUG_ON(on_dl_rq(dl_se));
@@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
*/
if (flags & ENQUEUE_WAKEUP) {
task_contending(dl_se, flags);
- update_dl_entity(dl_se, pi_se);
+ update_dl_entity(dl_se);
} else if (flags & ENQUEUE_REPLENISH) {
- replenish_dl_entity(dl_se, pi_se);
+ replenish_dl_entity(dl_se);
} else if ((flags & ENQUEUE_RESTORE) &&
dl_time_before(dl_se->deadline,
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
@@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
- struct task_struct *pi_task = rt_mutex_get_top_task(p);
- struct sched_dl_entity *pi_se = &p->dl;
-
- /*
- * Use the scheduling parameters of the top pi-waiter task if:
- * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
- * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
- * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
- * boosted due to a SCHED_DEADLINE pi-waiter).
- * Otherwise we keep our runtime and deadline.
- */
- if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
- pi_se = &pi_task->dl;
+ if (is_dl_boosted(&p->dl)) {
/*
* Because of delays in the detection of the overrun of a
* thread's runtime, it might be the case that a thread
@@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* the throttle.
*/
p->dl.dl_throttled = 0;
- BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
return;
}
@@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
return;
}
- enqueue_dl_entity(&p->dl, pi_se, flags);
+ enqueue_dl_entity(&p->dl, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
@@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_density = 0;
- dl_se->dl_boosted = 0;
dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
dl_se->dl_non_contending = 0;
dl_se->dl_overrun = 0;
+
+#ifdef CONFIG_RT_MUTEXES
+ dl_se->pi_se = dl_se;
+#endif
}
bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0655524700d2..2357921580f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -251,7 +251,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,
unsigned long flags = *(unsigned long *)table->data;
size_t data_size = 0;
size_t len = 0;
- char *tmp;
+ char *tmp, *buf;
int idx;
if (write)
@@ -269,17 +269,17 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,
return 0;
}
- tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL);
- if (!tmp)
+ buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL);
+ if (!buf)
return -ENOMEM;
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
char *name = sd_flag_debug[idx].name;
- len += snprintf(tmp + len, strlen(name) + 2, "%s ", name);
+ len += snprintf(buf + len, strlen(name) + 2, "%s ", name);
}
- tmp += *ppos;
+ tmp = buf + *ppos;
len -= *ppos;
if (len > *lenp)
@@ -294,7 +294,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write,
*lenp = len;
*ppos += len;
- kfree(tmp);
+ kfree(buf);
return 0;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 290f9e38378c..ae7ceba8fd4f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5477,6 +5477,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
int idle_h_nr_running = task_has_idle_policy(p);
+ int task_new = !(flags & ENQUEUE_WAKEUP);
/*
* The code below (indirectly) updates schedutil which looks at
@@ -5549,7 +5550,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
* into account, but that is not straightforward to implement,
* and the following generally works well enough in practice.
*/
- if (flags & ENQUEUE_WAKEUP)
+ if (!task_new)
update_overutilized_status(rq);
enqueue_throttle:
@@ -6172,21 +6173,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long best_cap = 0;
+ unsigned long task_util, best_cap = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
- sync_entity_load_avg(&p->se);
-
cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
+ task_util = uclamp_task_util(p);
+
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (task_fits_capacity(p, cpu_cap))
+ if (fits_capacity(task_util, cpu_cap))
return cpu;
if (cpu_cap > best_cap) {
@@ -6198,44 +6199,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
+static inline bool asym_fits_capacity(int task_util, int cpu)
+{
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
+ return fits_capacity(task_util, capacity_of(cpu));
+
+ return true;
+}
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
+ unsigned long task_util;
int i, recent_used_cpu;
/*
- * For asymmetric CPU capacity systems, our domain of interest is
- * sd_asym_cpucapacity rather than sd_llc.
+ * On asymmetric system, update task utilization because we will check
+ * that the task fits with cpu's capacity.
*/
if (static_branch_unlikely(&sched_asym_cpucapacity)) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
- /*
- * On an asymmetric CPU capacity system where an exclusive
- * cpuset defines a symmetric island (i.e. one unique
- * capacity_orig value through the cpuset), the key will be set
- * but the CPUs within that cpuset will not have a domain with
- * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
- * capacity path.
- */
- if (!sd)
- goto symmetric;
-
- i = select_idle_capacity(p, sd, target);
- return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ sync_entity_load_avg(&p->se);
+ task_util = uclamp_task_util(p);
}
-symmetric:
- if (available_idle_cpu(target) || sched_idle_cpu(target))
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ asym_fits_capacity(task_util, target))
return target;
/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)))
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ asym_fits_capacity(task_util, prev))
return prev;
/*
@@ -6258,7 +6257,8 @@ symmetric:
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
+ asym_fits_capacity(task_util, recent_used_cpu)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6267,6 +6267,26 @@ symmetric:
return recent_used_cpu;
}
+ /*
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ /*
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
+ */
+ if (sd) {
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
@@ -9031,7 +9051,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* emptying busiest.
*/
if (local->group_type == group_has_spare) {
- if (busiest->group_type > group_fully_busy) {
+ if ((busiest->group_type > group_fully_busy) &&
+ !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
/*
* If busiest is overloaded, try to fill spare
* capacity. This might end up creating spare capacity
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 8ad7a293255a..53a7d1512dd7 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -38,7 +38,7 @@
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
-#include <linux/security.h>
+#include <linux/capability.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>
@@ -558,8 +558,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
* behavior of privileged children.
*/
if (!task_no_new_privs(current) &&
- security_capable(current_cred(), current_user_ns(),
- CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0)
+ !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
/* Allocate a new seccomp_filter */
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 4517c8b66518..048c655315f1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -181,6 +181,16 @@ bpf_probe_read_user_str_common(void *dst, u32 size,
{
int ret;
+ /*
+ * NB: We rely on strncpy_from_user() not copying junk past the NUL
+ * terminator into `dst`.
+ *
+ * strncpy_from_user() does long-sized strides in the fast path. If the
+ * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`,
+ * then there could be junk after the NUL in `dst`. If user takes `dst`
+ * and keys a hash map with it, then semantically identical strings can
+ * occupy multiple entries in the map.
+ */
ret = strncpy_from_user_nofault(dst, unsafe_ptr, size);
if (unlikely(ret < 0))
memset(dst, 0, size);
@@ -1198,7 +1208,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
*btf = bpf_get_btf_vmlinux();
if (IS_ERR_OR_NULL(*btf))
- return PTR_ERR(*btf);
+ return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL;
if (ptr->type_id > 0)
*btf_id = ptr->type_id;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 5abb5b22ad13..71109065bd8e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
static int __read_mostly nmi_watchdog_available;
-static struct cpumask watchdog_allowed_mask __read_mostly;
-
struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void)
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
#endif
+static struct cpumask watchdog_allowed_mask __read_mostly;
+
/* Global variables, exported for sysctl */
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;