diff options
Diffstat (limited to 'arch/x86/kernel/alternative.c')
-rw-r--r-- | arch/x86/kernel/alternative.c | 2426 |
1 files changed, 1931 insertions, 495 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 62f6b8b7c4a5..ea1d984166cd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,35 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt -#include <linux/module.h> -#include <linux/sched.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> -#include <linux/mutex.h> -#include <linux/list.h> -#include <linux/stringify.h> -#include <linux/highmem.h> -#include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> -#include <linux/stop_machine.h> -#include <linux/slab.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> -#include <linux/bsearch.h> -#include <linux/sync_core.h> +#include <linux/execmem.h> + #include <asm/text-patching.h> -#include <asm/alternative.h> -#include <asm/sections.h> -#include <asm/mce.h> -#include <asm/nmi.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> #include <asm/insn.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/paravirt.h> -#include <asm/asm-prototypes.h> +#include <asm/ibt.h> +#include <asm/set_memory.h> +#include <asm/nmi.h> int __read_mostly alternatives_patched; @@ -37,11 +19,23 @@ EXPORT_SYMBOL_GPL(alternatives_patched); #define MAX_PATCH_LEN (255-1) -static int __initdata_or_module debug_alternative; +#define DA_ALL (~0) +#define DA_ALT 0x01 +#define DA_RET 0x02 +#define DA_RETPOLINE 0x04 +#define DA_ENDBR 0x08 +#define DA_SMP 0x10 + +static unsigned int debug_alternative; static int __init debug_alt(char *str) { - debug_alternative = 1; + if (str && *str == '=') + str++; + + if (!str || kstrtouint(str, 0, &debug_alternative)) + debug_alternative = DA_ALL; + return 1; } __setup("debug-alternative", debug_alt); @@ -55,15 +49,15 @@ static int __init setup_noreplace_smp(char *str) } __setup("noreplace-smp", setup_noreplace_smp); -#define DPRINTK(fmt, args...) \ +#define DPRINTK(type, fmt, args...) \ do { \ - if (debug_alternative) \ + if (debug_alternative & DA_##type) \ printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args); \ } while (0) -#define DUMP_BYTES(buf, len, fmt, args...) \ +#define DUMP_BYTES(type, buf, len, fmt, args...) \ do { \ - if (unlikely(debug_alternative)) { \ + if (unlikely(debug_alternative & DA_##type)) { \ int j; \ \ if (!(len)) \ @@ -86,6 +80,11 @@ static const unsigned char x86nops[] = BYTES_NOP6, BYTES_NOP7, BYTES_NOP8, +#ifdef CONFIG_64BIT + BYTES_NOP9, + BYTES_NOP10, + BYTES_NOP11, +#endif }; const unsigned char * const x86_nops[ASM_NOP_MAX+1] = @@ -99,159 +98,513 @@ const unsigned char * const x86_nops[ASM_NOP_MAX+1] = x86nops + 1 + 2 + 3 + 4 + 5, x86nops + 1 + 2 + 3 + 4 + 5 + 6, x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +#ifdef CONFIG_64BIT + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9, + x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10, +#endif }; -/* Use this to add nops to a buffer, then text_poke the whole buffer. */ -static void __init_or_module add_nops(void *insns, unsigned int len) +#ifdef CONFIG_FINEIBT +static bool cfi_paranoid __ro_after_init; +#endif + +#ifdef CONFIG_MITIGATION_ITS + +#ifdef CONFIG_MODULES +static struct module *its_mod; +#endif +static void *its_page; +static unsigned int its_offset; +struct its_array its_pages; + +static void *__its_alloc(struct its_array *pages) { - while (len > 0) { - unsigned int noplen = len; - if (noplen > ASM_NOP_MAX) - noplen = ASM_NOP_MAX; - memcpy(insns, x86_nops[noplen], noplen); - insns += noplen; - len -= noplen; + void *page __free(execmem) = execmem_alloc(EXECMEM_MODULE_TEXT, PAGE_SIZE); + if (!page) + return NULL; + + void *tmp = krealloc(pages->pages, (pages->num+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + pages->pages = tmp; + pages->pages[pages->num++] = page; + + return no_free_ptr(page); +} + +/* Initialize a thunk with the "jmp *reg; int3" instructions. */ +static void *its_init_thunk(void *thunk, int reg) +{ + u8 *bytes = thunk; + int offset = 0; + int i = 0; + +#ifdef CONFIG_FINEIBT + if (cfi_paranoid) { + /* + * When ITS uses indirect branch thunk the fineibt_paranoid + * caller sequence doesn't fit in the caller site. So put the + * remaining part of the sequence (<ea> + JNE) into the ITS + * thunk. + */ + bytes[i++] = 0xea; /* invalid instruction */ + bytes[i++] = 0x75; /* JNE */ + bytes[i++] = 0xfd; + + offset = 1; + } +#endif + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; } + bytes[i++] = 0xff; + bytes[i++] = 0xe0 + reg; /* jmp *reg */ + bytes[i++] = 0xcc; + + return thunk + offset; } -extern s32 __retpoline_sites[], __retpoline_sites_end[]; -extern s32 __return_sites[], __return_sites_end[]; -extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern s32 __smp_locks[], __smp_locks_end[]; -void text_poke_early(void *addr, const void *opcode, size_t len); +static void its_pages_protect(struct its_array *pages) +{ + for (int i = 0; i < pages->num; i++) { + void *page = pages->pages[i]; + execmem_restore_rox(page, PAGE_SIZE); + } +} -/* - * Are we looking at a near JMP with a 1 or 4-byte displacement. - */ -static inline bool is_jmp(const u8 opcode) +static void its_fini_core(void) { - return opcode == 0xeb || opcode == 0xe9; + if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) + its_pages_protect(&its_pages); + kfree(its_pages.pages); } -static void __init_or_module -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff) +#ifdef CONFIG_MODULES +void its_init_mod(struct module *mod) { - u8 *next_rip, *tgt_rip; - s32 n_dspl, o_dspl; - int repl_len; + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; - if (a->replacementlen != 5) + mutex_lock(&text_mutex); + its_mod = mod; + its_page = NULL; +} + +void its_fini_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) return; - o_dspl = *(s32 *)(insn_buff + 1); + WARN_ON_ONCE(its_mod != mod); - /* next_rip of the replacement JMP */ - next_rip = repl_insn + a->replacementlen; - /* target rip of the replacement JMP */ - tgt_rip = next_rip + o_dspl; - n_dspl = tgt_rip - orig_insn; + its_mod = NULL; + its_page = NULL; + mutex_unlock(&text_mutex); - DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl); + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) + its_pages_protect(&mod->arch.its_pages); +} - if (tgt_rip - orig_insn >= 0) { - if (n_dspl - 2 <= 127) - goto two_byte_jmp; - else - goto five_byte_jmp; - /* negative offset */ - } else { - if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) - goto two_byte_jmp; - else - goto five_byte_jmp; +void its_free_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + for (int i = 0; i < mod->arch.its_pages.num; i++) { + void *page = mod->arch.its_pages.pages[i]; + execmem_free(page); } + kfree(mod->arch.its_pages.pages); +} +#endif /* CONFIG_MODULES */ -two_byte_jmp: - n_dspl -= 2; +static void *its_alloc(void) +{ + struct its_array *pages = &its_pages; + void *page; - insn_buff[0] = 0xeb; - insn_buff[1] = (s8)n_dspl; - add_nops(insn_buff + 2, 3); +#ifdef CONFIG_MODULES + if (its_mod) + pages = &its_mod->arch.its_pages; +#endif - repl_len = 2; - goto done; + page = __its_alloc(pages); + if (!page) + return NULL; + + execmem_make_temp_rw(page, PAGE_SIZE); + if (pages == &its_pages) + set_memory_x((unsigned long)page, 1); + + return page; +} -five_byte_jmp: - n_dspl -= 5; +static void *its_allocate_thunk(int reg) +{ + int size = 3 + (reg / 8); + void *thunk; + +#ifdef CONFIG_FINEIBT + /* + * The ITS thunk contains an indirect jump and an int3 instruction so + * its size is 3 or 4 bytes depending on the register used. If CFI + * paranoid is used then 3 extra bytes are added in the ITS thunk to + * complete the fineibt_paranoid caller sequence. + */ + if (cfi_paranoid) + size += 3; +#endif - insn_buff[0] = 0xe9; - *(s32 *)&insn_buff[1] = n_dspl; + if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { + its_page = its_alloc(); + if (!its_page) { + pr_err("ITS page allocation failed\n"); + return NULL; + } + memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE); + its_offset = 32; + } - repl_len = 5; + /* + * If the indirect branch instruction will be in the lower half + * of a cacheline, then update the offset to reach the upper half. + */ + if ((its_offset + size - 1) % 64 < 32) + its_offset = ((its_offset - 1) | 0x3F) + 33; -done: + thunk = its_page + its_offset; + its_offset += size; - DPRINTK("final displ: 0x%08x, JMP 0x%lx", - n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); + return its_init_thunk(thunk, reg); } +u8 *its_static_thunk(int reg) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + +#ifdef CONFIG_FINEIBT + /* Paranoid thunk starts 2 bytes before */ + if (cfi_paranoid) + return thunk - 2; +#endif + return thunk; +} + +#else +static inline void its_fini_core(void) {} +#endif /* CONFIG_MITIGATION_ITS */ + /* - * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90) + * Nomenclature for variable names to simplify and clarify this code and ease + * any potential staring at it: + * + * @instr: source address of the original instructions in the kernel text as + * generated by the compiler. * - * @instr: instruction byte stream - * @instrlen: length of the above - * @off: offset within @instr where the first NOP has been detected + * @buf: temporary buffer on which the patching operates. This buffer is + * eventually text-poked into the kernel image. * - * Return: number of NOPs found (and replaced). + * @replacement/@repl: pointer to the opcodes which are replacing @instr, located + * in the .altinstr_replacement section. */ -static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off) + +/* + * Fill the buffer with a single effective instruction of size @len. + * + * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info) + * for every single-byte NOP, try to generate the maximally available NOP of + * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for + * each single-byte NOPs). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and + * *jump* over instead of executing long and daft NOPs. + */ +static void add_nop(u8 *buf, unsigned int len) { - unsigned long flags; - int i = off, nnops; + u8 *target = buf + len; - while (i < instrlen) { - if (instr[i] != 0x90) - break; + if (!len) + return; - i++; + if (len <= ASM_NOP_MAX) { + memcpy(buf, x86_nops[len], len); + return; } - nnops = i - off; + if (len < 128) { + __text_gen_insn(buf, JMP8_INSN_OPCODE, buf, target, JMP8_INSN_SIZE); + buf += JMP8_INSN_SIZE; + } else { + __text_gen_insn(buf, JMP32_INSN_OPCODE, buf, target, JMP32_INSN_SIZE); + buf += JMP32_INSN_SIZE; + } - if (nnops <= 1) - return nnops; + for (;buf < target; buf++) + *buf = INT3_INSN_OPCODE; +} - local_irq_save(flags); - add_nops(instr + off, nnops); - local_irq_restore(flags); +/* + * Matches NOP and NOPL, not any of the other possible NOPs. + */ +static bool insn_is_nop(struct insn *insn) +{ + /* Anything NOP, but no REP NOP */ + if (insn->opcode.bytes[0] == 0x90 && + (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3)) + return true; + + /* NOPL */ + if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F) + return true; - DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i); + /* TODO: more nops */ + + return false; +} + +/* + * Find the offset of the first non-NOP instruction starting at @offset + * but no further than @len. + */ +static int skip_nops(u8 *buf, int offset, int len) +{ + struct insn insn; + + for (; offset < len; offset += insn.length) { + if (insn_decode_kernel(&insn, &buf[offset])) + break; + + if (!insn_is_nop(&insn)) + break; + } - return nnops; + return offset; } /* * "noinline" to cause control flow change and thus invalidate I$ and * cause refetch after modification. */ -static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) +static void noinline optimize_nops(const u8 * const instr, u8 *buf, size_t len) { - struct insn insn; - int i = 0; + for (int next, i = 0; i < len; i = next) { + struct insn insn; + + if (insn_decode_kernel(&insn, &buf[i])) + return; + + next = i + insn.length; + + if (insn_is_nop(&insn)) { + int nop = i; + + /* Has the NOP already been optimized? */ + if (i + insn.length == len) + return; + + next = skip_nops(buf, next, len); + + add_nop(buf + nop, next - nop); + DUMP_BYTES(ALT, buf, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, next); + } + } +} + +/* + * In this context, "source" is where the instructions are placed in the + * section .altinstr_replacement, for example during kernel build by the + * toolchain. + * "Destination" is where the instructions are being patched in by this + * machinery. + * + * The source offset is: + * + * src_imm = target - src_next_ip (1) + * + * and the target offset is: + * + * dst_imm = target - dst_next_ip (2) + * + * so rework (1) as an expression for target like: + * + * target = src_imm + src_next_ip (1a) + * + * and substitute in (2) to get: + * + * dst_imm = (src_imm + src_next_ip) - dst_next_ip (3) + * + * Now, since the instruction stream is 'identical' at src and dst (it + * is being copied after all) it can be stated that: + * + * src_next_ip = src + ip_offset + * dst_next_ip = dst + ip_offset (4) + * + * Substitute (4) in (3) and observe ip_offset being cancelled out to + * obtain: + * + * dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset) + * = src_imm + src - dst + ip_offset - ip_offset + * = src_imm + src - dst (5) + * + * IOW, only the relative displacement of the code block matters. + */ + +#define apply_reloc_n(n_, p_, d_) \ + do { \ + s32 v = *(s##n_ *)(p_); \ + v += (d_); \ + BUG_ON((v >> 31) != (v >> (n_-1))); \ + *(s##n_ *)(p_) = (s##n_)v; \ + } while (0) + + +static __always_inline +void apply_reloc(int n, void *ptr, uintptr_t diff) +{ + switch (n) { + case 1: apply_reloc_n(8, ptr, diff); break; + case 2: apply_reloc_n(16, ptr, diff); break; + case 4: apply_reloc_n(32, ptr, diff); break; + default: BUG(); + } +} +static __always_inline +bool need_reloc(unsigned long offset, u8 *src, size_t src_len) +{ + u8 *target = src + offset; /* - * Jump over the non-NOP insns and optimize single-byte NOPs into bigger - * ones. + * If the target is inside the patched block, it's relative to the + * block itself and does not need relocation. */ - for (;;) { - if (insn_decode_kernel(&insn, &instr[i])) - return; + return (target < src || target > src + src_len); +} - /* - * See if this and any potentially following NOPs can be - * optimized. - */ - if (insn.length == 1 && insn.opcode.bytes[0] == 0x90) - i += optimize_nops_range(instr, len, i); - else - i += insn.length; +static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +{ + for (int next, i = 0; i < instrlen; i = next) { + struct insn insn; - if (i >= len) + if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i]))) return; + + next = i + insn.length; + + switch (insn.opcode.bytes[0]) { + case 0x0f: + if (insn.opcode.bytes[1] < 0x80 || + insn.opcode.bytes[1] > 0x8f) + break; + + fallthrough; /* Jcc.d32 */ + case 0x70 ... 0x7f: /* Jcc.d8 */ + case JMP8_INSN_OPCODE: + case JMP32_INSN_OPCODE: + case CALL_INSN_OPCODE: + if (need_reloc(next + insn.immediate.value, repl, repl_len)) { + apply_reloc(insn.immediate.nbytes, + buf + i + insn_offset_immediate(&insn), + repl - instr); + } + + /* + * Where possible, convert JMP.d32 into JMP.d8. + */ + if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) { + s32 imm = insn.immediate.value; + imm += repl - instr; + imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE; + if ((imm >> 31) == (imm >> 7)) { + buf[i+0] = JMP8_INSN_OPCODE; + buf[i+1] = (s8)imm; + + memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2); + } + } + break; + } + + if (insn_rip_relative(&insn)) { + if (need_reloc(next + insn.displacement.value, repl, repl_len)) { + apply_reloc(insn.displacement.nbytes, + buf + i + insn_offset_displacement(&insn), + repl - instr); + } + } } } +void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +{ + __apply_relocation(buf, instr, instrlen, repl, repl_len); + optimize_nops(instr, buf, instrlen); +} + +/* Low-level backend functions usable from alternative code replacements. */ +DEFINE_ASM_FUNC(nop_func, "", .entry.text); +EXPORT_SYMBOL_GPL(nop_func); + +noinstr void BUG_func(void) +{ + BUG(); +} +EXPORT_SYMBOL(BUG_func); + +#define CALL_RIP_REL_OPCODE 0xff +#define CALL_RIP_REL_MODRM 0x15 + +/* + * Rewrite the "call BUG_func" replacement to point to the target of the + * indirect pv_ops call "call *disp(%ip)". + */ +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) +{ + void *target, *bug = &BUG_func; + s32 disp; + + if (a->replacementlen != 5 || insn_buff[0] != CALL_INSN_OPCODE) { + pr_err("ALT_FLAG_DIRECT_CALL set for a non-call replacement instruction\n"); + BUG(); + } + + if (a->instrlen != 6 || + instr[0] != CALL_RIP_REL_OPCODE || + instr[1] != CALL_RIP_REL_MODRM) { + pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); + BUG(); + } + + /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ + disp = *(s32 *)(instr + 2); +#ifdef CONFIG_X86_64 + /* ff 15 00 00 00 00 call *0x0(%rip) */ + /* target address is stored at "next instruction + disp". */ + target = *(void **)(instr + a->instrlen + disp); +#else + /* ff 15 00 00 00 00 call *0x0 */ + /* target address is stored at disp. */ + target = *(void **)disp; +#endif + if (!target) + target = bug; + + /* (BUG_func - .) + (target - BUG_func) := target - . */ + *(s32 *)(insn_buff + 1) += target - bug; + + if (target == &nop_func) + return 0; + + return 5; +} + +static inline u8 * instr_va(struct alt_instr *i) +{ + return (u8 *)&i->instr_offset + i->instr_offset; +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. @@ -265,11 +618,22 @@ static void __init_or_module noinline optimize_nops(u8 *instr, size_t len) void __init_or_module noinline apply_alternatives(struct alt_instr *start, struct alt_instr *end) { - struct alt_instr *a; - u8 *instr, *replacement; u8 insn_buff[MAX_PATCH_LEN]; + u8 *instr, *replacement; + struct alt_instr *a, *b; + + DPRINTK(ALT, "alt table %px, -> %px", start, end); + + /* + * KASAN_SHADOW_START is defined using + * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. + * During the process, KASAN becomes confused seeing partial LA57 + * conversion and triggers a false-positive out-of-bound report. + * + * Disable KASAN until the patching is complete. + */ + kasan_disable_current(); - DPRINTK("alt table %px, -> %px", start, end); /* * The scan order should be from start to end. A later scanned * alternative code can overwrite previously scanned alternative code. @@ -281,65 +645,73 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; - /* Mask away "NOT" flag bit for feature to test. */ - u16 feature = a->cpuid & ~ALTINSTR_FLAG_INV; - instr = (u8 *)&a->instr_offset + a->instr_offset; + /* + * In case of nested ALTERNATIVE()s the outer alternative might + * add more padding. To ensure consistent patching find the max + * padding for all alt_instr entries for this site (nested + * alternatives result in consecutive entries). + */ + for (b = a+1; b < end && instr_va(b) == instr_va(a); b++) { + u8 len = max(a->instrlen, b->instrlen); + a->instrlen = b->instrlen = len; + } + + instr = instr_va(a); replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insn_buff)); - BUG_ON(feature >= (NCAPINTS + NBUGINTS) * 32); + BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); /* * Patch if either: * - feature is present - * - feature not present but ALTINSTR_FLAG_INV is set to mean, + * - feature not present but ALT_FLAG_NOT is set to mean, * patch if feature is *NOT* present. */ - if (!boot_cpu_has(feature) == !(a->cpuid & ALTINSTR_FLAG_INV)) - goto next; + if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { + memcpy(insn_buff, instr, a->instrlen); + optimize_nops(instr, insn_buff, a->instrlen); + text_poke_early(instr, insn_buff, a->instrlen); + continue; + } - DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)", - (a->cpuid & ALTINSTR_FLAG_INV) ? "!" : "", - feature >> 5, - feature & 0x1f, + DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x", + a->cpuid >> 5, + a->cpuid & 0x1f, instr, instr, a->instrlen, - replacement, a->replacementlen); - - DUMP_BYTES(instr, a->instrlen, "%px: old_insn: ", instr); - DUMP_BYTES(replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + replacement, a->replacementlen, a->flags); memcpy(insn_buff, replacement, a->replacementlen); insn_buff_sz = a->replacementlen; - /* - * 0xe8 is a relative jump; fix the offset. - * - * Instruction length is checked before the opcode to avoid - * accessing uninitialized bytes for zero-length replacements. - */ - if (a->replacementlen == 5 && *insn_buff == 0xe8) { - *(s32 *)(insn_buff + 1) += replacement - instr; - DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", - *(s32 *)(insn_buff + 1), - (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5); + if (a->flags & ALT_FLAG_DIRECT_CALL) { + insn_buff_sz = alt_replace_call(instr, insn_buff, a); + if (insn_buff_sz < 0) + continue; } - if (a->replacementlen && is_jmp(replacement[0])) - recompute_jump(a, instr, replacement, insn_buff); - for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr); + text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - text_poke_early(instr, insn_buff, insn_buff_sz); + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); + DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); -next: - optimize_nops(instr, a->instrlen); + text_poke_early(instr, insn_buff, insn_buff_sz); } + + kasan_enable_current(); } -#if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL) +static inline bool is_jcc32(struct insn *insn) +{ + /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ + return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80; +} + +#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_OBJTOOL) /* * CALL/JMP *%\reg @@ -377,6 +749,93 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } +static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, + void *call_dest, void *jmp_dest) +{ + u8 op = insn->opcode.bytes[0]; + int i = 0; + + /* + * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional + * tail-calls. Deal with them. + */ + if (is_jcc32(insn)) { + bytes[i++] = op; + op = insn->opcode.bytes[1]; + goto clang_jcc; + } + + if (insn->length == 6) + bytes[i++] = 0x2e; /* CS-prefix */ + + switch (op) { + case CALL_INSN_OPCODE: + __text_gen_insn(bytes+i, op, addr+i, + call_dest, + CALL_INSN_SIZE); + i += CALL_INSN_SIZE; + break; + + case JMP32_INSN_OPCODE: +clang_jcc: + __text_gen_insn(bytes+i, op, addr+i, + jmp_dest, + JMP32_INSN_SIZE); + i += JMP32_INSN_SIZE; + break; + + default: + WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); + return -1; + } + + WARN_ON_ONCE(i != insn->length); + + return i; +} + +static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + return __emit_trampoline(addr, insn, bytes, + __x86_indirect_call_thunk_array[reg], + __x86_indirect_jump_thunk_array[reg]); +} + +#ifdef CONFIG_MITIGATION_ITS +static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + u8 *tmp = its_allocate_thunk(reg); + + if (tmp) + thunk = tmp; + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + +/* Check if an indirect branch is at ITS-unsafe address */ +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return false; + + /* Indirect branch opcode is 2 or 3 bytes depending on reg */ + addr += 1 + reg / 8; + + /* Lower-half of the cacheline? */ + return !(addr & 0x20); +} +#else /* CONFIG_MITIGATION_ITS */ + +#ifdef CONFIG_FINEIBT +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + return false; +} +#endif + +#endif /* CONFIG_MITIGATION_ITS */ + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -409,8 +868,12 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) BUG_ON(reg == 4); if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) && - !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) + !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH)) + return emit_call_track_retpoline(addr, insn, reg, bytes); + return -1; + } op = insn->opcode.bytes[0]; @@ -427,8 +890,7 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) * [ NOP ] * 1: */ - /* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */ - if (op == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80) { + if (is_jcc32(insn)) { cc = insn->opcode.bytes[1] & 0xf; cc ^= 1; /* invert condition */ @@ -448,11 +910,29 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) bytes[i++] = 0xe8; /* LFENCE */ } +#ifdef CONFIG_MITIGATION_ITS + /* + * Check if the address of last byte of emitted-indirect is in + * lower-half of the cacheline. Such branches need ITS mitigation. + */ + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) + return emit_its_trampoline(addr, insn, reg, bytes); +#endif + ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; i += ret; + /* + * The compiler is supposed to EMIT an INT3 after every unconditional + * JMP instruction due to AMD BTC. However, if the compiler is too old + * or MITIGATION_SLS isn't enabled, we still need an INT3 after + * indirect JMPs even on Intel. + */ + if (op == JMP32_INSN_OPCODE && i < insn->length) + bytes[i++] = INT3_INSN_OPCODE; + for (; i < insn->length;) bytes[i++] = BYTES_NOP1; @@ -472,6 +952,7 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) int len, ret; u8 bytes[16]; u8 op1, op2; + u8 *dest; ret = insn_decode_kernel(&insn, addr); if (WARN_ON_ONCE(ret < 0)) @@ -481,8 +962,19 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) op2 = insn.opcode.bytes[1]; switch (op1) { + case 0x70 ... 0x7f: /* Jcc.d8 */ + /* See cfi_paranoid. */ + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: + /* Check for cfi_paranoid + ITS */ + dest = addr + insn.length + insn.immediate.value; + if (dest[-1] == 0xea && (dest[0] & 0xf0) == 0x70) { + WARN_ON_ONCE(cfi_mode != CFI_FINEIBT); + continue; + } break; case 0x0f: /* escape */ @@ -494,21 +986,37 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) continue; } - DPRINTK("retpoline at: %pS (%px) len: %d to: %pS", + DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { - optimize_nops(bytes, len); - DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); - DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + optimize_nops(addr, bytes, len); + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } } -#ifdef CONFIG_RETHUNK +#ifdef CONFIG_MITIGATION_RETHUNK + +bool cpu_wants_rethunk(void) +{ + return cpu_feature_enabled(X86_FEATURE_RETHUNK); +} + +bool cpu_wants_rethunk_at(void *addr) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return false; + if (x86_return_thunk != its_return_thunk) + return true; + + return !((unsigned long)addr & 0x20); +} + /* * Rewrite the compiler generated return thunk tail-calls. * @@ -524,14 +1032,17 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) - return -1; - - bytes[i++] = RET_INSN_OPCODE; + /* Patch the custom return thunks... */ + if (cpu_wants_rethunk_at(addr)) { + i = JMP32_INSN_SIZE; + __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); + } else { + /* ... or patch them out if not needed. */ + bytes[i++] = RET_INSN_OPCODE; + } for (; i < insn->length;) bytes[i++] = INT3_INSN_OPCODE; - return i; } @@ -539,6 +1050,9 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; + if (cpu_wants_rethunk()) + static_call_force_reinit(); + for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; @@ -560,64 +1074,990 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) addr, dest, 5, addr)) continue; - DPRINTK("return thunk at: %pS (%px) len: %d to: %pS", + DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS", addr, addr, insn.length, addr + insn.length + insn.immediate.value); len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(((u8*)addr), len, "%px: orig: ", addr); - DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr); + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); text_poke_early(addr, bytes, len); } } } -#else +#else /* !CONFIG_MITIGATION_RETHUNK: */ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_RETHUNK */ +#endif /* !CONFIG_MITIGATION_RETHUNK */ -#else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */ +#else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } -#endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */ +#endif /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT +__noendbr bool is_endbr(u32 *val) +{ + u32 endbr; + + __get_kernel_nofault(&endbr, val, u32, Efault); + return __is_endbr(endbr); + +Efault: + return false; +} + +#ifdef CONFIG_FINEIBT + +static __noendbr bool exact_endbr(u32 *val) +{ + u32 endbr; + + __get_kernel_nofault(&endbr, val, u32, Efault); + return endbr == gen_endbr(); + +Efault: + return false; +} + +#endif + +static void poison_cfi(void *addr); + +static void __init_or_module poison_endbr(void *addr) +{ + u32 poison = gen_endbr_poison(); + + if (WARN_ON_ONCE(!is_endbr(addr))) + return; + + DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr); + + /* + * When we have IBT, the lack of ENDBR will trigger #CP + */ + DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); + DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); + text_poke_early(addr, &poison, 4); +} + /* * Generated by: objtool --ibt + * + * Seal the functions for indirect calls by clobbering the ENDBR instructions + * and the kCFI hash value. */ -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + + poison_endbr(addr); + if (IS_ENABLED(CONFIG_FINEIBT)) + poison_cfi(addr - 16); + } +} + +#else /* !CONFIG_X86_KERNEL_IBT: */ + +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } + +#endif /* !CONFIG_X86_KERNEL_IBT */ + +#ifdef CONFIG_CFI_AUTO_DEFAULT +# define __CFI_DEFAULT CFI_AUTO +#elif defined(CONFIG_CFI_CLANG) +# define __CFI_DEFAULT CFI_KCFI +#else +# define __CFI_DEFAULT CFI_OFF +#endif + +enum cfi_mode cfi_mode __ro_after_init = __CFI_DEFAULT; + +#ifdef CONFIG_FINEIBT_BHI +bool cfi_bhi __ro_after_init = false; +#endif + +#ifdef CONFIG_CFI_CLANG +struct bpf_insn; + +/* Must match bpf_func_t / DEFINE_BPF_PROG_RUN() */ +extern unsigned int __bpf_prog_runX(const void *ctx, + const struct bpf_insn *insn); + +KCFI_REFERENCE(__bpf_prog_runX); + +/* u32 __ro_after_init cfi_bpf_hash = __kcfi_typeid___bpf_prog_runX; */ +asm ( +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" +" .type cfi_bpf_hash,@object \n" +" .globl cfi_bpf_hash \n" +" .p2align 2, 0x0 \n" +"cfi_bpf_hash: \n" +" .long __kcfi_typeid___bpf_prog_runX \n" +" .size cfi_bpf_hash, 4 \n" +" .popsection \n" +); + +/* Must match bpf_callback_t */ +extern u64 __bpf_callback_fn(u64, u64, u64, u64, u64); + +KCFI_REFERENCE(__bpf_callback_fn); + +/* u32 __ro_after_init cfi_bpf_subprog_hash = __kcfi_typeid___bpf_callback_fn; */ +asm ( +" .pushsection .data..ro_after_init,\"aw\",@progbits \n" +" .type cfi_bpf_subprog_hash,@object \n" +" .globl cfi_bpf_subprog_hash \n" +" .p2align 2, 0x0 \n" +"cfi_bpf_subprog_hash: \n" +" .long __kcfi_typeid___bpf_callback_fn \n" +" .size cfi_bpf_subprog_hash, 4 \n" +" .popsection \n" +); + +u32 cfi_get_func_hash(void *func) +{ + u32 hash; + + func -= cfi_get_offset(); + switch (cfi_mode) { + case CFI_FINEIBT: + func += 7; + break; + case CFI_KCFI: + func += 1; + break; + default: + return 0; + } + + if (get_kernel_nofault(hash, func)) + return 0; + + return hash; +} + +int cfi_get_func_arity(void *func) +{ + bhi_thunk *target; + s32 disp; + + if (cfi_mode != CFI_FINEIBT && !cfi_bhi) + return 0; + + if (get_kernel_nofault(disp, func - 4)) + return 0; + + target = func + disp; + return target - __bhi_args; +} +#endif + +#ifdef CONFIG_FINEIBT + +static bool cfi_rand __ro_after_init = true; +static u32 cfi_seed __ro_after_init; + +/* + * Re-hash the CFI hash with a boot-time seed while making sure the result is + * not a valid ENDBR instruction. + */ +static u32 cfi_rehash(u32 hash) +{ + hash ^= cfi_seed; + while (unlikely(__is_endbr(hash) || __is_endbr(-hash))) { + bool lsb = hash & 1; + hash >>= 1; + if (lsb) + hash ^= 0x80200003; + } + return hash; +} + +static __init int cfi_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + while (str) { + char *next = strchr(str, ','); + if (next) { + *next = 0; + next++; + } + + if (!strcmp(str, "auto")) { + cfi_mode = CFI_AUTO; + } else if (!strcmp(str, "off")) { + cfi_mode = CFI_OFF; + cfi_rand = false; + } else if (!strcmp(str, "kcfi")) { + cfi_mode = CFI_KCFI; + } else if (!strcmp(str, "fineibt")) { + cfi_mode = CFI_FINEIBT; + } else if (!strcmp(str, "norand")) { + cfi_rand = false; + } else if (!strcmp(str, "warn")) { + pr_alert("CFI mismatch non-fatal!\n"); + cfi_warn = true; + } else if (!strcmp(str, "paranoid")) { + if (cfi_mode == CFI_FINEIBT) { + cfi_paranoid = true; + } else { + pr_err("Ignoring paranoid; depends on fineibt.\n"); + } + } else if (!strcmp(str, "bhi")) { +#ifdef CONFIG_FINEIBT_BHI + if (cfi_mode == CFI_FINEIBT) { + cfi_bhi = true; + } else { + pr_err("Ignoring bhi; depends on fineibt.\n"); + } +#else + pr_err("Ignoring bhi; depends on FINEIBT_BHI=y.\n"); +#endif + } else { + pr_err("Ignoring unknown cfi option (%s).", str); + } + + str = next; + } + + return 0; +} +early_param("cfi", cfi_parse_cmdline); + +/* + * kCFI FineIBT + * + * __cfi_\func: __cfi_\func: + * movl $0x12345678,%eax // 5 endbr64 // 4 + * nop subl $0x12345678,%r10d // 7 + * nop jne __cfi_\func+6 // 2 + * nop nop3 // 3 + * nop + * nop + * nop + * nop + * nop + * nop + * nop + * nop + * + * + * caller: caller: + * movl $(-0x12345678),%r10d // 6 movl $0x12345678,%r10d // 6 + * addl $-15(%r11),%r10d // 4 lea -0x10(%r11),%r11 // 4 + * je 1f // 2 nop4 // 4 + * ud2 // 2 + * 1: cs call __x86_indirect_thunk_r11 // 6 call *%r11; nop3; // 6 + * + */ + +/* + * <fineibt_preamble_start>: + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 <ea> 78 56 34 12 sub $0x12345678, %r10d + * b: 75 f9 jne 6 <fineibt_preamble_start+0x6> + * d: 0f 1f 00 nopl (%rax) + * + * Note that the JNE target is the 0xEA byte inside the SUB, this decodes as + * (bad) on x86_64 and raises #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_preamble_start: \n" + " endbr64 \n" + " subl $0x12345678, %r10d \n" + "fineibt_preamble_bhi: \n" + " jne fineibt_preamble_start+6 \n" + ASM_NOP3 + "fineibt_preamble_end: \n" + ".popsection\n" +); + +extern u8 fineibt_preamble_start[]; +extern u8 fineibt_preamble_bhi[]; +extern u8 fineibt_preamble_end[]; + +#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start) +#define fineibt_preamble_bhi (fineibt_preamble_bhi - fineibt_preamble_start) +#define fineibt_preamble_ud 6 +#define fineibt_preamble_hash 7 + +/* + * <fineibt_caller_start>: + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * a: 0f 1f 40 00 nopl 0x0(%rax) + */ +asm( ".pushsection .rodata \n" + "fineibt_caller_start: \n" + " movl $0x12345678, %r10d \n" + " lea -0x10(%r11), %r11 \n" + ASM_NOP4 + "fineibt_caller_end: \n" + ".popsection \n" +); + +extern u8 fineibt_caller_start[]; +extern u8 fineibt_caller_end[]; + +#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start) +#define fineibt_caller_hash 2 + +#define fineibt_caller_jmp (fineibt_caller_size - 2) + +/* + * Since FineIBT does hash validation on the callee side it is prone to + * circumvention attacks where a 'naked' ENDBR instruction exists that + * is not part of the fineibt_preamble sequence. + * + * Notably the x86 entry points must be ENDBR and equally cannot be + * fineibt_preamble. + * + * The fineibt_paranoid caller sequence adds additional caller side + * hash validation. This stops such circumvention attacks dead, but at the cost + * of adding a load. + * + * <fineibt_paranoid_start>: + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b <f0> lea -0x10(%r11), %r11 + * e: 75 fd jne d <fineibt_paranoid_start+0xd> + * 10: 41 ff d3 call *%r11 + * 13: 90 nop + * + * Notably LEA does not modify flags and can be reordered with the CMP, + * avoiding a dependency. Again, using a non-taken (backwards) branch + * for the failure case, abusing LEA's immediate 0xf0 as LOCK prefix for the + * Jcc.d8, causing #UD. + */ +asm( ".pushsection .rodata \n" + "fineibt_paranoid_start: \n" + " movl $0x12345678, %r10d \n" + " cmpl -9(%r11), %r10d \n" + " lea -0x10(%r11), %r11 \n" + " jne fineibt_paranoid_start+0xd \n" + "fineibt_paranoid_ind: \n" + " call *%r11 \n" + " nop \n" + "fineibt_paranoid_end: \n" + ".popsection \n" +); + +extern u8 fineibt_paranoid_start[]; +extern u8 fineibt_paranoid_ind[]; +extern u8 fineibt_paranoid_end[]; + +#define fineibt_paranoid_size (fineibt_paranoid_end - fineibt_paranoid_start) +#define fineibt_paranoid_ind (fineibt_paranoid_ind - fineibt_paranoid_start) +#define fineibt_paranoid_ud 0xd + +static u32 decode_preamble_hash(void *addr, int *reg) +{ + u8 *p = addr; + + /* b8+reg 78 56 34 12 movl $0x12345678,\reg */ + if (p[0] >= 0xb8 && p[0] < 0xc0) { + if (reg) + *reg = p[0] - 0xb8; + return *(u32 *)(addr + 1); + } + + return 0; /* invalid hash value */ +} + +static u32 decode_caller_hash(void *addr) +{ + u8 *p = addr; + + /* 41 ba 88 a9 cb ed mov $(-0x12345678),%r10d */ + if (p[0] == 0x41 && p[1] == 0xba) + return -*(u32 *)(addr + 2); + + /* e8 0c 88 a9 cb ed jmp.d8 +12 */ + if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp) + return -*(u32 *)(addr + 2); + + return 0; /* invalid hash value */ +} + +/* .retpoline_sites */ +static int cfi_disable_callers(s32 *start, s32 *end) { + /* + * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate + * in tact for later usage. Also see decode_caller_hash() and + * cfi_rewrite_callers(). + */ + const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp }; s32 *s; for (s = start; s < end; s++) { - u32 endbr, poison = gen_endbr_poison(); void *addr = (void *)s + *s; + u32 hash; - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - if (WARN_ON_ONCE(!is_endbr(endbr))) + text_poke_early(addr, jmp, 2); + } + + return 0; +} + +static int cfi_enable_callers(s32 *start, s32 *end) +{ + /* + * Re-enable kCFI, undo what cfi_disable_callers() did. + */ + const u8 mov[] = { 0x41, 0xba }; + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) /* nocfi callers */ continue; - DPRINTK("ENDBR at: %pS (%px)", addr, addr); + text_poke_early(addr, mov, 2); + } + + return 0; +} + +/* .cfi_sites */ +static int cfi_rand_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + hash = decode_preamble_hash(addr, NULL); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + hash = cfi_rehash(hash); + text_poke_early(addr + 1, &hash, 4); + } + + return 0; +} +static void cfi_fineibt_bhi_preamble(void *addr, int arity) +{ + if (!arity) + return; + + if (!cfi_warn && arity == 1) { /* - * When we have IBT, the lack of ENDBR will trigger #CP + * Crazy scheme to allow arity-1 inline: + * + * __cfi_foo: + * 0: f3 0f 1e fa endbr64 + * 4: 41 81 <ea> 78 56 34 12 sub 0x12345678, %r10d + * b: 49 0f 45 fa cmovne %r10, %rdi + * f: 75 f5 jne __cfi_foo+6 + * 11: 0f 1f 00 nopl (%rax) + * + * Code that direct calls to foo()+0, decodes the tail end as: + * + * foo: + * 0: f5 cmc + * 1: 0f 1f 00 nopl (%rax) + * + * which clobbers CF, but does not affect anything ABI + * wise. + * + * Notably, this scheme is incompatible with permissive CFI + * because the CMOVcc is unconditional and RDI will have been + * clobbered. */ - DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr); - DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + const u8 magic[9] = { + 0x49, 0x0f, 0x45, 0xfa, + 0x75, 0xf5, + BYTES_NOP3, + }; + + text_poke_early(addr + fineibt_preamble_bhi, magic, 9); + + return; } + + text_poke_early(addr + fineibt_preamble_bhi, + text_gen_insn(CALL_INSN_OPCODE, + addr + fineibt_preamble_bhi, + __bhi_args[arity]), + CALL_INSN_SIZE); } -#else +static int cfi_rewrite_preamble(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + int arity; + u32 hash; + + /* + * When the function doesn't start with ENDBR the compiler will + * have determined there are no indirect calls to it and we + * don't need no CFI either. + */ + if (!is_endbr(addr + 16)) + continue; + + hash = decode_preamble_hash(addr, &arity); + if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", + addr, addr, 5, addr)) + return -EINVAL; + + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + + WARN_ONCE(!IS_ENABLED(CONFIG_FINEIBT_BHI) && arity, + "kCFI preamble has wrong register at: %pS %*ph\n", + addr, 5, addr); + + if (cfi_bhi) + cfi_fineibt_bhi_preamble(addr, arity); + } + + return 0; +} + +static void cfi_rewrite_endbr(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + + if (!exact_endbr(addr + 16)) + continue; + + poison_endbr(addr + 16); + } +} + +/* .retpoline_sites */ +static int cfi_rand_callers(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + u32 hash; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (hash) { + hash = -cfi_rehash(hash); + text_poke_early(addr + 2, &hash, 4); + } + } + + return 0; +} + +static int emit_paranoid_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + u8 *thunk = (void *)__x86_indirect_its_thunk_array[reg] - 2; + +#ifdef CONFIG_MITIGATION_ITS + u8 *tmp = its_allocate_thunk(reg); + if (tmp) + thunk = tmp; +#endif + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); +} + +static int cfi_rewrite_callers(s32 *start, s32 *end) +{ + s32 *s; + + BUG_ON(fineibt_paranoid_size != 20); + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + struct insn insn; + u8 bytes[20]; + u32 hash; + int ret; + u8 op; + + addr -= fineibt_caller_size; + hash = decode_caller_hash(addr); + if (!hash) + continue; + + if (!cfi_paranoid) { + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(addr + fineibt_caller_hash, &hash, 4); + /* rely on apply_retpolines() */ + continue; + } + + /* cfi_paranoid */ + ret = insn_decode_kernel(&insn, addr + fineibt_caller_size); + if (WARN_ON_ONCE(ret < 0)) + continue; + + op = insn.opcode.bytes[0]; + if (op != CALL_INSN_OPCODE && op != JMP32_INSN_OPCODE) { + WARN_ON_ONCE(1); + continue; + } + + memcpy(bytes, fineibt_paranoid_start, fineibt_paranoid_size); + memcpy(bytes + fineibt_caller_hash, &hash, 4); + + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + fineibt_paranoid_ind, 11)) { + emit_paranoid_trampoline(addr + fineibt_caller_size, + &insn, 11, bytes + fineibt_caller_size); + } else { + ret = emit_indirect(op, 11, bytes + fineibt_paranoid_ind); + if (WARN_ON_ONCE(ret != 3)) + continue; + } + + text_poke_early(addr, bytes, fineibt_paranoid_size); + } + + return 0; +} + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ + int ret; + + if (WARN_ONCE(fineibt_preamble_size != 16, + "FineIBT preamble wrong size: %ld", fineibt_preamble_size)) + return; + + if (cfi_mode == CFI_AUTO) { + cfi_mode = CFI_KCFI; + if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT)) { + /* + * FRED has much saner context on exception entry and + * is less easy to take advantage of. + */ + if (!cpu_feature_enabled(X86_FEATURE_FRED)) + cfi_paranoid = true; + cfi_mode = CFI_FINEIBT; + } + } + + /* + * Rewrite the callers to not use the __cfi_ stubs, such that we might + * rewrite them. This disables all CFI. If this succeeds but any of the + * later stages fails, we're without CFI. + */ + ret = cfi_disable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + if (cfi_rand) { + if (builtin) { + cfi_seed = get_random_u32(); + cfi_bpf_hash = cfi_rehash(cfi_bpf_hash); + cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); + } + + ret = cfi_rand_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + ret = cfi_rand_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + } + + switch (cfi_mode) { + case CFI_OFF: + if (builtin) + pr_info("Disabling CFI\n"); + return; -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) { } + case CFI_KCFI: + ret = cfi_enable_callers(start_retpoline, end_retpoline); + if (ret) + goto err; -#endif /* CONFIG_X86_KERNEL_IBT */ + if (builtin) + pr_info("Using kCFI\n"); + return; + + case CFI_FINEIBT: + /* place the FineIBT preamble at func()-16 */ + ret = cfi_rewrite_preamble(start_cfi, end_cfi); + if (ret) + goto err; + + /* rewrite the callers to target func()-16 */ + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + if (ret) + goto err; + + /* now that nobody targets func()+0, remove ENDBR there */ + cfi_rewrite_endbr(start_cfi, end_cfi); + + if (builtin) { + pr_info("Using %sFineIBT%s CFI\n", + cfi_paranoid ? "paranoid " : "", + cfi_bhi ? "+BHI" : ""); + } + return; + + default: + break; + } + +err: + pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); +} + +static inline void poison_hash(void *addr) +{ + *(u32 *)addr = 0; +} + +static void poison_cfi(void *addr) +{ + /* + * Compilers manage to be inconsistent with ENDBR vs __cfi prefixes, + * some (static) functions for which they can determine the address + * is never taken do not get a __cfi prefix, but *DO* get an ENDBR. + * + * As such, these functions will get sealed, but we need to be careful + * to not unconditionally scribble the previous function. + */ + switch (cfi_mode) { + case CFI_FINEIBT: + /* + * FineIBT prefix should start with an ENDBR. + */ + if (!is_endbr(addr)) + break; + + /* + * __cfi_\func: + * osp nopl (%rax) + * subl $0, %r10d + * jz 1f + * ud2 + * 1: nop + */ + poison_endbr(addr); + poison_hash(addr + fineibt_preamble_hash); + break; + + case CFI_KCFI: + /* + * kCFI prefix should start with a valid hash. + */ + if (!decode_preamble_hash(addr, NULL)) + break; + + /* + * __cfi_\func: + * movl $0, %eax + * .skip 11, 0x90 + */ + poison_hash(addr + 1); + break; + + default: + break; + } +} + +/* + * When regs->ip points to a 0xEA byte in the FineIBT preamble, + * return true and fill out target and type. + * + * We check the preamble by checking for the ENDBR instruction relative to the + * 0xEA instruction. + */ +static bool decode_fineibt_preamble(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_preamble_ud; + u32 hash; + + if (!exact_endbr((void *)addr)) + return false; + + *target = addr + fineibt_preamble_size; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * Since regs->ip points to the middle of an instruction; it cannot + * continue with the normal fixup. + */ + regs->ip = *target; + + return true; + +Efault: + return false; +} + +/* + * regs->ip points to one of the UD2 in __bhi_args[]. + */ +static bool decode_fineibt_bhi(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr; + u32 hash; + + if (!cfi_bhi) + return false; + + if (regs->ip < (unsigned long)__bhi_args || + regs->ip >= (unsigned long)__bhi_args_end) + return false; + + /* + * Fetch the return address from the stack, this points to the + * FineIBT preamble. Since the CALL instruction is in the 5 last + * bytes of the preamble, the return address is in fact the target + * address. + */ + __get_kernel_nofault(&addr, regs->sp, unsigned long, Efault); + *target = addr; + + addr -= fineibt_preamble_size; + if (!exact_endbr((void *)addr)) + return false; + + __get_kernel_nofault(&hash, addr + fineibt_preamble_hash, u32, Efault); + *type = (u32)regs->r10 + hash; + + /* + * The UD2 sites are constructed with a RET immediately following, + * as such the non-fatal case can use the regular fixup. + */ + return true; + +Efault: + return false; +} + +static bool is_paranoid_thunk(unsigned long addr) +{ + u32 thunk; + + __get_kernel_nofault(&thunk, (u32 *)addr, u32, Efault); + return (thunk & 0x00FFFFFF) == 0xfd75ea; + +Efault: + return false; +} + +/* + * regs->ip points to a LOCK Jcc.d8 instruction from the fineibt_paranoid_start[] + * sequence, or to an invalid instruction (0xea) + Jcc.d8 for cfi_paranoid + ITS + * thunk. + */ +static bool decode_fineibt_paranoid(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + unsigned long addr = regs->ip - fineibt_paranoid_ud; + + if (!cfi_paranoid) + return false; + + if (is_cfi_trap(addr + fineibt_caller_size - LEN_UD2)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + /* + * Since the trapping instruction is the exact, but LOCK prefixed, + * Jcc.d8 that got us here, the normal fixup will work. + */ + return true; + } + + /* + * The cfi_paranoid + ITS thunk combination results in: + * + * 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d + * 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d + * a: 4d 8d 5b f0 lea -0x10(%r11), %r11 + * e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 + * + * Where the paranoid_thunk looks like: + * + * 1d: <ea> (bad) + * __x86_indirect_paranoid_thunk_r11: + * 1e: 75 fd jne 1d + * __x86_indirect_its_thunk_r11: + * 20: 41 ff eb jmp *%r11 + * 23: cc int3 + * + */ + if (is_paranoid_thunk(regs->ip)) { + *target = regs->r11 + fineibt_preamble_size; + *type = regs->r10; + + regs->ip = *target; + return true; + } + + return false; +} + +bool decode_fineibt_insn(struct pt_regs *regs, unsigned long *target, u32 *type) +{ + if (decode_fineibt_paranoid(regs, target, type)) + return true; + + if (decode_fineibt_bhi(regs, target, type)) + return true; + + return decode_fineibt_preamble(regs, target, type); +} + +#else /* !CONFIG_FINEIBT: */ + +static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi, bool builtin) +{ +} + +#ifdef CONFIG_X86_KERNEL_IBT +static void poison_cfi(void *addr) { } +#endif + +#endif /* !CONFIG_FINEIBT */ + +void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, + s32 *start_cfi, s32 *end_cfi) +{ + return __apply_fineibt(start_retpoline, end_retpoline, + start_cfi, end_cfi, + /* .builtin = */ false); +} #ifdef CONFIG_SMP static void alternatives_smp_lock(const s32 *start, const s32 *end, @@ -696,7 +2136,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n", smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); @@ -772,32 +2212,6 @@ int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#ifdef CONFIG_PARAVIRT -void __init_or_module apply_paravirt(struct paravirt_patch_site *start, - struct paravirt_patch_site *end) -{ - struct paravirt_patch_site *p; - char insn_buff[MAX_PATCH_LEN]; - - for (p = start; p < end; p++) { - unsigned int used; - - BUG_ON(p->len > MAX_PATCH_LEN); - /* prep the buffer with the original instructions */ - memcpy(insn_buff, p->instr, p->len); - used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len); - - BUG_ON(used > p->len); - - /* Pad the rest with nops */ - add_nops(insn_buff + used, p->len - used); - text_poke_early(p->instr, insn_buff, p->len); - } -} -extern struct paravirt_patch_site __start_parainstructions[], - __stop_parainstructions[]; -#endif /* CONFIG_PARAVIRT */ - /* * Self-test for the INT3 based CALL emulation code. * @@ -880,8 +2294,40 @@ static noinline void __init int3_selftest(void) unregister_die_notifier(&int3_exception_nb); } +static __initdata int __alt_reloc_selftest_addr; + +extern void __init __alt_reloc_selftest(void *arg); +__visible noinline void __init __alt_reloc_selftest(void *arg) +{ + WARN_ON(arg != &__alt_reloc_selftest_addr); +} + +static noinline void __init alt_reloc_selftest(void) +{ + /* + * Tests text_poke_apply_relocation(). + * + * This has a relative immediate (CALL) in a place other than the first + * instruction and additionally on x86_64 we get a RIP-relative LEA: + * + * lea 0x0(%rip),%rdi # 5d0: R_X86_64_PC32 .init.data+0x5566c + * call +0 # 5d5: R_X86_64_PLT32 __alt_reloc_selftest-0x4 + * + * Getting this wrong will either crash and burn or tickle the WARN + * above. + */ + asm_inline volatile ( + ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS) + : ASM_CALL_CONSTRAINT + : [mem] "m" (__alt_reloc_selftest_addr) + : _ASM_ARG1 + ); +} + void __init alternative_instructions(void) { + u64 ibt; + int3_selftest(); /* @@ -903,27 +2349,16 @@ void __init alternative_instructions(void) */ /* - * Paravirt patching and alternative patching can be combined to - * replace a function call with a short direct code sequence (e.g. - * by setting a constant return value instead of doing that in an - * external function). - * In order to make this work the following sequence is required: - * 1. set (artificial) features depending on used paravirt - * functions which can later influence alternative patching - * 2. apply paravirt patching (generally replacing an indirect - * function call with a direct one) - * 3. apply alternative patching (e.g. replacing a direct function - * call with a custom code sequence) - * Doing paravirt patching after alternative patching would clobber - * the optimization of the custom code with a function call again. + * Make sure to set (artificial) features depending on used paravirt + * functions which can later influence alternative patching. */ paravirt_set_cap(); - /* - * First patch paravirt functions, such that we overwrite the indirect - * call with the direct call. - */ - apply_paravirt(__parainstructions, __parainstructions_end); + /* Keep CET-IBT disabled until caller/callee are patched */ + ibt = ibt_save(/*disable*/ true); + + __apply_fineibt(__retpoline_sites, __retpoline_sites_end, + __cfi_sites, __cfi_sites_end, true); /* * Rewrite the retpolines, must be done before alternatives since @@ -932,13 +2367,22 @@ void __init alternative_instructions(void) apply_retpolines(__retpoline_sites, __retpoline_sites_end); apply_returns(__return_sites, __return_sites_end); + its_fini_core(); + /* - * Then patch alternatives, such that those paravirt calls that are in - * alternatives can be overwritten by their immediate fragments. + * Adjust all CALL instructions to point to func()-10, including + * those in .altinstr_replacement. */ + callthunks_patch_builtin_calls(); + apply_alternatives(__alt_instructions, __alt_instructions_end); - apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + /* + * Seal all functions that do not have their address taken. + */ + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + + ibt_restore(ibt); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ @@ -958,6 +2402,8 @@ void __init alternative_instructions(void) restart_nmi(); alternatives_patched = 1; + + alt_reloc_selftest(); } /** @@ -988,8 +2434,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } else { local_irq_save(flags); memcpy(addr, opcode, len); - local_irq_restore(flags); sync_core(); + local_irq_restore(flags); /* * Could also do a CLFLUSH here to speed up CPU recovery; but @@ -998,72 +2444,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) -{ - temp_mm_state_t temp_state; - - lockdep_assert_irqs_disabled(); - - /* - * Make sure not to be in TLB lazy mode, as otherwise we'll end up - * with a stale address space WITHOUT being in lazy mode after - * restoring the previous mm. - */ - if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(smp_processor_id()); - - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. - */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return temp_state; -} - -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) -{ - lockdep_assert_irqs_disabled(); - switch_mm_irqs_off(NULL, prev_state.mm, current); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. - */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} - -__ro_after_init struct mm_struct *poking_mm; -__ro_after_init unsigned long poking_addr; +__ro_after_init struct mm_struct *text_poke_mm; +__ro_after_init unsigned long text_poke_mm_addr; static void text_poke_memcpy(void *dst, const void *src, size_t len) { @@ -1083,7 +2465,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; - temp_mm_state_t prev; + struct mm_struct *prev_mm; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; @@ -1120,7 +2502,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l /* * The lock is not really needed, but this allows to avoid open-coding. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); /* * This must not fail; preallocated in poking_init(). @@ -1130,21 +2512,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); pte = mk_pte(pages[0], pgprot); - set_pte_at(poking_mm, poking_addr, ptep, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); - set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. */ - prev = use_temporary_mm(poking_mm); + prev_mm = use_temporary_mm(text_poke_mm); kasan_disable_current(); - func((u8 *)poking_addr + offset_in_page(addr), src, len); + func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -1153,22 +2535,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l */ barrier(); - pte_clear(poking_mm, poking_addr, ptep); + pte_clear(text_poke_mm, text_poke_mm_addr, ptep); if (cross_page_boundary) - pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ - unuse_temporary_mm(prev); + unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ - flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); @@ -1199,7 +2581,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l * Note that the caller must ensure that if the modified code is part of a * module, the module would not be removed during poking. This can be achieved * by registering a module notifier, and ordering module removal and patching - * trough a mutex. + * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { @@ -1227,27 +2609,15 @@ void *text_poke_kgdb(void *addr, const void *opcode, size_t len) return __text_poke(text_poke_memcpy, addr, opcode, len); } -/** - * text_poke_copy - Copy instructions into (an unused part of) RX memory - * @addr: address to modify - * @opcode: source of the copy - * @len: length to copy, could be more than 2x PAGE_SIZE - * - * Not safe against concurrent execution; useful for JITs to dump - * new code blocks into unused regions of RX memory. Can be used in - * conjunction with synchronize_rcu_tasks() to wait for existing - * execution to quiesce after having made sure no existing functions - * pointers are live. - */ -void *text_poke_copy(void *addr, const void *opcode, size_t len) +void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, + bool core_ok) { unsigned long start = (unsigned long)addr; size_t patched = 0; - if (WARN_ON_ONCE(core_kernel_text(start))) + if (WARN_ON_ONCE(!core_ok && core_kernel_text(start))) return NULL; - mutex_lock(&text_mutex); while (patched < len) { unsigned long ptr = start + patched; size_t s; @@ -1257,6 +2627,25 @@ void *text_poke_copy(void *addr, const void *opcode, size_t len) __text_poke(text_poke_memcpy, (void *)ptr, opcode + patched, s); patched += s; } + return addr; +} + +/** + * text_poke_copy - Copy instructions into (an unused part of) RX memory + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy, could be more than 2x PAGE_SIZE + * + * Not safe against concurrent execution; useful for JITs to dump + * new code blocks into unused regions of RX memory. Can be used in + * conjunction with synchronize_rcu_tasks() to wait for existing + * execution to quiesce after having made sure no existing functions + * pointers are live. + */ +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + mutex_lock(&text_mutex); + addr = text_poke_copy_locked(addr, opcode, len, false); mutex_unlock(&text_mutex); return addr; } @@ -1297,68 +2686,76 @@ static void do_sync_core(void *info) sync_core(); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { on_each_cpu(do_sync_core, NULL, 1); } -struct text_poke_loc { +/* + * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of + * this thing. When len == 6 everything is prefixed with 0x0f and we map + * opcode to Jcc.d8, using len to distinguish. + */ +struct smp_text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; - /* see text_poke_bp_batch() */ + const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; + /* see smp_text_poke_batch_finish() */ u8 old; }; -struct bp_patching_desc { - struct text_poke_loc *vec; +#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) + +static struct smp_text_poke_array { + struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; int nr_entries; - atomic_t refs; -}; +} text_poke_array; -static struct bp_patching_desc *bp_desc; +static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static __always_inline -struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) +/* + * These four __always_inline annotations imply noinstr, necessary + * due to smp_text_poke_int3_handler() being noinstr: + */ + +static __always_inline bool try_get_text_poke_array(void) { - /* rcu_dereference */ - struct bp_patching_desc *desc = __READ_ONCE(*descp); + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); - if (!desc || !arch_atomic_inc_not_zero(&desc->refs)) - return NULL; + if (!raw_atomic_inc_not_zero(refs)) + return false; - return desc; + return true; } -static __always_inline void put_desc(struct bp_patching_desc *desc) +static __always_inline void put_text_poke_array(void) { + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); + smp_mb__before_atomic(); - arch_atomic_dec(&desc->refs); + raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) { - return _stext + tp->rel_addr; + return _stext + tpl->rel_addr; } -static __always_inline int patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) { - struct text_poke_loc *tp = (struct text_poke_loc *) elt; - - if (key < text_poke_addr(tp)) + if (tpl_a < text_poke_addr(tpl_b)) return -1; - if (key > text_poke_addr(tp)) + if (tpl_a > text_poke_addr(tpl_b)) return 1; return 0; } -noinstr int poke_int3_handler(struct pt_regs *regs) +noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct bp_patching_desc *desc; - struct text_poke_loc *tp; + struct smp_text_poke_loc *tpl; int ret = 0; void *ip; @@ -1367,41 +2764,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc: + * text_poke_array with non-zero refcount: * - * bp_desc = desc INT3 - * WMB RMB - * write INT3 if (desc) + * text_poke_array_refs = 1 INT3 + * WMB RMB + * write INT3 if (text_poke_array_refs != 0) */ smp_rmb(); - desc = try_get_desc(&bp_desc); - if (!desc) + if (!try_get_text_poke_array()) return 0; /* - * Discount the INT3. See text_poke_bp_batch(). + * Discount the INT3. See smp_text_poke_batch_finish(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(desc->nr_entries > 1)) { - tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), + if (unlikely(text_poke_array.nr_entries > 1)) { + tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, + sizeof(struct smp_text_poke_loc), patch_cmp); - if (!tp) + if (!tpl) goto out_put; } else { - tp = desc->vec; - if (text_poke_addr(tp) != ip) + tpl = text_poke_array.vec; + if (text_poke_addr(tpl) != ip) goto out_put; } - ip += tp->len; + ip += tpl->len; - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, @@ -1414,12 +2810,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->disp); + int3_emulate_call(regs, (long)ip + tpl->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->disp); + int3_emulate_jmp(regs, (long)ip + tpl->disp); + break; + + case 0x70 ... 0x7f: /* Jcc */ + int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); break; default: @@ -1429,80 +2829,101 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(desc); + put_text_poke_array(); return ret; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - /** - * text_poke_bp_batch() -- update instructions on live kernel on SMP - * @tp: vector of instructions to patch - * @nr_entries: number of entries in the vector + * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP + * + * Input state: + * text_poke_array.vec: vector of instructions to patch + * text_poke_array.nr_entries: number of entries in the vector * - * Modify multi-byte instruction by using int3 breakpoint on SMP. - * We completely avoid stop_machine() here, and achieve the - * synchronization using int3 breakpoint. + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. * * The way it is done: * - For each entry in the vector: - * - add a int3 trap to the address that will be patched - * - sync cores + * - add an INT3 trap to the address that will be patched + * - SMP sync all CPUs * - For each entry in the vector: * - update all but the first byte of the patched range - * - sync cores + * - SMP sync all CPUs * - For each entry in the vector: - * - replace the first byte (int3) by the first byte of + * - replace the first byte (INT3) by the first byte of the * replacing opcode - * - sync cores + * - SMP sync all CPUs */ -static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +void smp_text_poke_batch_finish(void) { - struct bp_patching_desc desc = { - .vec = tp, - .nr_entries = nr_entries, - .refs = ATOMIC_INIT(1), - }; unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; + if (!text_poke_array.nr_entries) + return; + lockdep_assert_held(&text_mutex); - smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ + /* + * Corresponds to the implicit memory barrier in try_get_text_poke_array() to + * ensure reading a non-zero refcount provides up to date text_poke_array data. + */ + for_each_possible_cpu(i) + atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); /* - * Corresponding read barrier in int3 notifier for making sure the - * nr_entries and handler are correctly ordered wrt. patching. + * Function tracing can enable thousands of places that need to be + * updated. This can take quite some time, and with full kernel debugging + * enabled, this could cause the softlockup watchdog to trigger. + * This function gets called every 256 entries added to be patched. + * Call cond_resched() here to make sure that other tasks can get scheduled + * while processing all the functions being patched. + */ + cond_resched(); + + /* + * Corresponding read barrier in INT3 notifier for making sure the + * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* - * First step: add a int3 trap to the address that will be patched. + * First step: add a INT3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) { - tp[i].old = *(u8 *)text_poke_addr(&tp[i]); - text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + for (i = 0; i < text_poke_array.nr_entries; i++) { + text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Second step: update all but the first byte of the patched range. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, }; - int len = tp[i].len; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; + u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; + const u8 *new = text_poke_array.vec[i].text; + int len = text_poke_array.vec[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, - text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); - text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, - (const char *)tp[i].text + INT3_INSN_SIZE, + + if (len == 6) { + _new[0] = 0x0f; + memcpy(_new + 1, new, 5); + new = _new; + } + + text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, + new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); + do_sync++; } @@ -1530,8 +2951,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * The old instruction is recorded so that the event can be * processed forwards or backwards. */ - perf_event_text_poke(text_poke_addr(&tp[i]), old, len, - tp[i].text, len); + perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); } if (do_sync) { @@ -1540,51 +2960,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } /* - * Third step: replace the first byte (int3) by the first byte of + * Third step: replace the first byte (INT3) by the first byte of the * replacing opcode. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - if (tp[i].text[0] == INT3_INSN_OPCODE) + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 byte = text_poke_array.vec[i].text[0]; + + if (text_poke_array.vec[i].len == 6) + byte = 0x0f; + + if (byte == INT3_INSN_OPCODE) continue; - text_poke(text_poke_addr(&tp[i]), tp[i].text, INT3_INSN_SIZE); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* - * Remove and synchronize_rcu(), except we have a very primitive - * refcount based completion. + * Remove and wait for refs to be zero. + * + * Notably, if after step-3 above the INT3 got removed, then the + * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 + * handlers and the below spin-wait will not happen. + * + * IOW. unless the replacement instruction is INT3, this case goes + * unused. */ - WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ - if (!atomic_dec_and_test(&desc.refs)) - atomic_cond_read_acquire(&desc.refs, !VAL); + for_each_possible_cpu(i) { + atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); + + if (unlikely(!atomic_dec_and_test(refs))) + atomic_cond_read_acquire(refs, !VAL); + } + + /* They are all completed: */ + text_poke_array.nr_entries = 0; } -static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { + struct smp_text_poke_loc *tpl; struct insn insn; - int ret, i; + int ret, i = 0; + + tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; - memcpy((void *)tp->text, opcode, len); + if (len == 6) + i = 1; + memcpy((void *)tpl->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); - tp->rel_addr = addr - (void *)_stext; - tp->len = len; - tp->opcode = insn.opcode.bytes[0]; + tpl->rel_addr = addr - (void *)_stext; + tpl->len = len; + tpl->opcode = insn.opcode.bytes[0]; - switch (tp->opcode) { + if (is_jcc32(&insn)) { + /* + * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. + */ + tpl->opcode = insn.opcode.bytes[1] - 0x10; + } + + switch (tpl->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: @@ -1593,15 +3041,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) - BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); - }; - + } - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -1609,21 +3056,22 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case CALL_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - tp->disp = insn.immediate.value; + case 0x70 ... 0x7f: /* Jcc */ + tpl->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP8_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP8_INSN_OPCODE; + tpl->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP32_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP32_INSN_OPCODE; + tpl->disp = 0; break; default: /* unknown instruction */ @@ -1634,56 +3082,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } /* - * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing * early if needed. */ -static bool tp_order_fail(void *addr) +static bool text_poke_addr_ordered(void *addr) { - struct text_poke_loc *tp; - - if (!tp_vec_nr) - return false; - - if (!addr) /* force */ - return true; + WARN_ON_ONCE(!addr); - tp = &tp_vec[tp_vec_nr - 1]; - if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) + if (!text_poke_array.nr_entries) return true; - return false; -} - -static void text_poke_flush(void *addr) -{ - if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { - text_poke_bp_batch(tp_vec, tp_vec_nr); - tp_vec_nr = 0; - } -} + /* + * If the last current entry's address is higher than the + * new entry's address we'd like to add, then ordering + * is violated and we must first flush all pending patching + * requests: + */ + if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) + return false; -void text_poke_finish(void) -{ - text_poke_flush(NULL); + return true; } -void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +/** + * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @emulate: instruction to be emulated + * + * Add a new instruction to the current queue of to-be-patched instructions + * the kernel maintains. The patching request will not be executed immediately, + * but becomes part of an array of patching requests, optimized for batched + * execution. All pending patching requests will be executed on the next + * smp_text_poke_batch_finish() call. + */ +void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc *tp; - - if (unlikely(system_state == SYSTEM_BOOTING)) { - text_poke_early(addr, opcode, len); - return; - } - - text_poke_flush(addr); - - tp = &tp_vec[tp_vec_nr++]; - text_poke_loc_init(tp, addr, opcode, len, emulate); + if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) + smp_text_poke_batch_finish(); + __smp_text_poke_batch_add(addr, opcode, len, emulate); } /** - * text_poke_bp() -- update instructions on live kernel on SMP + * smp_text_poke_single() -- update instruction on live kernel on SMP immediately * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy @@ -1691,17 +3133,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is - * not possible to allocate memory. + * not possible to allocate memory for a vector. The single instruction + * is patched in immediately. */ -void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp; - - if (unlikely(system_state == SYSTEM_BOOTING)) { - text_poke_early(addr, opcode, len); - return; - } - - text_poke_loc_init(&tp, addr, opcode, len, emulate); - text_poke_bp_batch(&tp, 1); + smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_finish(); } |