From d071ae09a4a1414c1433d5ae9908959a7325b0ad Mon Sep 17 00:00:00 2001 From: Rafael Ávila de Espíndola Date: Wed, 19 Dec 2018 11:01:43 -0800 Subject: x86/build: Mark per-CPU symbols as absolute explicitly for LLD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Accessing per-CPU variables is done by finding the offset of the variable in the per-CPU block and adding it to the address of the respective CPU's block. Section 3.10.8 of ld.bfd's documentation states: For expressions involving numbers, relative addresses and absolute addresses, ld follows these rules to evaluate terms: Other binary operations, that is, between two relative addresses not in the same section, or between a relative address and an absolute address, first convert any non-absolute term to an absolute address before applying the operator." Note that LLVM's linker does not adhere to the GNU ld's implementation and as such requires implicitly-absolute terms to be explicitly marked as absolute in the linker script. If not, it fails currently with: ld.lld: error: ./arch/x86/kernel/vmlinux.lds:153: at least one side of the expression must be absolute ld.lld: error: ./arch/x86/kernel/vmlinux.lds:154: at least one side of the expression must be absolute Makefile:1040: recipe for target 'vmlinux' failed This is not a functional change for ld.bfd which converts the term to an absolute symbol anyways as specified above. Based on a previous submission by Tri Vo . Reported-by: Dmitry Golovin Signed-off-by: Rafael Ávila de Espíndola [ Update commit message per Boris' and Michael's suggestions. ] Signed-off-by: Nick Desaulniers [ Massage commit message more, fix typos. ] Signed-off-by: Borislav Petkov Tested-by: Dmitry Golovin Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: Brijesh Singh Cc: Cao Jin Cc: Ingo Molnar Cc: Joerg Roedel Cc: Masahiro Yamada Cc: Masami Hiramatsu Cc: Thomas Gleixner Cc: Tri Vo Cc: dima@golovin.in Cc: morbo@google.com Cc: x86-ml Link: https://lkml.kernel.org/r/20181219190145.252035-1-ndesaulniers@google.com --- arch/x86/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0d618ee634ac..ee3b5c7d662e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -401,7 +401,7 @@ SECTIONS * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); -- cgit v1.2.3-59-g8ed1b From 927185c124d62a9a4d35878d7f6d432a166b74e3 Mon Sep 17 00:00:00 2001 From: George Rimar Date: Fri, 11 Jan 2019 12:10:12 -0800 Subject: x86/build: Specify elf_i386 linker emulation explicitly for i386 objects The kernel uses the OUTPUT_FORMAT linker script command in it's linker scripts. Most of the time, the -m option is passed to the linker with correct architecture, but sometimes (at least for x86_64) the -m option contradicts the OUTPUT_FORMAT directive. Specifically, arch/x86/boot and arch/x86/realmode/rm produce i386 object files, but are linked with the -m elf_x86_64 linker flag when building for x86_64. The GNU linker manpage doesn't explicitly state any tie-breakers between -m and OUTPUT_FORMAT. But with BFD and Gold linkers, OUTPUT_FORMAT overrides the emulation value specified with the -m option. LLVM lld has a different behavior, however. When supplied with contradicting -m and OUTPUT_FORMAT values it fails with the following error message: ld.lld: error: arch/x86/realmode/rm/header.o is incompatible with elf_x86_64 Therefore, just add the correct -m after the incorrect one (it overrides it), so the linker invocation looks like this: ld -m elf_x86_64 -z max-page-size=0x200000 -m elf_i386 --emit-relocs -T \ realmode.lds header.o trampoline_64.o stack.o reboot.o -o realmode.elf This is not a functional change for GNU ld, because (although not explicitly documented) OUTPUT_FORMAT overrides -m EMULATION. Tested by building x86_64 kernel with GNU gcc/ld toolchain and booting it in QEMU. [ bp: massage and clarify text. ] Suggested-by: Dmitry Golovin Signed-off-by: George Rimar Signed-off-by: Tri Vo Signed-off-by: Borislav Petkov Tested-by: Tri Vo Tested-by: Nick Desaulniers Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Michael Matz Cc: Thomas Gleixner Cc: morbo@google.com Cc: ndesaulniers@google.com Cc: ruiu@google.com Cc: x86-ml Link: https://lkml.kernel.org/r/20190111201012.71210-1-trong@android.com --- arch/x86/boot/Makefile | 2 +- arch/x86/realmode/rm/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 9b5adae9cc40..e2839b5c246c 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -100,7 +100,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE AFLAGS_header.o += -I$(objtree)/$(obj) $(obj)/header.o: $(obj)/zoffset.h -LDFLAGS_setup.elf := -T +LDFLAGS_setup.elf := -m elf_i386 -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE $(call if_changed,ld) diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 4463fa72db94..96cb20de08af 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -47,7 +47,7 @@ $(obj)/pasyms.h: $(REALMODE_OBJS) FORCE targets += realmode.lds $(obj)/realmode.lds: $(obj)/pasyms.h -LDFLAGS_realmode.elf := --emit-relocs -T +LDFLAGS_realmode.elf := -m elf_i386 --emit-relocs -T CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj) targets += realmode.elf -- cgit v1.2.3-59-g8ed1b From e6d7bc0bdf4155e896c48cf6f8fb7ef76b504058 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 9 Jan 2019 17:32:10 +0100 Subject: x86/build: Use the single-argument OUTPUT_FORMAT() linker script command The various x86 linker scripts use the three-argument linker script command variant OUTPUT_FORMAT(DEFAULT, BIG, LITTLE) which specifies three object file formats when the -EL and -EB linker command line options are used. When -EB is specified, OUTPUT_FORMAT issues the BIG object file format, when -EL, LITTLE, respectively, and when neither is specified, DEFAULT. However, those -E[LB] options are not used by arch/x86/ so switch to the simple OUTPUT_FORMAT(BFDNAME) macro variant. No functional changes. Signed-off-by: Borislav Petkov Cc: "H. Peter Anvin" Cc: x86@kernel.org Link: https://lkml.kernel.org/r/20190109181531.27513-1-bp@alien8.de --- arch/x86/boot/compressed/vmlinux.lds.S | 2 +- arch/x86/boot/setup.ld | 2 +- arch/x86/kernel/vmlinux.lds.S | 2 +- arch/x86/realmode/rm/realmode.lds.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index f491bbde8493..508cfa6828c5 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include -OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) #undef i386 diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 96a6c7563538..0149e41d42c2 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -3,7 +3,7 @@ * * Linker script for the i386 setup code */ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_FORMAT("elf32-i386") OUTPUT_ARCH(i386) ENTRY(_start) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ee3b5c7d662e..bad8c51fee6e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -31,7 +31,7 @@ #undef i386 /* in case the preprocessor is a 32bit one */ -OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT) #ifdef CONFIG_X86_32 OUTPUT_ARCH(i386) diff --git a/arch/x86/realmode/rm/realmode.lds.S b/arch/x86/realmode/rm/realmode.lds.S index df8e11e26bc3..3bb980800c58 100644 --- a/arch/x86/realmode/rm/realmode.lds.S +++ b/arch/x86/realmode/rm/realmode.lds.S @@ -9,7 +9,7 @@ #undef i386 -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") +OUTPUT_FORMAT("elf32-i386") OUTPUT_ARCH(i386) SECTIONS -- cgit v1.2.3-59-g8ed1b From ce02ef06fcf7a399a6276adb83f37373d10cbbe1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 21 Feb 2019 23:19:41 +0100 Subject: x86, retpolines: Raise limit for generating indirect calls from switch-case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From networking side, there are numerous attempts to get rid of indirect calls in fast-path wherever feasible in order to avoid the cost of retpolines, for example, just to name a few: * 283c16a2dfd3 ("indirect call wrappers: helpers to speed-up indirect calls of builtin") * aaa5d90b395a ("net: use indirect call wrappers at GRO network layer") * 028e0a476684 ("net: use indirect call wrappers at GRO transport layer") * 356da6d0cde3 ("dma-mapping: bypass indirect calls for dma-direct") * 09772d92cd5a ("bpf: avoid retpoline for lookup/update/delete calls on maps") * 10870dd89e95 ("netfilter: nf_tables: add direct calls for all builtin expressions") [...] Recent work on XDP from Björn and Magnus additionally found that manually transforming the XDP return code switch statement with more than 5 cases into if-else combination would result in a considerable speedup in XDP layer due to avoidance of indirect calls in CONFIG_RETPOLINE enabled builds. On i40e driver with XDP prog attached, a 20-26% speedup has been observed [0]. Aside from XDP, there are many other places later in the networking stack's critical path with similar switch-case processing. Rather than fixing every XDP-enabled driver and locations in stack by hand, it would be good to instead raise the limit where gcc would emit expensive indirect calls from the switch under retpolines and stick with the default as-is in case of !retpoline configured kernels. This would also have the advantage that for archs where this is not necessary, we let compiler select the underlying target optimization for these constructs and avoid potential slow-downs by if-else hand-rewrite. In case of gcc, this setting is controlled by case-values-threshold which has an architecture global default that selects 4 or 5 (latter if target does not have a case insn that compares the bounds) where some arch back ends like arm64 or s390 override it with their own target hooks, for example, in gcc commit db7a90aa0de5 ("S/390: Disable prediction of indirect branches") the threshold pretty much disables jump tables by limit of 20 under retpoline builds. Comparing gcc's and clang's default code generation on x86-64 under O2 level with retpoline build results in the following outcome for 5 switch cases: * gcc with -mindirect-branch=thunk-inline -mindirect-branch-register: # gdb -batch -ex 'disassemble dispatch' ./c-switch Dump of assembler code for function dispatch: 0x0000000000400be0 <+0>: cmp $0x4,%edi 0x0000000000400be3 <+3>: ja 0x400c35 0x0000000000400be5 <+5>: lea 0x915f8(%rip),%rdx # 0x4921e4 0x0000000000400bec <+12>: mov %edi,%edi 0x0000000000400bee <+14>: movslq (%rdx,%rdi,4),%rax 0x0000000000400bf2 <+18>: add %rdx,%rax 0x0000000000400bf5 <+21>: callq 0x400c01 0x0000000000400bfa <+26>: pause 0x0000000000400bfc <+28>: lfence 0x0000000000400bff <+31>: jmp 0x400bfa 0x0000000000400c01 <+33>: mov %rax,(%rsp) 0x0000000000400c05 <+37>: retq 0x0000000000400c06 <+38>: nopw %cs:0x0(%rax,%rax,1) 0x0000000000400c10 <+48>: jmpq 0x400c90 0x0000000000400c15 <+53>: nopl (%rax) 0x0000000000400c18 <+56>: jmpq 0x400c70 0x0000000000400c1d <+61>: nopl (%rax) 0x0000000000400c20 <+64>: jmpq 0x400c50 0x0000000000400c25 <+69>: nopl (%rax) 0x0000000000400c28 <+72>: jmpq 0x400c40 0x0000000000400c2d <+77>: nopl (%rax) 0x0000000000400c30 <+80>: jmpq 0x400cb0 0x0000000000400c35 <+85>: push %rax 0x0000000000400c36 <+86>: callq 0x40dd80 End of assembler dump. * clang with -mretpoline emitting search tree: # gdb -batch -ex 'disassemble dispatch' ./c-switch Dump of assembler code for function dispatch: 0x0000000000400b30 <+0>: cmp $0x1,%edi 0x0000000000400b33 <+3>: jle 0x400b44 0x0000000000400b35 <+5>: cmp $0x2,%edi 0x0000000000400b38 <+8>: je 0x400b4d 0x0000000000400b3a <+10>: cmp $0x3,%edi 0x0000000000400b3d <+13>: jne 0x400b52 0x0000000000400b3f <+15>: jmpq 0x400c50 0x0000000000400b44 <+20>: test %edi,%edi 0x0000000000400b46 <+22>: jne 0x400b5c 0x0000000000400b48 <+24>: jmpq 0x400c20 0x0000000000400b4d <+29>: jmpq 0x400c40 0x0000000000400b52 <+34>: cmp $0x4,%edi 0x0000000000400b55 <+37>: jne 0x400b66 0x0000000000400b57 <+39>: jmpq 0x400c60 0x0000000000400b5c <+44>: cmp $0x1,%edi 0x0000000000400b5f <+47>: jne 0x400b66 0x0000000000400b61 <+49>: jmpq 0x400c30 0x0000000000400b66 <+54>: push %rax 0x0000000000400b67 <+55>: callq 0x40dd20 End of assembler dump. For sake of comparison, clang without -mretpoline: # gdb -batch -ex 'disassemble dispatch' ./c-switch Dump of assembler code for function dispatch: 0x0000000000400b30 <+0>: cmp $0x4,%edi 0x0000000000400b33 <+3>: ja 0x400b57 0x0000000000400b35 <+5>: mov %edi,%eax 0x0000000000400b37 <+7>: jmpq *0x492148(,%rax,8) 0x0000000000400b3e <+14>: jmpq 0x400bf0 0x0000000000400b43 <+19>: jmpq 0x400c30 0x0000000000400b48 <+24>: jmpq 0x400c10 0x0000000000400b4d <+29>: jmpq 0x400c20 0x0000000000400b52 <+34>: jmpq 0x400c00 0x0000000000400b57 <+39>: push %rax 0x0000000000400b58 <+40>: callq 0x40dcf0 End of assembler dump. Raising the cases to a high number (e.g. 100) will still result in similar code generation pattern with clang and gcc as above, in other words clang generally turns off jump table emission by having an extra expansion pass under retpoline build to turn indirectbr instructions from their IR into switch instructions as a built-in -mno-jump-table lowering of a switch (in this case, even if IR input already contained an indirect branch). For gcc, adding --param=case-values-threshold=20 as in similar fashion as s390 in order to raise the limit for x86 retpoline enabled builds results in a small vmlinux size increase of only 0.13% (before=18,027,528 after=18,051,192). For clang this option is ignored due to i) not being needed as mentioned and ii) not having above cmdline parameter. Non-retpoline-enabled builds with gcc continue to use the default case-values-threshold setting, so nothing changes here. [0] https://lore.kernel.org/netdev/20190129095754.9390-1-bjorn.topel@gmail.com/ and "The Path to DPDK Speeds for AF_XDP", LPC 2018, networking track: - http://vger.kernel.org/lpc_net2018_talks/lpc18_pres_af_xdp_perf-v3.pdf - http://vger.kernel.org/lpc_net2018_talks/lpc18_paper_af_xdp_perf-v2.pdf Signed-off-by: Daniel Borkmann Signed-off-by: Thomas Gleixner Acked-by: Jesper Dangaard Brouer Acked-by: Björn Töpel Acked-by: Linus Torvalds Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Magnus Karlsson Cc: Alexei Starovoitov Cc: Peter Zijlstra Cc: David Woodhouse Cc: Andy Lutomirski Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20190221221941.29358-1-daniel@iogearbox.net --- arch/x86/Makefile | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 9c5a67d1b9c1..f55420a67164 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -217,6 +217,11 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables # Avoid indirect branches in kernel to deal with Spectre ifdef CONFIG_RETPOLINE KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) + # Additionally, avoid generating expensive indirect jumps which + # are subject to retpolines for small number of switch cases. + # clang turns off jump table generation by default when under + # retpoline builds, however, gcc does not for x86. + KBUILD_CFLAGS += $(call cc-option,--param=case-values-threshold=20) endif archscripts: scripts_basic -- cgit v1.2.3-59-g8ed1b