aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S48
-rw-r--r--arch/x86/crypto/camellia-x86_64-asm_64.S26
-rw-r--r--arch/x86/crypto/cast5-avx-x86_64-asm_64.S47
-rw-r--r--arch/x86/crypto/cast6-avx-x86_64-asm_64.S50
-rw-r--r--arch/x86/crypto/des3_ede-asm_64.S15
-rw-r--r--arch/x86/crypto/sha1_avx2_x86_64_asm.S4
-rw-r--r--arch/x86/crypto/sha1_ssse3_asm.S11
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S15
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S22
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S15
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S75
-rw-r--r--arch/x86/crypto/twofish-avx-x86_64-asm_64.S12
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/include/asm/alternative.h3
-rw-r--r--arch/x86/include/asm/asm.h11
-rw-r--r--arch/x86/include/asm/fpu/internal.h90
-rw-r--r--arch/x86/include/asm/fpu/types.h32
-rw-r--r--arch/x86/include/asm/fpu/xstate.h12
-rw-r--r--arch/x86/include/asm/mmu_context.h32
-rw-r--r--arch/x86/include/asm/mshyperv.h10
-rw-r--r--arch/x86/include/asm/paravirt_types.h14
-rw-r--r--arch/x86/include/asm/preempt.h15
-rw-r--r--arch/x86/include/asm/processor.h6
-rw-r--r--arch/x86/include/asm/rwsem.h4
-rw-r--r--arch/x86/include/asm/trace/fpu.h11
-rw-r--r--arch/x86/include/asm/uaccess.h4
-rw-r--r--arch/x86/include/asm/xen/hypercall.h5
-rw-r--r--arch/x86/kernel/cpu/amd.c11
-rw-r--r--arch/x86/kernel/cpu/bugs.c8
-rw-r--r--arch/x86/kernel/cpu/common.c8
-rw-r--r--arch/x86/kernel/fpu/core.c155
-rw-r--r--arch/x86/kernel/fpu/init.c2
-rw-r--r--arch/x86/kernel/fpu/regset.c48
-rw-r--r--arch/x86/kernel/fpu/signal.c37
-rw-r--r--arch/x86/kernel/fpu/xstate.c264
-rw-r--r--arch/x86/kernel/signal.c6
-rw-r--r--arch/x86/kernel/smpboot.c13
-rw-r--r--arch/x86/kvm/emulate.c3
-rw-r--r--arch/x86/kvm/vmx.c3
-rw-r--r--arch/x86/kvm/x86.c2
-rw-r--r--arch/x86/math-emu/fpu_entry.c2
-rw-r--r--arch/x86/mm/extable.c24
-rw-r--r--arch/x86/mm/fault.c3
-rw-r--r--arch/x86/mm/pkeys.c3
-rw-r--r--arch/x86/mm/tlb.c11
-rw-r--r--arch/x86/xen/mmu_pv.c2
46 files changed, 658 insertions, 538 deletions
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 246c67006ed0..8c1fcb6bad21 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -33,7 +33,7 @@
#define s3 ((16 + 2 + (3 * 256)) * 4)
/* register macros */
-#define CTX %rdi
+#define CTX %r12
#define RIO %rsi
#define RX0 %rax
@@ -56,12 +56,12 @@
#define RX2bh %ch
#define RX3bh %dh
-#define RT0 %rbp
+#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9
-#define RT0d %ebp
+#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d
@@ -120,13 +120,14 @@
ENTRY(__blowfish_enc_blk)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
*/
- movq %rbp, %r11;
+ movq %r12, %r11;
+ movq %rdi, CTX;
movq %rsi, %r10;
movq %rdx, RIO;
@@ -142,7 +143,7 @@ ENTRY(__blowfish_enc_blk)
round_enc(14);
add_roundkey_enc(16);
- movq %r11, %rbp;
+ movq %r11, %r12;
movq %r10, RIO;
test %cl, %cl;
@@ -157,12 +158,13 @@ ENDPROC(__blowfish_enc_blk)
ENTRY(blowfish_dec_blk)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
- movq %rbp, %r11;
+ movq %r12, %r11;
+ movq %rdi, CTX;
movq %rsi, %r10;
movq %rdx, RIO;
@@ -181,7 +183,7 @@ ENTRY(blowfish_dec_blk)
movq %r10, RIO;
write_block();
- movq %r11, %rbp;
+ movq %r11, %r12;
ret;
ENDPROC(blowfish_dec_blk)
@@ -298,20 +300,21 @@ ENDPROC(blowfish_dec_blk)
ENTRY(__blowfish_enc_blk_4way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: bool, if true: xor output
*/
- pushq %rbp;
+ pushq %r12;
pushq %rbx;
pushq %rcx;
- preload_roundkey_enc(0);
-
+ movq %rdi, CTX
movq %rsi, %r11;
movq %rdx, RIO;
+ preload_roundkey_enc(0);
+
read_block4();
round_enc4(0);
@@ -324,39 +327,40 @@ ENTRY(__blowfish_enc_blk_4way)
round_enc4(14);
add_preloaded_roundkey4();
- popq %rbp;
+ popq %r12;
movq %r11, RIO;
- test %bpl, %bpl;
+ test %r12b, %r12b;
jnz .L__enc_xor4;
write_block4();
popq %rbx;
- popq %rbp;
+ popq %r12;
ret;
.L__enc_xor4:
xor_block4();
popq %rbx;
- popq %rbp;
+ popq %r12;
ret;
ENDPROC(__blowfish_enc_blk_4way)
ENTRY(blowfish_dec_blk_4way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
- pushq %rbp;
+ pushq %r12;
pushq %rbx;
- preload_roundkey_dec(17);
- movq %rsi, %r11;
+ movq %rdi, CTX;
+ movq %rsi, %r11
movq %rdx, RIO;
+ preload_roundkey_dec(17);
read_block4();
round_dec4(17);
@@ -373,7 +377,7 @@ ENTRY(blowfish_dec_blk_4way)
write_block4();
popq %rbx;
- popq %rbp;
+ popq %r12;
ret;
ENDPROC(blowfish_dec_blk_4way)
diff --git a/arch/x86/crypto/camellia-x86_64-asm_64.S b/arch/x86/crypto/camellia-x86_64-asm_64.S
index 310319c601ed..95ba6956a7f6 100644
--- a/arch/x86/crypto/camellia-x86_64-asm_64.S
+++ b/arch/x86/crypto/camellia-x86_64-asm_64.S
@@ -75,17 +75,17 @@
#define RCD1bh %dh
#define RT0 %rsi
-#define RT1 %rbp
+#define RT1 %r12
#define RT2 %r8
#define RT0d %esi
-#define RT1d %ebp
+#define RT1d %r12d
#define RT2d %r8d
#define RT2bl %r8b
#define RXOR %r9
-#define RRBP %r10
+#define RR12 %r10
#define RDST %r11
#define RXORd %r9d
@@ -197,7 +197,7 @@ ENTRY(__camellia_enc_blk)
* %rdx: src
* %rcx: bool xor
*/
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rcx, RXOR;
movq %rsi, RDST;
@@ -227,13 +227,13 @@ ENTRY(__camellia_enc_blk)
enc_outunpack(mov, RT1);
- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;
.L__enc_xor:
enc_outunpack(xor, RT1);
- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;
ENDPROC(__camellia_enc_blk)
@@ -248,7 +248,7 @@ ENTRY(camellia_dec_blk)
movl $24, RXORd;
cmovel RXORd, RT2d; /* max */
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rsi, RDST;
movq %rdx, RIO;
@@ -271,7 +271,7 @@ ENTRY(camellia_dec_blk)
dec_outunpack();
- movq RRBP, %rbp;
+ movq RR12, %r12;
ret;
ENDPROC(camellia_dec_blk)
@@ -433,7 +433,7 @@ ENTRY(__camellia_enc_blk_2way)
*/
pushq %rbx;
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rcx, RXOR;
movq %rsi, RDST;
movq %rdx, RIO;
@@ -461,14 +461,14 @@ ENTRY(__camellia_enc_blk_2way)
enc_outunpack2(mov, RT2);
- movq RRBP, %rbp;
+ movq RR12, %r12;
popq %rbx;
ret;
.L__enc2_xor:
enc_outunpack2(xor, RT2);
- movq RRBP, %rbp;
+ movq RR12, %r12;
popq %rbx;
ret;
ENDPROC(__camellia_enc_blk_2way)
@@ -485,7 +485,7 @@ ENTRY(camellia_dec_blk_2way)
cmovel RXORd, RT2d; /* max */
movq %rbx, RXOR;
- movq %rbp, RRBP;
+ movq %r12, RR12;
movq %rsi, RDST;
movq %rdx, RIO;
@@ -508,7 +508,7 @@ ENTRY(camellia_dec_blk_2way)
dec_outunpack2();
- movq RRBP, %rbp;
+ movq RR12, %r12;
movq RXOR, %rbx;
ret;
ENDPROC(camellia_dec_blk_2way)
diff --git a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
index b4a8806234ea..86107c961bb4 100644
--- a/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast5-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
/**********************************************************************
16-way AVX cast5
**********************************************************************/
-#define CTX %rdi
+#define CTX %r15
#define RL1 %xmm0
#define RR1 %xmm1
@@ -70,8 +70,8 @@
#define RTMP %xmm15
-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %rdi
+#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi
@@ -226,7 +226,7 @@
.align 16
__cast5_enc_blk16:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RL1: blocks 1 and 2
* RR1: blocks 3 and 4
* RL2: blocks 5 and 6
@@ -246,9 +246,11 @@ __cast5_enc_blk16:
* RR4: encrypted blocks 15 and 16
*/
- pushq %rbp;
+ pushq %r15;
pushq %rbx;
+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -283,7 +285,7 @@ __cast5_enc_blk16:
.L__skip_enc:
popq %rbx;
- popq %rbp;
+ popq %r15;
vmovdqa .Lbswap_mask, RKM;
@@ -298,7 +300,7 @@ ENDPROC(__cast5_enc_blk16)
.align 16
__cast5_dec_blk16:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RL1: encrypted blocks 1 and 2
* RR1: encrypted blocks 3 and 4
* RL2: encrypted blocks 5 and 6
@@ -318,9 +320,11 @@ __cast5_dec_blk16:
* RR4: decrypted blocks 15 and 16
*/
- pushq %rbp;
+ pushq %r15;
pushq %rbx;
+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -356,7 +360,7 @@ __cast5_dec_blk16:
vmovdqa .Lbswap_mask, RKM;
popq %rbx;
- popq %rbp;
+ popq %r15;
outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
@@ -372,12 +376,14 @@ ENDPROC(__cast5_dec_blk16)
ENTRY(cast5_ecb_enc_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
vmovdqu (0*4*4)(%rdx), RL1;
@@ -400,18 +406,22 @@ ENTRY(cast5_ecb_enc_16way)
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast5_ecb_enc_16way)
ENTRY(cast5_ecb_dec_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;
+
+ movq %rdi, CTX;
movq %rsi, %r11;
vmovdqu (0*4*4)(%rdx), RL1;
@@ -434,20 +444,22 @@ ENTRY(cast5_ecb_dec_16way)
vmovdqu RR4, (6*4*4)(%r11);
vmovdqu RL4, (7*4*4)(%r11);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast5_ecb_dec_16way)
ENTRY(cast5_cbc_dec_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
@@ -483,23 +495,24 @@ ENTRY(cast5_cbc_dec_16way)
vmovdqu RR4, (6*16)(%r11);
vmovdqu RL4, (7*16)(%r11);
+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast5_cbc_dec_16way)
ENTRY(cast5_ctr_16way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
* %rcx: iv (big endian, 64bit)
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
@@ -558,8 +571,8 @@ ENTRY(cast5_ctr_16way)
vmovdqu RR4, (6*16)(%r11);
vmovdqu RL4, (7*16)(%r11);
+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast5_ctr_16way)
diff --git a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
index 952d3156a933..7f30b6f0d72c 100644
--- a/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/cast6-avx-x86_64-asm_64.S
@@ -47,7 +47,7 @@
/**********************************************************************
8-way AVX cast6
**********************************************************************/
-#define CTX %rdi
+#define CTX %r15
#define RA1 %xmm0
#define RB1 %xmm1
@@ -70,8 +70,8 @@
#define RTMP %xmm15
-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %rdi
+#define RID1d %edi
#define RID2 %rsi
#define RID2d %esi
@@ -264,15 +264,17 @@
.align 8
__cast6_enc_blk8:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
*/
- pushq %rbp;
+ pushq %r15;
pushq %rbx;
+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -297,7 +299,7 @@ __cast6_enc_blk8:
QBAR(11);
popq %rbx;
- popq %rbp;
+ popq %r15;
vmovdqa .Lbswap_mask, RKM;
@@ -310,15 +312,17 @@ ENDPROC(__cast6_enc_blk8)
.align 8
__cast6_dec_blk8:
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
* output:
* RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
*/
- pushq %rbp;
+ pushq %r15;
pushq %rbx;
+ movq %rdi, CTX;
+
vmovdqa .Lbswap_mask, RKM;
vmovd .Lfirst_mask, R1ST;
vmovd .L32_mask, R32;
@@ -343,7 +347,7 @@ __cast6_dec_blk8:
QBAR(0);
popq %rbx;
- popq %rbp;
+ popq %r15;
vmovdqa .Lbswap_mask, RKM;
outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
@@ -354,12 +358,14 @@ ENDPROC(__cast6_dec_blk8)
ENTRY(cast6_ecb_enc_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -368,18 +374,21 @@ ENTRY(cast6_ecb_enc_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_ecb_enc_8way)
ENTRY(cast6_ecb_dec_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
@@ -388,20 +397,22 @@ ENTRY(cast6_ecb_dec_8way)
store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_ecb_dec_8way)
ENTRY(cast6_cbc_dec_8way)
/* input:
- * %rdi: ctx, CTX
+ * %rdi: ctx
* %rsi: dst
* %rdx: src
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15;
+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
@@ -411,8 +422,8 @@ ENTRY(cast6_cbc_dec_8way)
store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast6_cbc_dec_8way)
@@ -425,9 +436,10 @@ ENTRY(cast6_ctr_8way)
* %rcx: iv (little endian, 128bit)
*/
FRAME_BEGIN
-
pushq %r12;
+ pushq %r15
+ movq %rdi, CTX;
movq %rsi, %r11;
movq %rdx, %r12;
@@ -438,8 +450,8 @@ ENTRY(cast6_ctr_8way)
store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
popq %r12;
-
FRAME_END
ret;
ENDPROC(cast6_ctr_8way)
@@ -452,7 +464,9 @@ ENTRY(cast6_xts_enc_8way)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX
movq %rsi, %r11;
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -464,6 +478,7 @@ ENTRY(cast6_xts_enc_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_xts_enc_8way)
@@ -476,7 +491,9 @@ ENTRY(cast6_xts_dec_8way)
* %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
*/
FRAME_BEGIN
+ pushq %r15;
+ movq %rdi, CTX
movq %rsi, %r11;
/* regs <= src, dst <= IVs, regs <= regs xor IVs */
@@ -488,6 +505,7 @@ ENTRY(cast6_xts_dec_8way)
/* dst <= regs xor IVs(in dst) */
store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
+ popq %r15;
FRAME_END
ret;
ENDPROC(cast6_xts_dec_8way)
diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index f3e91647ca27..8e49ce117494 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -64,12 +64,12 @@
#define RW2bh %ch
#define RT0 %r15
-#define RT1 %rbp
+#define RT1 %rsi
#define RT2 %r14
#define RT3 %rdx
#define RT0d %r15d
-#define RT1d %ebp
+#define RT1d %esi
#define RT2d %r14d
#define RT3d %edx
@@ -177,13 +177,14 @@ ENTRY(des3_ede_x86_64_crypt_blk)
* %rsi: dst
* %rdx: src
*/
- pushq %rbp;
pushq %rbx;
pushq %r12;
pushq %r13;
pushq %r14;
pushq %r15;
+ pushq %rsi; /* dst */
+
read_block(%rdx, RL0, RR0);
initial_permutation(RL0, RR0);
@@ -241,6 +242,8 @@ ENTRY(des3_ede_x86_64_crypt_blk)
round1(32+15, RL0, RR0, dummy2);
final_permutation(RR0, RL0);
+
+ popq %rsi /* dst */
write_block(%rsi, RR0, RL0);
popq %r15;
@@ -248,7 +251,6 @@ ENTRY(des3_ede_x86_64_crypt_blk)
popq %r13;
popq %r12;
popq %rbx;
- popq %rbp;
ret;
ENDPROC(des3_ede_x86_64_crypt_blk)
@@ -432,13 +434,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
* %rdx: src (3 blocks)
*/
- pushq %rbp;
pushq %rbx;
pushq %r12;
pushq %r13;
pushq %r14;
pushq %r15;
+ pushq %rsi /* dst */
+
/* load input */
movl 0 * 4(%rdx), RL0d;
movl 1 * 4(%rdx), RR0d;
@@ -520,6 +523,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
bswapl RR2d;
bswapl RL2d;
+ popq %rsi /* dst */
movl RR0d, 0 * 4(%rsi);
movl RL0d, 1 * 4(%rsi);
movl RR1d, 2 * 4(%rsi);
@@ -532,7 +536,6 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
popq %r13;
popq %r12;
popq %rbx;
- popq %rbp;
ret;
ENDPROC(des3_ede_x86_64_crypt_blk_3way)
diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
index 1eab79c9ac48..9f712a7dfd79 100644
--- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S
+++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S
@@ -89,7 +89,7 @@
#define REG_RE %rdx
#define REG_RTA %r12
#define REG_RTB %rbx
-#define REG_T1 %ebp
+#define REG_T1 %r11d
#define xmm_mov vmovups
#define avx2_zeroupper vzeroupper
#define RND_F1 1
@@ -637,7 +637,6 @@ _loop3:
ENTRY(\name)
push %rbx
- push %rbp
push %r12
push %r13
push %r14
@@ -673,7 +672,6 @@ _loop3:
pop %r14
pop %r13
pop %r12
- pop %rbp
pop %rbx
ret
diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S
index a4109506a5e8..6204bd53528c 100644
--- a/arch/x86/crypto/sha1_ssse3_asm.S
+++ b/arch/x86/crypto/sha1_ssse3_asm.S
@@ -37,7 +37,7 @@
#define REG_A %ecx
#define REG_B %esi
#define REG_C %edi
-#define REG_D %ebp
+#define REG_D %r12d
#define REG_E %edx
#define REG_T1 %eax
@@ -74,10 +74,10 @@
ENTRY(\name)
push %rbx
- push %rbp
push %r12
+ push %rbp
+ mov %rsp, %rbp
- mov %rsp, %r12
sub $64, %rsp # allocate workspace
and $~15, %rsp # align stack
@@ -99,10 +99,9 @@
xor %rax, %rax
rep stosq
- mov %r12, %rsp # deallocate workspace
-
- pop %r12
+ mov %rbp, %rsp # deallocate workspace
pop %rbp
+ pop %r12
pop %rbx
ret
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index e08888a1a5f2..001bbcf93c79 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -103,7 +103,7 @@ SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
-TBL = %rbp
+TBL = %r12
a = %eax
b = %ebx
@@ -350,13 +350,13 @@ a = TMP_
ENTRY(sha256_transform_avx)
.align 32
pushq %rbx
- pushq %rbp
+ pushq %r12
pushq %r13
pushq %r14
pushq %r15
- pushq %r12
+ pushq %rbp
+ movq %rsp, %rbp
- mov %rsp, %r12
subq $STACK_SIZE, %rsp # allocate stack space
and $~15, %rsp # align stack pointer
@@ -452,13 +452,12 @@ loop2:
done_hash:
- mov %r12, %rsp
-
- popq %r12
+ mov %rbp, %rsp
+ popq %rbp
popq %r15
popq %r14
popq %r13
- popq %rbp
+ popq %r12
popq %rbx
ret
ENDPROC(sha256_transform_avx)
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 89c8f09787d2..1420db15dcdd 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -98,8 +98,6 @@ d = %r8d
e = %edx # clobbers NUM_BLKS
y3 = %esi # clobbers INP
-
-TBL = %rbp
SRND = CTX # SRND is same register as CTX
a = %eax
@@ -531,7 +529,6 @@ STACK_SIZE = _RSP + _RSP_SIZE
ENTRY(sha256_transform_rorx)
.align 32
pushq %rbx
- pushq %rbp
pushq %r12
pushq %r13
pushq %r14
@@ -568,8 +565,6 @@ ENTRY(sha256_transform_rorx)
mov CTX, _CTX(%rsp)
loop0:
- lea K256(%rip), TBL
-
## Load first 16 dwords from two blocks
VMOVDQ 0*32(INP),XTMP0
VMOVDQ 1*32(INP),XTMP1
@@ -597,19 +592,19 @@ last_block_enter:
.align 16
loop1:
- vpaddd 0*32(TBL, SRND), X0, XFER
+ vpaddd K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 0*32
- vpaddd 1*32(TBL, SRND), X0, XFER
+ vpaddd K256+1*32(SRND), X0, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 1*32
- vpaddd 2*32(TBL, SRND), X0, XFER
+ vpaddd K256+2*32(SRND), X0, XFER
vmovdqa XFER, 2*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 2*32
- vpaddd 3*32(TBL, SRND), X0, XFER
+ vpaddd K256+3*32(SRND), X0, XFER
vmovdqa XFER, 3*32+_XFER(%rsp, SRND)
FOUR_ROUNDS_AND_SCHED _XFER + 3*32
@@ -619,10 +614,11 @@ loop1:
loop2:
## Do last 16 rounds with no scheduling
- vpaddd 0*32(TBL, SRND), X0, XFER
+ vpaddd K256+0*32(SRND), X0, XFER
vmovdqa XFER, 0*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 0*32
- vpaddd 1*32(TBL, SRND), X1, XFER
+
+ vpaddd K256+1*32(SRND), X1, XFER
vmovdqa XFER, 1*32+_XFER(%rsp, SRND)
DO_4ROUNDS _XFER + 1*32
add $2*32, SRND
@@ -676,9 +672,6 @@ loop3:
ja done_hash
do_last_block:
- #### do last block
- lea K256(%rip), TBL
-
VMOVDQ 0*16(INP),XWORD0
VMOVDQ 1*16(INP),XWORD1
VMOVDQ 2*16(INP),XWORD2
@@ -718,7 +711,6 @@ done_hash:
popq %r14
popq %r13
popq %r12
- popq %rbp
popq %rbx
ret
ENDPROC(sha256_transform_rorx)
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index 39b83c93e7fd..c6c05ed2c16a 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -95,7 +95,7 @@ SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
-TBL = %rbp
+TBL = %r12
a = %eax
b = %ebx
@@ -356,13 +356,13 @@ a = TMP_
ENTRY(sha256_transform_ssse3)
.align 32
pushq %rbx
- pushq %rbp
+ pushq %r12
pushq %r13
pushq %r14
pushq %r15
- pushq %r12
+ pushq %rbp
+ mov %rsp, %rbp
- mov %rsp, %r12
subq $STACK_SIZE, %rsp
and $~15, %rsp
@@ -462,13 +462,12 @@ loop2:
done_hash:
- mov %r12, %rsp
-
- popq %r12
+ mov %rbp, %rsp
+ popq %rbp
popq %r15
popq %r14
popq %r13
- popq %rbp
+ popq %r12
popq %rbx
ret
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 7f5f6c6ec72e..b16d56005162 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -69,8 +69,9 @@ XFER = YTMP0
BYTE_FLIP_MASK = %ymm9
-# 1st arg
-CTX = %rdi
+# 1st arg is %rdi, which is saved to the stack and accessed later via %r12
+CTX1 = %rdi
+CTX2 = %r12
# 2nd arg
INP = %rsi
# 3rd arg
@@ -81,7 +82,7 @@ d = %r8
e = %rdx
y3 = %rsi
-TBL = %rbp
+TBL = %rdi # clobbers CTX1
a = %rax
b = %rbx
@@ -91,26 +92,26 @@ g = %r10
h = %r11
old_h = %r11
-T1 = %r12
+T1 = %r12 # clobbers CTX2
y0 = %r13
y1 = %r14
y2 = %r15
-y4 = %r12
-
# Local variables (stack frame)
XFER_SIZE = 4*8
SRND_SIZE = 1*8
INP_SIZE = 1*8
INPEND_SIZE = 1*8
+CTX_SIZE = 1*8
RSPSAVE_SIZE = 1*8
-GPRSAVE_SIZE = 6*8
+GPRSAVE_SIZE = 5*8
frame_XFER = 0
frame_SRND = frame_XFER + XFER_SIZE
frame_INP = frame_SRND + SRND_SIZE
frame_INPEND = frame_INP + INP_SIZE
-frame_RSPSAVE = frame_INPEND + INPEND_SIZE
+frame_CTX = frame_INPEND + INPEND_SIZE
+frame_RSPSAVE = frame_CTX + CTX_SIZE
frame_GPRSAVE = frame_RSPSAVE + RSPSAVE_SIZE
frame_size = frame_GPRSAVE + GPRSAVE_SIZE
@@ -576,12 +577,11 @@ ENTRY(sha512_transform_rorx)
mov %rax, frame_RSPSAVE(%rsp)
# Save GPRs
- mov %rbp, frame_GPRSAVE(%rsp)
- mov %rbx, 8*1+frame_GPRSAVE(%rsp)
- mov %r12, 8*2+frame_GPRSAVE(%rsp)
- mov %r13, 8*3+frame_GPRSAVE(%rsp)
- mov %r14, 8*4+frame_GPRSAVE(%rsp)
- mov %r15, 8*5+frame_GPRSAVE(%rsp)
+ mov %rbx, 8*0+frame_GPRSAVE(%rsp)
+ mov %r12, 8*1+frame_GPRSAVE(%rsp)
+ mov %r13, 8*2+frame_GPRSAVE(%rsp)
+ mov %r14, 8*3+frame_GPRSAVE(%rsp)
+ mov %r15, 8*4+frame_GPRSAVE(%rsp)
shl $7, NUM_BLKS # convert to bytes
jz done_hash
@@ -589,14 +589,17 @@ ENTRY(sha512_transform_rorx)
mov NUM_BLKS, frame_INPEND(%rsp)
## load initial digest
- mov 8*0(CTX),a
- mov 8*1(CTX),b
- mov 8*2(CTX),c
- mov 8*3(CTX),d
- mov 8*4(CTX),e
- mov 8*5(CTX),f
- mov 8*6(CTX),g
- mov 8*7(CTX),h
+ mov 8*0(CTX1), a
+ mov 8*1(CTX1), b
+ mov 8*2(CTX1), c
+ mov 8*3(CTX1), d
+ mov 8*4(CTX1), e
+ mov 8*5(CTX1), f
+ mov 8*6(CTX1), g
+ mov 8*7(CTX1), h
+
+ # save %rdi (CTX) before it gets clobbered
+ mov %rdi, frame_CTX(%rsp)
vmovdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
@@ -652,14 +655,15 @@ loop2:
subq $1, frame_SRND(%rsp)
jne loop2
- addm 8*0(CTX),a
- addm 8*1(CTX),b
- addm 8*2(CTX),c
- addm 8*3(CTX),d
- addm 8*4(CTX),e
- addm 8*5(CTX),f
- addm 8*6(CTX),g
- addm 8*7(CTX),h
+ mov frame_CTX(%rsp), CTX2
+ addm 8*0(CTX2), a
+ addm 8*1(CTX2), b
+ addm 8*2(CTX2), c
+ addm 8*3(CTX2), d
+ addm 8*4(CTX2), e
+ addm 8*5(CTX2), f
+ addm 8*6(CTX2), g
+ addm 8*7(CTX2), h
mov frame_INP(%rsp), INP
add $128, INP
@@ -669,12 +673,11 @@ loop2:
done_hash:
# Restore GPRs
- mov frame_GPRSAVE(%rsp) ,%rbp
- mov 8*1+frame_GPRSAVE(%rsp) ,%rbx
- mov 8*2+frame_GPRSAVE(%rsp) ,%r12
- mov 8*3+frame_GPRSAVE(%rsp) ,%r13
- mov 8*4+frame_GPRSAVE(%rsp) ,%r14
- mov 8*5+frame_GPRSAVE(%rsp) ,%r15
+ mov 8*0+frame_GPRSAVE(%rsp), %rbx
+ mov 8*1+frame_GPRSAVE(%rsp), %r12
+ mov 8*2+frame_GPRSAVE(%rsp), %r13
+ mov 8*3+frame_GPRSAVE(%rsp), %r14
+ mov 8*4+frame_GPRSAVE(%rsp), %r15
# Restore Stack Pointer
mov frame_RSPSAVE(%rsp), %rsp
diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index b3f49d286348..73b471da3622 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -76,8 +76,8 @@
#define RT %xmm14
#define RR %xmm15
-#define RID1 %rbp
-#define RID1d %ebp
+#define RID1 %r13
+#define RID1d %r13d
#define RID2 %rsi
#define RID2d %esi
@@ -259,7 +259,7 @@ __twofish_enc_blk8:
vmovdqu w(CTX), RK1;
- pushq %rbp;
+ pushq %r13;
pushq %rbx;
pushq %rcx;
@@ -282,7 +282,7 @@ __twofish_enc_blk8:
popq %rcx;
popq %rbx;
- popq %rbp;
+ popq %r13;
outunpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
outunpack_blocks(RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
@@ -301,7 +301,7 @@ __twofish_dec_blk8:
vmovdqu (w+4*4)(CTX), RK1;
- pushq %rbp;
+ pushq %r13;
pushq %rbx;
inpack_blocks(RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
@@ -322,7 +322,7 @@ __twofish_dec_blk8:
vmovdqu (w)(CTX), RK1;
popq %rbx;
- popq %rbp;
+ popq %r13;
outunpack_blocks(RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
outunpack_blocks(RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index e0bb46c02857..0e2a5edbce00 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -231,7 +231,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
ksig->ka.sa.sa_restorer)
sp = (unsigned long) ksig->ka.sa.sa_restorer;
- if (fpu->fpstate_active) {
+ if (fpu->initialized) {
unsigned long fx_aligned, math_size;
sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1b020381ab38..c096624137ae 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -218,10 +218,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \
output, input...) \
{ \
- register void *__sp asm(_ASM_SP); \
asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\
"call %P[new2]", feature2) \
- : output, "+r" (__sp) \
+ : output, ASM_CALL_CONSTRAINT \
: [old] "i" (oldfunc), [new1] "i" (newfunc1), \
[new2] "i" (newfunc2), ## input); \
}
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index 676ee5807d86..c1eadbaf1115 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -132,4 +132,15 @@
/* For C file, we already have NOKPROBE_SYMBOL macro */
#endif
+#ifndef __ASSEMBLY__
+/*
+ * This output constraint should be used for any inline asm which has a "call"
+ * instruction. Otherwise the asm may be inserted before the frame pointer
+ * gets set up by the containing function. If you forget to do this, objtool
+ * may print a "call without frame pointer save/setup" warning.
+ */
+register unsigned int __asm_call_sp asm("esp");
+#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp)
+#endif
+
#endif /* _ASM_X86_ASM_H */
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 554cdb205d17..e3221ffa304e 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -23,11 +23,9 @@
/*
* High level FPU state handling functions:
*/
-extern void fpu__activate_curr(struct fpu *fpu);
-extern void fpu__activate_fpstate_read(struct fpu *fpu);
-extern void fpu__activate_fpstate_write(struct fpu *fpu);
-extern void fpu__current_fpstate_write_begin(void);
-extern void fpu__current_fpstate_write_end(void);
+extern void fpu__initialize(struct fpu *fpu);
+extern void fpu__prepare_read(struct fpu *fpu);
+extern void fpu__prepare_write(struct fpu *fpu);
extern void fpu__save(struct fpu *fpu);
extern void fpu__restore(struct fpu *fpu);
extern int fpu__restore_sig(void __user *buf, int ia32_frame);
@@ -120,20 +118,11 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
err; \
})
-#define check_insn(insn, output, input...) \
-({ \
- int err; \
+#define kernel_insn(insn, output, input...) \
asm volatile("1:" #insn "\n\t" \
"2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: movl $-1,%[err]\n" \
- " jmp 2b\n" \
- ".previous\n" \
- _ASM_EXTABLE(1b, 3b) \
- : [err] "=r" (err), output \
- : "0"(0), input); \
- err; \
-})
+ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \
+ : output : input)
static inline int copy_fregs_to_user(struct fregs_state __user *fx)
{
@@ -153,20 +142,16 @@ static inline int copy_fxregs_to_user(struct fxregs_state __user *fx)
static inline void copy_kernel_to_fxregs(struct fxregs_state *fx)
{
- int err;
-
if (IS_ENABLED(CONFIG_X86_32)) {
- err = check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
+ kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
} else {
if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) {
- err = check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
+ kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
} else {
/* See comment in copy_fxregs_to_kernel() below. */
- err = check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
+ kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx));
}
}
- /* Copying from a kernel buffer to FPU registers should never fail: */
- WARN_ON_FPU(err);
}
static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
@@ -183,9 +168,7 @@ static inline int copy_user_to_fxregs(struct fxregs_state __user *fx)
static inline void copy_kernel_to_fregs(struct fregs_state *fx)
{
- int err = check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
-
- WARN_ON_FPU(err);
+ kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
static inline int copy_user_to_fregs(struct fregs_state __user *fx)
@@ -281,18 +264,13 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
* Use XRSTORS to restore context if it is enabled. XRSTORS supports compact
* XSAVE area format.
*/
-#define XSTATE_XRESTORE(st, lmask, hmask, err) \
+#define XSTATE_XRESTORE(st, lmask, hmask) \
asm volatile(ALTERNATIVE(XRSTOR, \
XRSTORS, X86_FEATURE_XSAVES) \
"\n" \
- "xor %[err], %[err]\n" \
"3:\n" \
- ".pushsection .fixup,\"ax\"\n" \
- "4: movl $-2, %[err]\n" \
- "jmp 3b\n" \
- ".popsection\n" \
- _ASM_EXTABLE(661b, 4b) \
- : [err] "=r" (err) \
+ _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\
+ : \
: "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \
: "memory")
@@ -336,7 +314,10 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
else
XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);
- /* We should never fault when copying from a kernel buffer: */
+ /*
+ * We should never fault when copying from a kernel buffer, and the FPU
+ * state we set at boot time should be valid.
+ */
WARN_ON_FPU(err);
}
@@ -350,7 +331,7 @@ static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
u32 hmask = mask >> 32;
int err;
- WARN_ON(!alternatives_patched);
+ WARN_ON_FPU(!alternatives_patched);
XSTATE_XSAVE(xstate, lmask, hmask, err);
@@ -365,12 +346,8 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
- int err;
-
- XSTATE_XRESTORE(xstate, lmask, hmask, err);
- /* We should never fault when copying from a kernel buffer: */
- WARN_ON_FPU(err);
+ XSTATE_XRESTORE(xstate, lmask, hmask);
}
/*
@@ -526,38 +503,17 @@ static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
*/
static inline void fpregs_deactivate(struct fpu *fpu)
{
- WARN_ON_FPU(!fpu->fpregs_active);
-
- fpu->fpregs_active = 0;
this_cpu_write(fpu_fpregs_owner_ctx, NULL);
trace_x86_fpu_regs_deactivated(fpu);
}
static inline void fpregs_activate(struct fpu *fpu)
{
- WARN_ON_FPU(fpu->fpregs_active);
-
- fpu->fpregs_active = 1;
this_cpu_write(fpu_fpregs_owner_ctx, fpu);
trace_x86_fpu_regs_activated(fpu);
}
/*
- * The question "does this thread have fpu access?"
- * is slightly racy, since preemption could come in
- * and revoke it immediately after the test.
- *
- * However, even in that very unlikely scenario,
- * we can just assume we have FPU access - typically
- * to save the FP state - we'll just take a #NM
- * fault and get the FPU access back.
- */
-static inline int fpregs_active(void)
-{
- return current->thread.fpu.fpregs_active;
-}
-
-/*
* FPU state switching for scheduling.
*
* This is a two-stage process:
@@ -571,14 +527,13 @@ static inline int fpregs_active(void)
static inline void
switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
- if (old_fpu->fpregs_active) {
+ if (old_fpu->initialized) {
if (!copy_fpregs_to_fpstate(old_fpu))
old_fpu->last_cpu = -1;
else
old_fpu->last_cpu = cpu;
/* But leave fpu_fpregs_owner_ctx! */
- old_fpu->fpregs_active = 0;
trace_x86_fpu_regs_deactivated(old_fpu);
} else
old_fpu->last_cpu = -1;
@@ -595,7 +550,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
{
bool preload = static_cpu_has(X86_FEATURE_FPU) &&
- new_fpu->fpstate_active;
+ new_fpu->initialized;
if (preload) {
if (!fpregs_state_valid(new_fpu, cpu))
@@ -617,8 +572,7 @@ static inline void user_fpu_begin(void)
struct fpu *fpu = &current->thread.fpu;
preempt_disable();
- if (!fpregs_active())
- fpregs_activate(fpu);
+ fpregs_activate(fpu);
preempt_enable();
}
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3c80f5b9c09d..a1520575d86b 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -68,6 +68,9 @@ struct fxregs_state {
/* Default value for fxregs_state.mxcsr: */
#define MXCSR_DEFAULT 0x1f80
+/* Copy both mxcsr & mxcsr_flags with a single u64 memcpy: */
+#define MXCSR_AND_FLAGS_SIZE sizeof(u64)
+
/*
* Software based FPU emulation state. This is arbitrary really,
* it matches the x87 format to make it easier to understand:
@@ -290,36 +293,13 @@ struct fpu {
unsigned int last_cpu;
/*
- * @fpstate_active:
+ * @initialized:
*
- * This flag indicates whether this context is active: if the task
+ * This flag indicates whether this context is initialized: if the task
* is not running then we can restore from this context, if the task
* is running then we should save into this context.
*/
- unsigned char fpstate_active;
-
- /*
- * @fpregs_active:
- *
- * This flag determines whether a given context is actively
- * loaded into the FPU's registers and that those registers
- * represent the task's current FPU state.
- *
- * Note the interaction with fpstate_active:
- *
- * # task does not use the FPU:
- * fpstate_active == 0
- *
- * # task uses the FPU and regs are active:
- * fpstate_active == 1 && fpregs_active == 1
- *
- * # the regs are inactive but still match fpstate:
- * fpstate_active == 1 && fpregs_active == 0 && fpregs_owner == fpu
- *
- * The third state is what we use for the lazy restore optimization
- * on lazy-switching CPUs.
- */
- unsigned char fpregs_active;
+ unsigned char initialized;
/*
* @state:
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 1b2799e0699a..83fee2469eb7 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -48,8 +48,12 @@ void fpu__xstate_clear_all_cpu_caps(void);
void *get_xsave_addr(struct xregs_state *xsave, int xstate);
const void *get_xsave_field_ptr(int xstate_field);
int using_compacted_format(void);
-int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
- void __user *ubuf, struct xregs_state *xsave);
-int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
- struct xregs_state *xsave);
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
+int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
+int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
+
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+extern int validate_xstate_header(const struct xstate_header *hdr);
+
#endif
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 7ae318c340d9..c120b5db178a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -286,6 +286,32 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
+/*
+ * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
+ * bits. This serves two purposes. It prevents a nasty situation in
+ * which PCID-unaware code saves CR3, loads some other value (with PCID
+ * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
+ * the saved ASID was nonzero. It also means that any bugs involving
+ * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
+ * deterministically.
+ */
+
+static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
+{
+ if (static_cpu_has(X86_FEATURE_PCID)) {
+ VM_WARN_ON_ONCE(asid > 4094);
+ return __sme_pa(mm->pgd) | (asid + 1);
+ } else {
+ VM_WARN_ON_ONCE(asid != 0);
+ return __sme_pa(mm->pgd);
+ }
+}
+
+static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
+{
+ VM_WARN_ON_ONCE(asid > 4094);
+ return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
+}
/*
* This can be used from process context to figure out what the value of
@@ -296,10 +322,8 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
*/
static inline unsigned long __get_current_cr3_fast(void)
{
- unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
-
- if (static_cpu_has(X86_FEATURE_PCID))
- cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+ unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+ this_cpu_read(cpu_tlbstate.loaded_mm_asid));
/* For now, be very restrictive about when this can be called. */
VM_WARN_ON(in_nmi() || preemptible());
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 63cc96f064dc..738503e1f80c 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -179,7 +179,6 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
u64 input_address = input ? virt_to_phys(input) : 0;
u64 output_address = output ? virt_to_phys(output) : 0;
u64 hv_status;
- register void *__sp asm(_ASM_SP);
#ifdef CONFIG_X86_64
if (!hv_hypercall_pg)
@@ -187,7 +186,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
__asm__ __volatile__("mov %4, %%r8\n"
"call *%5"
- : "=a" (hv_status), "+r" (__sp),
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input_address)
: "r" (output_address), "m" (hv_hypercall_pg)
: "cc", "memory", "r8", "r9", "r10", "r11");
@@ -202,7 +201,7 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
__asm__ __volatile__("call *%7"
: "=A" (hv_status),
- "+c" (input_address_lo), "+r" (__sp)
+ "+c" (input_address_lo), ASM_CALL_CONSTRAINT
: "A" (control),
"b" (input_address_hi),
"D"(output_address_hi), "S"(output_address_lo),
@@ -224,12 +223,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
{
u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
- register void *__sp asm(_ASM_SP);
#ifdef CONFIG_X86_64
{
__asm__ __volatile__("call *%4"
- : "=a" (hv_status), "+r" (__sp),
+ : "=a" (hv_status), ASM_CALL_CONSTRAINT,
"+c" (control), "+d" (input1)
: "m" (hv_hypercall_pg)
: "cc", "r8", "r9", "r10", "r11");
@@ -242,7 +240,7 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
__asm__ __volatile__ ("call *%5"
: "=A"(hv_status),
"+c"(input1_lo),
- "+r"(__sp)
+ ASM_CALL_CONSTRAINT
: "A" (control),
"b" (input1_hi),
"m" (hv_hypercall_pg)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 42873edd9f9d..280d94c36dad 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -459,8 +459,8 @@ int paravirt_disable_iospace(void);
*/
#ifdef CONFIG_X86_32
#define PVOP_VCALL_ARGS \
- unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \
- register void *__sp asm("esp")
+ unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx;
+
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x))
@@ -480,8 +480,8 @@ int paravirt_disable_iospace(void);
/* [re]ax isn't an arg, but the return val */
#define PVOP_VCALL_ARGS \
unsigned long __edi = __edi, __esi = __esi, \
- __edx = __edx, __ecx = __ecx, __eax = __eax; \
- register void *__sp asm("rsp")
+ __edx = __edx, __ecx = __ecx, __eax = __eax;
+
#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
#define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x))
@@ -532,7 +532,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
@@ -542,7 +542,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
@@ -569,7 +569,7 @@ int paravirt_disable_iospace(void);
asm volatile(pre \
paravirt_alt(PARAVIRT_CALL) \
post \
- : call_clbr, "+r" (__sp) \
+ : call_clbr, ASM_CALL_CONSTRAINT \
: paravirt_type(op), \
paravirt_clobber(clbr), \
##__VA_ARGS__ \
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index ec1f3c651150..4f44505dbf87 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -100,19 +100,14 @@ static __always_inline bool should_resched(int preempt_offset)
#ifdef CONFIG_PREEMPT
extern asmlinkage void ___preempt_schedule(void);
-# define __preempt_schedule() \
-({ \
- register void *__sp asm(_ASM_SP); \
- asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \
-})
+# define __preempt_schedule() \
+ asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
extern asmlinkage void preempt_schedule(void);
extern asmlinkage void ___preempt_schedule_notrace(void);
-# define __preempt_schedule_notrace() \
-({ \
- register void *__sp asm(_ASM_SP); \
- asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \
-})
+# define __preempt_schedule_notrace() \
+ asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT)
+
extern asmlinkage void preempt_schedule_notrace(void);
#endif
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3fa26a61eabc..b390ff76e58f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -677,8 +677,6 @@ static inline void sync_core(void)
* Like all of Linux's memory ordering operations, this is a
* compiler barrier as well.
*/
- register void *__sp asm(_ASM_SP);
-
#ifdef CONFIG_X86_32
asm volatile (
"pushfl\n\t"
@@ -686,7 +684,7 @@ static inline void sync_core(void)
"pushl $1f\n\t"
"iret\n\t"
"1:"
- : "+r" (__sp) : : "memory");
+ : ASM_CALL_CONSTRAINT : : "memory");
#else
unsigned int tmp;
@@ -703,7 +701,7 @@ static inline void sync_core(void)
"iretq\n\t"
UNWIND_HINT_RESTORE
"1:"
- : "=&r" (tmp), "+r" (__sp) : : "cc", "memory");
+ : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory");
#endif
}
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index a34e0d4b957d..7116b7931c7b 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -103,7 +103,6 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
({ \
long tmp; \
struct rw_semaphore* ret; \
- register void *__sp asm(_ASM_SP); \
\
asm volatile("# beginning down_write\n\t" \
LOCK_PREFIX " xadd %1,(%4)\n\t" \
@@ -114,7 +113,8 @@ static inline bool __down_read_trylock(struct rw_semaphore *sem)
" call " slow_path "\n" \
"1:\n" \
"# ending down_write" \
- : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \
+ : "+m" (sem->count), "=d" (tmp), \
+ "=a" (ret), ASM_CALL_CONSTRAINT \
: "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \
: "memory", "cc"); \
ret; \
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 342e59789fcd..39f7a27bef13 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -12,25 +12,22 @@ DECLARE_EVENT_CLASS(x86_fpu,
TP_STRUCT__entry(
__field(struct fpu *, fpu)
- __field(bool, fpregs_active)
- __field(bool, fpstate_active)
+ __field(bool, initialized)
__field(u64, xfeatures)
__field(u64, xcomp_bv)
),
TP_fast_assign(
__entry->fpu = fpu;
- __entry->fpregs_active = fpu->fpregs_active;
- __entry->fpstate_active = fpu->fpstate_active;
+ __entry->initialized = fpu->initialized;
if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
__entry->xfeatures = fpu->state.xsave.header.xfeatures;
__entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv;
}
),
- TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx",
+ TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx",
__entry->fpu,
- __entry->fpregs_active,
- __entry->fpstate_active,
+ __entry->initialized,
__entry->xfeatures,
__entry->xcomp_bv
)
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 184eb9894dae..78e8fcc87d4c 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -166,11 +166,11 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
({ \
int __ret_gu; \
register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \
- register void *__sp asm(_ASM_SP); \
__chk_user_ptr(ptr); \
might_fault(); \
asm volatile("call __get_user_%P4" \
- : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \
+ : "=a" (__ret_gu), "=r" (__val_gu), \
+ ASM_CALL_CONSTRAINT \
: "0" (ptr), "i" (sizeof(*(ptr)))); \
(x) = (__force __typeof__(*(ptr))) __val_gu; \
__builtin_expect(__ret_gu, 0); \
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 9606688caa4b..128a1a0b1450 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -113,10 +113,9 @@ extern struct { char _entry[32]; } hypercall_page[];
register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
- register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \
- register void *__sp asm(_ASM_SP);
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
-#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp)
+#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT
#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9862e2cd6d93..d58184b7cd44 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -763,6 +763,16 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
}
}
+static void init_amd_zn(struct cpuinfo_x86 *c)
+{
+ /*
+ * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects
+ * all up to and including B1.
+ */
+ if (c->x86_model <= 1 && c->x86_mask <= 1)
+ set_cpu_cap(c, X86_FEATURE_CPB);
+}
+
static void init_amd(struct cpuinfo_x86 *c)
{
early_init_amd(c);
@@ -791,6 +801,7 @@ static void init_amd(struct cpuinfo_x86 *c)
case 0x10: init_amd_gh(c); break;
case 0x12: init_amd_ln(c); break;
case 0x15: init_amd_bd(c); break;
+ case 0x17: init_amd_zn(c); break;
}
/* Enable workaround for FXSAVE leak */
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index db684880d74a..0af86d9242da 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -21,14 +21,6 @@
void __init check_bugs(void)
{
-#ifdef CONFIG_X86_32
- /*
- * Regardless of whether PCID is enumerated, the SDM says
- * that it can't be enabled in 32-bit mode.
- */
- setup_clear_cpu_cap(X86_FEATURE_PCID);
-#endif
-
identify_boot_cpu();
if (!IS_ENABLED(CONFIG_SMP)) {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 775f10100d7f..c9176bae7fd8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -904,6 +904,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
setup_force_cpu_cap(X86_FEATURE_ALWAYS);
fpu__init_system(c);
+
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
}
void __init early_cpu_init(void)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index e1114f070c2d..f92a6593de1e 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -100,7 +100,7 @@ void __kernel_fpu_begin(void)
kernel_fpu_disable();
- if (fpu->fpregs_active) {
+ if (fpu->initialized) {
/*
* Ignore return value -- we don't care if reg state
* is clobbered.
@@ -116,7 +116,7 @@ void __kernel_fpu_end(void)
{
struct fpu *fpu = &current->thread.fpu;
- if (fpu->fpregs_active)
+ if (fpu->initialized)
copy_kernel_to_fpregs(&fpu->state);
kernel_fpu_enable();
@@ -148,7 +148,7 @@ void fpu__save(struct fpu *fpu)
preempt_disable();
trace_x86_fpu_before_save(fpu);
- if (fpu->fpregs_active) {
+ if (fpu->initialized) {
if (!copy_fpregs_to_fpstate(fpu)) {
copy_kernel_to_fpregs(&fpu->state);
}
@@ -189,10 +189,9 @@ EXPORT_SYMBOL_GPL(fpstate_init);
int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
{
- dst_fpu->fpregs_active = 0;
dst_fpu->last_cpu = -1;
- if (!src_fpu->fpstate_active || !static_cpu_has(X86_FEATURE_FPU))
+ if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU))
return 0;
WARN_ON_FPU(src_fpu != &current->thread.fpu);
@@ -206,26 +205,14 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
/*
* Save current FPU registers directly into the child
* FPU context, without any memory-to-memory copying.
- * In lazy mode, if the FPU context isn't loaded into
- * fpregs, CR0.TS will be set and do_device_not_available
- * will load the FPU context.
*
- * We have to do all this with preemption disabled,
- * mostly because of the FNSAVE case, because in that
- * case we must not allow preemption in the window
- * between the FNSAVE and us marking the context lazy.
- *
- * It shouldn't be an issue as even FNSAVE is plenty
- * fast in terms of critical section length.
+ * ( The function 'fails' in the FNSAVE case, which destroys
+ * register contents so we have to copy them back. )
*/
- preempt_disable();
if (!copy_fpregs_to_fpstate(dst_fpu)) {
- memcpy(&src_fpu->state, &dst_fpu->state,
- fpu_kernel_xstate_size);
-
+ memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size);
copy_kernel_to_fpregs(&src_fpu->state);
}
- preempt_enable();
trace_x86_fpu_copy_src(src_fpu);
trace_x86_fpu_copy_dst(dst_fpu);
@@ -237,45 +224,48 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
* Activate the current task's in-memory FPU context,
* if it has not been used before:
*/
-void fpu__activate_curr(struct fpu *fpu)
+void fpu__initialize(struct fpu *fpu)
{
WARN_ON_FPU(fpu != &current->thread.fpu);
- if (!fpu->fpstate_active) {
+ if (!fpu->initialized) {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
trace_x86_fpu_activate_state(fpu);
/* Safe to do for the current task: */
- fpu->fpstate_active = 1;
+ fpu->initialized = 1;
}
}
-EXPORT_SYMBOL_GPL(fpu__activate_curr);
+EXPORT_SYMBOL_GPL(fpu__initialize);
/*
* This function must be called before we read a task's fpstate.
*
- * If the task has not used the FPU before then initialize its
- * fpstate.
+ * There's two cases where this gets called:
+ *
+ * - for the current task (when coredumping), in which case we have
+ * to save the latest FPU registers into the fpstate,
+ *
+ * - or it's called for stopped tasks (ptrace), in which case the
+ * registers were already saved by the context-switch code when
+ * the task scheduled out - we only have to initialize the registers
+ * if they've never been initialized.
*
* If the task has used the FPU before then save it.
*/
-void fpu__activate_fpstate_read(struct fpu *fpu)
+void fpu__prepare_read(struct fpu *fpu)
{
- /*
- * If fpregs are active (in the current CPU), then
- * copy them to the fpstate:
- */
- if (fpu->fpregs_active) {
+ if (fpu == &current->thread.fpu) {
fpu__save(fpu);
} else {
- if (!fpu->fpstate_active) {
+ if (!fpu->initialized) {
fpstate_init(&fpu->state);
trace_x86_fpu_init_state(fpu);
trace_x86_fpu_activate_state(fpu);
/* Safe to do for current and for stopped child tasks: */
- fpu->fpstate_active = 1;
+ fpu->initialized = 1;
}
}
}
@@ -283,17 +273,17 @@ void fpu__activate_fpstate_read(struct fpu *fpu)
/*
* This function must be called before we write a task's fpstate.
*
- * If the task has used the FPU before then unlazy it.
+ * If the task has used the FPU before then invalidate any cached FPU registers.
* If the task has not used the FPU before then initialize its fpstate.
*
* After this function call, after registers in the fpstate are
* modified and the child task has woken up, the child task will
* restore the modified FPU state from the modified context. If we
- * didn't clear its lazy status here then the lazy in-registers
+ * didn't clear its cached status here then the cached in-registers
* state pending on its former CPU could be restored, corrupting
* the modifications.
*/
-void fpu__activate_fpstate_write(struct fpu *fpu)
+void fpu__prepare_write(struct fpu *fpu)
{
/*
* Only stopped child tasks can be used to modify the FPU
@@ -301,8 +291,8 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
*/
WARN_ON_FPU(fpu == &current->thread.fpu);
- if (fpu->fpstate_active) {
- /* Invalidate any lazy state: */
+ if (fpu->initialized) {
+ /* Invalidate any cached state: */
__fpu_invalidate_fpregs_state(fpu);
} else {
fpstate_init(&fpu->state);
@@ -310,74 +300,11 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
trace_x86_fpu_activate_state(fpu);
/* Safe to do for stopped child tasks: */
- fpu->fpstate_active = 1;
+ fpu->initialized = 1;
}
}
/*
- * This function must be called before we write the current
- * task's fpstate.
- *
- * This call gets the current FPU register state and moves
- * it in to the 'fpstate'. Preemption is disabled so that
- * no writes to the 'fpstate' can occur from context
- * swiches.
- *
- * Must be followed by a fpu__current_fpstate_write_end().
- */
-void fpu__current_fpstate_write_begin(void)
-{
- struct fpu *fpu = &current->thread.fpu;
-
- /*
- * Ensure that the context-switching code does not write
- * over the fpstate while we are doing our update.
- */
- preempt_disable();
-
- /*
- * Move the fpregs in to the fpu's 'fpstate'.
- */
- fpu__activate_fpstate_read(fpu);
-
- /*
- * The caller is about to write to 'fpu'. Ensure that no
- * CPU thinks that its fpregs match the fpstate. This
- * ensures we will not be lazy and skip a XRSTOR in the
- * future.
- */
- __fpu_invalidate_fpregs_state(fpu);
-}
-
-/*
- * This function must be paired with fpu__current_fpstate_write_begin()
- *
- * This will ensure that the modified fpstate gets placed back in
- * the fpregs if necessary.
- *
- * Note: This function may be called whether or not an _actual_
- * write to the fpstate occurred.
- */
-void fpu__current_fpstate_write_end(void)
-{
- struct fpu *fpu = &current->thread.fpu;
-
- /*
- * 'fpu' now has an updated copy of the state, but the
- * registers may still be out of date. Update them with
- * an XRSTOR if they are active.
- */
- if (fpregs_active())
- copy_kernel_to_fpregs(&fpu->state);
-
- /*
- * Our update is done and the fpregs/fpstate are in sync
- * if necessary. Context switches can happen again.
- */
- preempt_enable();
-}
-
-/*
* 'fpu__restore()' is called to copy FPU registers from
* the FPU fpstate to the live hw registers and to activate
* access to the hardware registers, so that FPU instructions
@@ -389,7 +316,7 @@ void fpu__current_fpstate_write_end(void)
*/
void fpu__restore(struct fpu *fpu)
{
- fpu__activate_curr(fpu);
+ fpu__initialize(fpu);
/* Avoid __kernel_fpu_begin() right after fpregs_activate() */
kernel_fpu_disable();
@@ -414,15 +341,17 @@ void fpu__drop(struct fpu *fpu)
{
preempt_disable();
- if (fpu->fpregs_active) {
- /* Ignore delayed exceptions from user space */
- asm volatile("1: fwait\n"
- "2:\n"
- _ASM_EXTABLE(1b, 2b));
- fpregs_deactivate(fpu);
+ if (fpu == &current->thread.fpu) {
+ if (fpu->initialized) {
+ /* Ignore delayed exceptions from user space */
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
+ fpregs_deactivate(fpu);
+ }
}
- fpu->fpstate_active = 0;
+ fpu->initialized = 0;
trace_x86_fpu_dropped(fpu);
@@ -462,9 +391,11 @@ void fpu__clear(struct fpu *fpu)
* Make sure fpstate is cleared and initialized.
*/
if (static_cpu_has(X86_FEATURE_FPU)) {
- fpu__activate_curr(fpu);
+ preempt_disable();
+ fpu__initialize(fpu);
user_fpu_begin();
copy_init_fpstate_to_fpregs();
+ preempt_enable();
}
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index d5d44c452624..7affb7e3d9a5 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -240,7 +240,7 @@ static void __init fpu__init_system_ctx_switch(void)
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
- WARN_ON_FPU(current->thread.fpu.fpstate_active);
+ WARN_ON_FPU(current->thread.fpu.initialized);
}
/*
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index b188b16841e3..3ea151372389 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -16,14 +16,14 @@ int regset_fpregs_active(struct task_struct *target, const struct user_regset *r
{
struct fpu *target_fpu = &target->thread.fpu;
- return target_fpu->fpstate_active ? regset->n : 0;
+ return target_fpu->initialized ? regset->n : 0;
}
int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset)
{
struct fpu *target_fpu = &target->thread.fpu;
- if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->fpstate_active)
+ if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized)
return regset->n;
else
return 0;
@@ -38,7 +38,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
if (!boot_cpu_has(X86_FEATURE_FXSR))
return -ENODEV;
- fpu__activate_fpstate_read(fpu);
+ fpu__prepare_read(fpu);
fpstate_sanitize_xstate(fpu);
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
@@ -55,7 +55,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (!boot_cpu_has(X86_FEATURE_FXSR))
return -ENODEV;
- fpu__activate_fpstate_write(fpu);
+ fpu__prepare_write(fpu);
fpstate_sanitize_xstate(fpu);
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
@@ -89,10 +89,13 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
xsave = &fpu->state.xsave;
- fpu__activate_fpstate_read(fpu);
+ fpu__prepare_read(fpu);
if (using_compacted_format()) {
- ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave);
+ if (kbuf)
+ ret = copy_xstate_to_kernel(kbuf, xsave, pos, count);
+ else
+ ret = copy_xstate_to_user(ubuf, xsave, pos, count);
} else {
fpstate_sanitize_xstate(fpu);
/*
@@ -129,28 +132,29 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
xsave = &fpu->state.xsave;
- fpu__activate_fpstate_write(fpu);
+ fpu__prepare_write(fpu);
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- ret = copyin_to_xsaves(kbuf, ubuf, xsave);
- else
+ if (using_compacted_format()) {
+ if (kbuf)
+ ret = copy_kernel_to_xstate(xsave, kbuf);
+ else
+ ret = copy_user_to_xstate(xsave, ubuf);
+ } else {
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
-
- /*
- * In case of failure, mark all states as init:
- */
- if (ret)
- fpstate_init(&fpu->state);
+ if (!ret)
+ ret = validate_xstate_header(&xsave->header);
+ }
/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
xsave->i387.mxcsr &= mxcsr_feature_mask;
- xsave->header.xfeatures &= xfeatures_mask;
+
/*
- * These bits must be zero.
+ * In case of failure, mark all states as init:
*/
- memset(&xsave->header.reserved, 0, 48);
+ if (ret)
+ fpstate_init(&fpu->state);
return ret;
}
@@ -299,7 +303,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
struct fpu *fpu = &target->thread.fpu;
struct user_i387_ia32_struct env;
- fpu__activate_fpstate_read(fpu);
+ fpu__prepare_read(fpu);
if (!boot_cpu_has(X86_FEATURE_FPU))
return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
@@ -329,7 +333,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
struct user_i387_ia32_struct env;
int ret;
- fpu__activate_fpstate_write(fpu);
+ fpu__prepare_write(fpu);
fpstate_sanitize_xstate(fpu);
if (!boot_cpu_has(X86_FEATURE_FPU))
@@ -369,7 +373,7 @@ int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu)
struct fpu *fpu = &tsk->thread.fpu;
int fpvalid;
- fpvalid = fpu->fpstate_active;
+ fpvalid = fpu->initialized;
if (fpvalid)
fpvalid = !fpregs_get(tsk, NULL,
0, sizeof(struct user_i387_ia32_struct),
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 83c23c230b4c..fb639e70048f 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -155,7 +155,8 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
*/
int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
{
- struct xregs_state *xsave = &current->thread.fpu.state.xsave;
+ struct fpu *fpu = &current->thread.fpu;
+ struct xregs_state *xsave = &fpu->state.xsave;
struct task_struct *tsk = current;
int ia32_fxstate = (buf != buf_fx);
@@ -170,13 +171,13 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
sizeof(struct user_i387_ia32_struct), NULL,
(struct _fpstate_32 __user *) buf) ? -1 : 1;
- if (fpregs_active() || using_compacted_format()) {
+ if (fpu->initialized || using_compacted_format()) {
/* Save the live register state to the user directly. */
if (copy_fpregs_to_sigframe(buf_fx))
return -1;
/* Update the thread's fxstate to save the fsave header. */
if (ia32_fxstate)
- copy_fxregs_to_kernel(&tsk->thread.fpu);
+ copy_fxregs_to_kernel(fpu);
} else {
/*
* It is a *bug* if kernel uses compacted-format for xsave
@@ -189,7 +190,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
return -1;
}
- fpstate_sanitize_xstate(&tsk->thread.fpu);
+ fpstate_sanitize_xstate(fpu);
if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size))
return -1;
}
@@ -213,8 +214,11 @@ sanitize_restored_xstate(struct task_struct *tsk,
struct xstate_header *header = &xsave->header;
if (use_xsave()) {
- /* These bits must be zero. */
- memset(header->reserved, 0, 48);
+ /*
+ * Note: we don't need to zero the reserved bits in the
+ * xstate_header here because we either didn't copy them at all,
+ * or we checked earlier that they aren't set.
+ */
/*
* Init the state that is not present in the memory
@@ -223,7 +227,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
if (fx_only)
header->xfeatures = XFEATURE_MASK_FPSSE;
else
- header->xfeatures &= (xfeatures_mask & xfeatures);
+ header->xfeatures &= xfeatures;
}
if (use_fxsr()) {
@@ -279,7 +283,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
if (!access_ok(VERIFY_READ, buf, size))
return -EACCES;
- fpu__activate_curr(fpu);
+ fpu__initialize(fpu);
if (!static_cpu_has(X86_FEATURE_FPU))
return fpregs_soft_set(current, NULL,
@@ -307,28 +311,29 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
/*
* For 32-bit frames with fxstate, copy the user state to the
* thread's fpu state, reconstruct fxstate from the fsave
- * header. Sanitize the copied state etc.
+ * header. Validate and sanitize the copied state.
*/
struct fpu *fpu = &tsk->thread.fpu;
struct user_i387_ia32_struct env;
int err = 0;
/*
- * Drop the current fpu which clears fpu->fpstate_active. This ensures
+ * Drop the current fpu which clears fpu->initialized. This ensures
* that any context-switch during the copy of the new state,
* avoids the intermediate state from getting restored/saved.
* Thus avoiding the new restored state from getting corrupted.
* We will be ready to restore/save the state only after
- * fpu->fpstate_active is again set.
+ * fpu->initialized is again set.
*/
fpu__drop(fpu);
if (using_compacted_format()) {
- err = copyin_to_xsaves(NULL, buf_fx,
- &fpu->state.xsave);
+ err = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
} else {
- err = __copy_from_user(&fpu->state.xsave,
- buf_fx, state_size);
+ err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
+
+ if (!err && state_size > offsetof(struct xregs_state, header))
+ err = validate_xstate_header(&fpu->state.xsave.header);
}
if (err || __copy_from_user(&env, buf, sizeof(env))) {
@@ -339,7 +344,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
sanitize_restored_xstate(tsk, &env, xfeatures, fx_only);
}
- fpu->fpstate_active = 1;
+ fpu->initialized = 1;
preempt_disable();
fpu__restore(fpu);
preempt_enable();
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c24ac1efb12d..f1d5476c9022 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -483,6 +483,30 @@ int using_compacted_format(void)
return boot_cpu_has(X86_FEATURE_XSAVES);
}
+/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
+int validate_xstate_header(const struct xstate_header *hdr)
+{
+ /* No unknown or supervisor features may be set */
+ if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR))
+ return -EINVAL;
+
+ /* Userspace must use the uncompacted format */
+ if (hdr->xcomp_bv)
+ return -EINVAL;
+
+ /*
+ * If 'reserved' is shrunken to add a new field, make sure to validate
+ * that new field here!
+ */
+ BUILD_BUG_ON(sizeof(hdr->reserved) != 48);
+
+ /* No reserved bits may be set */
+ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved)))
+ return -EINVAL;
+
+ return 0;
+}
+
static void __xstate_dump_leaves(void)
{
int i;
@@ -867,7 +891,7 @@ const void *get_xsave_field_ptr(int xsave_state)
{
struct fpu *fpu = &current->thread.fpu;
- if (!fpu->fpstate_active)
+ if (!fpu->initialized)
return NULL;
/*
* fpu__save() takes the CPU's xstate registers
@@ -921,38 +945,129 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
#endif /* ! CONFIG_ARCH_HAS_PKEYS */
/*
+ * Weird legacy quirk: SSE and YMM states store information in the
+ * MXCSR and MXCSR_FLAGS fields of the FP area. That means if the FP
+ * area is marked as unused in the xfeatures header, we need to copy
+ * MXCSR and MXCSR_FLAGS if either SSE or YMM are in use.
+ */
+static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
+{
+ if (!(xfeatures & (XFEATURE_MASK_SSE|XFEATURE_MASK_YMM)))
+ return false;
+
+ if (xfeatures & XFEATURE_MASK_FP)
+ return false;
+
+ return true;
+}
+
+/*
* This is similar to user_regset_copyout(), but will not add offset to
* the source data pointer or increment pos, count, kbuf, and ubuf.
*/
-static inline int xstate_copyout(unsigned int pos, unsigned int count,
- void *kbuf, void __user *ubuf,
- const void *data, const int start_pos,
- const int end_pos)
+static inline void
+__copy_xstate_to_kernel(void *kbuf, const void *data,
+ unsigned int offset, unsigned int size, unsigned int size_total)
{
- if ((count == 0) || (pos < start_pos))
- return 0;
+ if (offset < size_total) {
+ unsigned int copy = min(size, size_total - offset);
- if (end_pos < 0 || pos < end_pos) {
- unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos));
+ memcpy(kbuf + offset, data, copy);
+ }
+}
- if (kbuf) {
- memcpy(kbuf + pos, data, copy);
- } else {
- if (__copy_to_user(ubuf + pos, data, copy))
- return -EFAULT;
+/*
+ * Convert from kernel XSAVES compacted format to standard format and copy
+ * to a kernel-space ptrace buffer.
+ *
+ * It supports partial copy but pos always starts from zero. This is called
+ * from xstateregs_get() and there we check the CPU has XSAVES.
+ */
+int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
+{
+ unsigned int offset, size;
+ struct xstate_header header;
+ int i;
+
+ /*
+ * Currently copy_regset_to_user() starts from pos 0:
+ */
+ if (unlikely(offset_start != 0))
+ return -EFAULT;
+
+ /*
+ * The destination is a ptrace buffer; we put in only user xstates:
+ */
+ memset(&header, 0, sizeof(header));
+ header.xfeatures = xsave->header.xfeatures;
+ header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+
+ /*
+ * Copy xregs_state->header:
+ */
+ offset = offsetof(struct xregs_state, header);
+ size = sizeof(header);
+
+ __copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ /*
+ * Copy only in-use xstates:
+ */
+ if ((header.xfeatures >> i) & 1) {
+ void *src = __raw_xsave_addr(xsave, 1 << i);
+
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ /* The next component has to fit fully into the output buffer: */
+ if (offset + size > size_total)
+ break;
+
+ __copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
}
+
+ }
+
+ if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+ offset = offsetof(struct fxregs_state, mxcsr);
+ size = MXCSR_AND_FLAGS_SIZE;
+ __copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total);
+ }
+
+ /*
+ * Fill xsave->i387.sw_reserved value for ptrace frame:
+ */
+ offset = offsetof(struct fxregs_state, sw_reserved);
+ size = sizeof(xstate_fx_sw_bytes);
+
+ __copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
+
+ return 0;
+}
+
+static inline int
+__copy_xstate_to_user(void __user *ubuf, const void *data, unsigned int offset, unsigned int size, unsigned int size_total)
+{
+ if (!size)
+ return 0;
+
+ if (offset < size_total) {
+ unsigned int copy = min(size, size_total - offset);
+
+ if (__copy_to_user(ubuf + offset, data, copy))
+ return -EFAULT;
}
return 0;
}
/*
* Convert from kernel XSAVES compacted format to standard format and copy
- * to a ptrace buffer. It supports partial copy but pos always starts from
+ * to a user-space buffer. It supports partial copy but pos always starts from
* zero. This is called from xstateregs_get() and there we check the CPU
* has XSAVES.
*/
-int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
- void __user *ubuf, struct xregs_state *xsave)
+int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
{
unsigned int offset, size;
int ret, i;
@@ -961,7 +1076,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
/*
* Currently copy_regset_to_user() starts from pos 0:
*/
- if (unlikely(pos != 0))
+ if (unlikely(offset_start != 0))
return -EFAULT;
/*
@@ -977,8 +1092,7 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
offset = offsetof(struct xregs_state, header);
size = sizeof(header);
- ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count);
-
+ ret = __copy_xstate_to_user(ubuf, &header, offset, size, size_total);
if (ret)
return ret;
@@ -992,25 +1106,30 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
offset = xstate_offsets[i];
size = xstate_sizes[i];
- ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count);
+ /* The next component has to fit fully into the output buffer: */
+ if (offset + size > size_total)
+ break;
+ ret = __copy_xstate_to_user(ubuf, src, offset, size, size_total);
if (ret)
return ret;
-
- if (offset + size >= count)
- break;
}
}
+ if (xfeatures_mxcsr_quirk(header.xfeatures)) {
+ offset = offsetof(struct fxregs_state, mxcsr);
+ size = MXCSR_AND_FLAGS_SIZE;
+ __copy_xstate_to_user(ubuf, &xsave->i387.mxcsr, offset, size, size_total);
+ }
+
/*
* Fill xsave->i387.sw_reserved value for ptrace frame:
*/
offset = offsetof(struct fxregs_state, sw_reserved);
size = sizeof(xstate_fx_sw_bytes);
- ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count);
-
+ ret = __copy_xstate_to_user(ubuf, xstate_fx_sw_bytes, offset, size, size_total);
if (ret)
return ret;
@@ -1018,55 +1137,98 @@ int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf,
}
/*
- * Convert from a ptrace standard-format buffer to kernel XSAVES format
- * and copy to the target thread. This is called from xstateregs_set() and
- * there we check the CPU has XSAVES and a whole standard-sized buffer
- * exists.
+ * Convert from a ptrace standard-format kernel buffer to kernel XSAVES format
+ * and copy to the target thread. This is called from xstateregs_set().
*/
-int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
- struct xregs_state *xsave)
+int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
{
unsigned int offset, size;
int i;
- u64 xfeatures;
- u64 allowed_features;
+ struct xstate_header hdr;
offset = offsetof(struct xregs_state, header);
- size = sizeof(xfeatures);
+ size = sizeof(hdr);
- if (kbuf) {
- memcpy(&xfeatures, kbuf + offset, size);
- } else {
- if (__copy_from_user(&xfeatures, ubuf + offset, size))
- return -EFAULT;
+ memcpy(&hdr, kbuf + offset, size);
+
+ if (validate_xstate_header(&hdr))
+ return -EINVAL;
+
+ for (i = 0; i < XFEATURE_MAX; i++) {
+ u64 mask = ((u64)1 << i);
+
+ if (hdr.xfeatures & mask) {
+ void *dst = __raw_xsave_addr(xsave, 1 << i);
+
+ offset = xstate_offsets[i];
+ size = xstate_sizes[i];
+
+ memcpy(dst, kbuf + offset, size);
+ }
+ }
+
+ if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
+ offset = offsetof(struct fxregs_state, mxcsr);
+ size = MXCSR_AND_FLAGS_SIZE;
+ memcpy(&xsave->i387.mxcsr, kbuf + offset, size);
}
/*
- * Reject if the user sets any disabled or supervisor features:
+ * The state that came in from userspace was user-state only.
+ * Mask all the user states out of 'xfeatures':
+ */
+ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+
+ /*
+ * Add back in the features that came in from userspace:
*/
- allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR;
+ xsave->header.xfeatures |= hdr.xfeatures;
- if (xfeatures & ~allowed_features)
+ return 0;
+}
+
+/*
+ * Convert from a ptrace or sigreturn standard-format user-space buffer to
+ * kernel XSAVES format and copy to the target thread. This is called from
+ * xstateregs_set(), as well as potentially from the sigreturn() and
+ * rt_sigreturn() system calls.
+ */
+int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
+{
+ unsigned int offset, size;
+ int i;
+ struct xstate_header hdr;
+
+ offset = offsetof(struct xregs_state, header);
+ size = sizeof(hdr);
+
+ if (__copy_from_user(&hdr, ubuf + offset, size))
+ return -EFAULT;
+
+ if (validate_xstate_header(&hdr))
return -EINVAL;
for (i = 0; i < XFEATURE_MAX; i++) {
u64 mask = ((u64)1 << i);
- if (xfeatures & mask) {
+ if (hdr.xfeatures & mask) {
void *dst = __raw_xsave_addr(xsave, 1 << i);
offset = xstate_offsets[i];
size = xstate_sizes[i];
- if (kbuf) {
- memcpy(dst, kbuf + offset, size);
- } else {
- if (__copy_from_user(dst, ubuf + offset, size))
- return -EFAULT;
- }
+ if (__copy_from_user(dst, ubuf + offset, size))
+ return -EFAULT;
}
}
+ if (xfeatures_mxcsr_quirk(hdr.xfeatures)) {
+ offset = offsetof(struct fxregs_state, mxcsr);
+ size = MXCSR_AND_FLAGS_SIZE;
+ if (__copy_from_user(&xsave->i387.mxcsr, ubuf + offset, size))
+ return -EFAULT;
+ }
+
/*
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
@@ -1076,7 +1238,7 @@ int copyin_to_xsaves(const void *kbuf, const void __user *ubuf,
/*
* Add back in the features that came in from userspace:
*/
- xsave->header.xfeatures |= xfeatures;
+ xsave->header.xfeatures |= hdr.xfeatures;
return 0;
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e04442345fc0..4e188fda5961 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -263,7 +263,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
sp = (unsigned long) ka->sa.sa_restorer;
}
- if (fpu->fpstate_active) {
+ if (fpu->initialized) {
sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32),
&buf_fx, &math_size);
*fpstate = (void __user *)sp;
@@ -279,7 +279,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
return (void __user *)-1L;
/* save i387 and extended state */
- if (fpu->fpstate_active &&
+ if (fpu->initialized &&
copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0)
return (void __user *)-1L;
@@ -755,7 +755,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
/*
* Ensure the signal handler starts with the new fpu state.
*/
- if (fpu->fpstate_active)
+ if (fpu->initialized)
fpu__clear(fpu);
}
signal_setup_done(failed, ksig, stepping);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 0854ff169274..ad59edd84de7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -232,12 +232,6 @@ static void notrace start_secondary(void *unused)
*/
if (boot_cpu_has(X86_FEATURE_PCID))
__write_cr4(__read_cr4() | X86_CR4_PCIDE);
- cpu_init();
- x86_cpuinit.early_percpu_clock_init();
- preempt_disable();
- smp_callin();
-
- enable_start_cpu0 = 0;
#ifdef CONFIG_X86_32
/* switch away from the initial page table */
@@ -245,6 +239,13 @@ static void notrace start_secondary(void *unused)
__flush_tlb_all();
#endif
+ cpu_init();
+ x86_cpuinit.early_percpu_clock_init();
+ preempt_disable();
+ smp_callin();
+
+ enable_start_cpu0 = 0;
+
/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 15f527b44aa7..a36254cbf776 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5298,7 +5298,6 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt,
static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
{
- register void *__sp asm(_ASM_SP);
ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF;
if (!(ctxt->d & ByteOp))
@@ -5306,7 +5305,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))
asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n"
: "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags),
- [fastop]"+S"(fop), "+r"(__sp)
+ [fastop]"+S"(fop), ASM_CALL_CONSTRAINT
: "c"(ctxt->src2.val));
ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5bfa353f6354..a2b804e10c95 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -9043,7 +9043,6 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
{
u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- register void *__sp asm(_ASM_SP);
if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
@@ -9072,7 +9071,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
#ifdef CONFIG_X86_64
[sp]"=&r"(tmp),
#endif
- "+r"(__sp)
+ ASM_CALL_CONSTRAINT
:
[entry]"r"(entry),
[ss]"i"(__KERNEL_DS),
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cd17b7d9a107..03869eb7fcd6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7225,7 +7225,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
- fpu__activate_curr(fpu);
+ fpu__initialize(fpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index d4a7df2205b8..220638a4cb94 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -114,7 +114,7 @@ void math_emulate(struct math_emu_info *info)
struct desc_struct code_descriptor;
struct fpu *fpu = &current->thread.fpu;
- fpu__activate_curr(fpu);
+ fpu__initialize(fpu);
#ifdef RE_ENTRANT_CHECKING
if (emulating) {
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index c076f710de4c..c3521e2be396 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -2,6 +2,7 @@
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
+#include <asm/fpu/internal.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
@@ -78,6 +79,29 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,
}
EXPORT_SYMBOL_GPL(ex_handler_refcount);
+/*
+ * Handler for when we fail to restore a task's FPU state. We should never get
+ * here because the FPU state of a task using the FPU (task->thread.fpu.state)
+ * should always be valid. However, past bugs have allowed userspace to set
+ * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
+ * These caused XRSTOR to fail when switching to the task, leaking the FPU
+ * registers of the task previously executing on the CPU. Mitigate this class
+ * of vulnerability by restoring from the initial state (essentially, zeroing
+ * out all the FPU registers) if we can't restore from the task's FPU state.
+ */
+bool ex_handler_fprestore(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr)
+{
+ regs->ip = ex_fixup_addr(fixup);
+
+ WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
+ (void *)instruction_pointer(regs));
+
+ __copy_kernel_to_fpregs(&init_fpstate, -1);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ex_handler_fprestore);
+
bool ex_handler_ext(const struct exception_table_entry *fixup,
struct pt_regs *regs, int trapnr)
{
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b836a7274e12..39567b5c33da 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -806,7 +806,6 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_vmalloc_addr((void *)address) &&
(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
- register void *__sp asm("rsp");
unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
/*
* We're likely to be running with very little stack space
@@ -821,7 +820,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
asm volatile ("movq %[stack], %%rsp\n\t"
"call handle_stack_overflow\n\t"
"1: jmp 1b"
- : "+r" (__sp)
+ : ASM_CALL_CONSTRAINT
: "D" ("kernel stack overflow (page fault)"),
"S" (regs), "d" (address),
[stack] "rm" (stack));
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 2dab69a706ec..d7bc0eea20a5 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -18,7 +18,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/mmu_context.h> /* vma_pkey() */
-#include <asm/fpu/internal.h> /* fpregs_active() */
int __execute_only_pkey(struct mm_struct *mm)
{
@@ -45,7 +44,7 @@ int __execute_only_pkey(struct mm_struct *mm)
*/
preempt_disable();
if (!need_to_set_mm_pkey &&
- fpregs_active() &&
+ current->thread.fpu.initialized &&
!__pkru_allows_read(read_pkru(), execute_only_pkey)) {
preempt_enable();
return execute_only_pkey;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 1ab3821f9e26..93fe97cce581 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -126,8 +126,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* isn't free.
*/
#ifdef CONFIG_DEBUG_VM
- if (WARN_ON_ONCE(__read_cr3() !=
- (__sme_pa(real_prev->pgd) | prev_asid))) {
+ if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
/*
* If we were to BUG here, we'd be very likely to kill
* the system so hard that we don't see the call trace.
@@ -172,7 +171,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
*/
this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
next_tlb_gen);
- write_cr3(__sme_pa(next->pgd) | prev_asid);
+ write_cr3(build_cr3(next, prev_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL);
}
@@ -216,12 +215,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (need_flush) {
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- write_cr3(__sme_pa(next->pgd) | new_asid);
+ write_cr3(build_cr3(next, new_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
TLB_FLUSH_ALL);
} else {
/* The new ASID is already up to date. */
- write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+ write_cr3(build_cr3_noflush(next, new_asid));
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
}
@@ -265,7 +264,7 @@ void initialize_tlbstate_and_flush(void)
!(cr4_read_shadow() & X86_CR4_PCIDE));
/* Force ASID 0 and force a TLB flush. */
- write_cr3(cr3 & ~CR3_PCID_MASK);
+ write_cr3(build_cr3(mm, 0));
/* Reinitialize tlbstate. */
this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 509f560bd0c6..7330cb3b2283 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2220,7 +2220,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
* not the first page table in the page table pool.
* Iterate through the initial page tables to find the real page table base.
*/
-static phys_addr_t xen_find_pt_base(pmd_t *pmd)
+static phys_addr_t __init xen_find_pt_base(pmd_t *pmd)
{
phys_addr_t pt_base, paddr;
unsigned pmdidx;