diff options
author | 2025-06-05 11:23:07 -0700 | |
---|---|---|
committer | 2025-06-05 14:03:16 -0700 | |
commit | 2670a39b1ea68fb0b9175e26e299f3fe974e0332 (patch) | |
tree | e0a6198c7e5e3628e4e3aff52bce963472646647 /arch | |
parent | RISC-V: Documentation: Add enough title underlines to CMODX (diff) | |
parent | RISC-V: vDSO: Wire up getrandom() vDSO implementation (diff) | |
download | linux-rng-2670a39b1ea68fb0b9175e26e299f3fe974e0332.tar.xz linux-rng-2670a39b1ea68fb0b9175e26e299f3fe974e0332.zip |
Merge tag 'riscv-mw2-6.16-rc1' of ssh://gitolite.kernel.org/pub/scm/linux/kernel/git/alexghiti/linux into for-next
riscv patches for 6.16-rc1, part 2
* Performance improvements
- Add support for vdso getrandom
- Implement raid6 calculations using vectors
- Introduce svinval tlb invalidation
* Cleanup
- A bunch of deduplication of the macros we use for manipulating instructions
* Misc
- Introduce a kunit test for kprobes
- Add support for mseal as riscv fits the requirements (thanks to Lorenzo for making sure of that :))
[Palmer: There was a rebase between part 1 and part 2, so I've had to do
some more git surgery here... at least two rounds of surgery...]
* alex-pr-2: (866 commits)
RISC-V: vDSO: Wire up getrandom() vDSO implementation
riscv: enable mseal sysmap for RV64
raid6: Add RISC-V SIMD syndrome and recovery calculations
riscv: mm: Add support for Svinval extension
riscv: Add kprobes KUnit test
riscv: kprobes: Remove duplication of RV_EXTRACT_ITYPE_IMM
riscv: kprobes: Remove duplication of RV_EXTRACT_UTYPE_IMM
riscv: kprobes: Remove duplication of RV_EXTRACT_RD_REG
riscv: kprobes: Remove duplication of RVC_EXTRACT_BTYPE_IMM
riscv: kprobes: Remove duplication of RVC_EXTRACT_C2_RS1_REG
riscv: kproves: Remove duplication of RVC_EXTRACT_JTYPE_IMM
riscv: kprobes: Remove duplication of RV_EXTRACT_BTYPE_IMM
riscv: kprobes: Remove duplication of RV_EXTRACT_RS1_REG
riscv: kprobes: Remove duplication of RV_EXTRACT_JTYPE_IMM
riscv: kprobes: Move branch_funct3 to insn.h
riscv: kprobes: Move branch_rs2_idx to insn.h
Linux 6.15-rc6
Input: xpad - fix xpad_device sorting
Input: xpad - add support for several more controllers
Input: xpad - fix Share button on Xbox One controllers
...
Diffstat (limited to 'arch')
117 files changed, 1390 insertions, 527 deletions
diff --git a/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi b/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi index f2386dcb9ff2..dda4fa91b2f2 100644 --- a/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi +++ b/arch/arm/boot/dts/nxp/imx/imx6ul-imx6ull-opos6ul.dtsi @@ -40,6 +40,9 @@ reg = <1>; interrupt-parent = <&gpio4>; interrupts = <16 IRQ_TYPE_LEVEL_LOW>; + micrel,led-mode = <1>; + clocks = <&clks IMX6UL_CLK_ENET_REF>; + clock-names = "rmii-ref"; status = "okay"; }; }; diff --git a/arch/arm64/boot/dts/arm/morello.dtsi b/arch/arm64/boot/dts/arm/morello.dtsi index 0bab0b3ea969..5bc1c725dc86 100644 --- a/arch/arm64/boot/dts/arm/morello.dtsi +++ b/arch/arm64/boot/dts/arm/morello.dtsi @@ -44,7 +44,7 @@ next-level-cache = <&l2_0>; clocks = <&scmi_dvfs 0>; - l2_0: l2-cache-0 { + l2_0: l2-cache { compatible = "cache"; cache-level = <2>; /* 8 ways set associative */ @@ -53,13 +53,6 @@ cache-sets = <2048>; cache-unified; next-level-cache = <&l3_0>; - - l3_0: l3-cache { - compatible = "cache"; - cache-level = <3>; - cache-size = <0x100000>; - cache-unified; - }; }; }; @@ -78,7 +71,7 @@ next-level-cache = <&l2_1>; clocks = <&scmi_dvfs 0>; - l2_1: l2-cache-1 { + l2_1: l2-cache { compatible = "cache"; cache-level = <2>; /* 8 ways set associative */ @@ -105,7 +98,7 @@ next-level-cache = <&l2_2>; clocks = <&scmi_dvfs 1>; - l2_2: l2-cache-2 { + l2_2: l2-cache { compatible = "cache"; cache-level = <2>; /* 8 ways set associative */ @@ -132,7 +125,7 @@ next-level-cache = <&l2_3>; clocks = <&scmi_dvfs 1>; - l2_3: l2-cache-3 { + l2_3: l2-cache { compatible = "cache"; cache-level = <2>; /* 8 ways set associative */ @@ -143,6 +136,13 @@ next-level-cache = <&l3_0>; }; }; + + l3_0: l3-cache { + compatible = "cache"; + cache-level = <3>; + cache-size = <0x100000>; + cache-unified; + }; }; firmware { diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index 7251ad3a0017..b46566f3ce20 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -144,6 +144,19 @@ startup-delay-us = <20000>; }; + reg_usdhc2_vqmmc: regulator-usdhc2-vqmmc { + compatible = "regulator-gpio"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_usdhc2_vsel>; + gpios = <&gpio1 4 GPIO_ACTIVE_HIGH>; + regulator-max-microvolt = <3300000>; + regulator-min-microvolt = <1800000>; + states = <1800000 0x1>, + <3300000 0x0>; + regulator-name = "PMIC_USDHC_VSELECT"; + vin-supply = <®_nvcc_sd>; + }; + reserved-memory { #address-cells = <2>; #size-cells = <2>; @@ -269,7 +282,7 @@ "SODIMM_19", "", "", - "", + "PMIC_USDHC_VSELECT", "", "", "", @@ -785,6 +798,7 @@ pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_cd>; pinctrl-3 = <&pinctrl_usdhc2_sleep>, <&pinctrl_usdhc2_cd_sleep>; vmmc-supply = <®_usdhc2_vmmc>; + vqmmc-supply = <®_usdhc2_vqmmc>; }; &wdog1 { @@ -1206,13 +1220,17 @@ <MX8MM_IOMUXC_NAND_CLE_GPIO3_IO5 0x6>; /* SODIMM 76 */ }; + pinctrl_usdhc2_vsel: usdhc2vselgrp { + fsl,pins = + <MX8MM_IOMUXC_GPIO1_IO04_GPIO1_IO4 0x10>; /* PMIC_USDHC_VSELECT */ + }; + /* * Note: Due to ERR050080 we use discrete external on-module resistors pulling-up to the * on-module +V3.3_1.8_SD (LDO5) rail and explicitly disable the internal pull-ups here. */ pinctrl_usdhc2: usdhc2grp { fsl,pins = - <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>, <MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x90>, /* SODIMM 78 */ <MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x90>, /* SODIMM 74 */ <MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x90>, /* SODIMM 80 */ @@ -1223,7 +1241,6 @@ pinctrl_usdhc2_100mhz: usdhc2-100mhzgrp { fsl,pins = - <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>, <MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x94>, <MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x94>, <MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x94>, @@ -1234,7 +1251,6 @@ pinctrl_usdhc2_200mhz: usdhc2-200mhzgrp { fsl,pins = - <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x10>, <MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x96>, <MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x96>, <MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x96>, @@ -1246,7 +1262,6 @@ /* Avoid backfeeding with removed card power */ pinctrl_usdhc2_sleep: usdhc2slpgrp { fsl,pins = - <MX8MM_IOMUXC_GPIO1_IO04_USDHC2_VSELECT 0x0>, <MX8MM_IOMUXC_SD2_CLK_USDHC2_CLK 0x0>, <MX8MM_IOMUXC_SD2_CMD_USDHC2_CMD 0x0>, <MX8MM_IOMUXC_SD2_DATA0_USDHC2_DATA0 0x0>, diff --git a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi index a1b75c9068b2..dc0ccd723c6d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-nominal.dtsi @@ -24,6 +24,20 @@ fsl,operating-mode = "nominal"; }; +&gpu2d { + assigned-clocks = <&clk IMX8MP_CLK_GPU2D_CORE>; + assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>; + assigned-clock-rates = <800000000>; +}; + +&gpu3d { + assigned-clocks = <&clk IMX8MP_CLK_GPU3D_CORE>, + <&clk IMX8MP_CLK_GPU3D_SHADER_CORE>; + assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>, + <&clk IMX8MP_SYS_PLL1_800M>; + assigned-clock-rates = <800000000>, <800000000>; +}; + &pgc_hdmimix { assigned-clocks = <&clk IMX8MP_CLK_HDMI_AXI>, <&clk IMX8MP_CLK_HDMI_APB>; @@ -46,6 +60,18 @@ assigned-clock-rates = <600000000>, <300000000>; }; +&pgc_mlmix { + assigned-clocks = <&clk IMX8MP_CLK_ML_CORE>, + <&clk IMX8MP_CLK_ML_AXI>, + <&clk IMX8MP_CLK_ML_AHB>; + assigned-clock-parents = <&clk IMX8MP_SYS_PLL1_800M>, + <&clk IMX8MP_SYS_PLL1_800M>, + <&clk IMX8MP_SYS_PLL1_800M>; + assigned-clock-rates = <800000000>, + <800000000>, + <300000000>; +}; + &media_blk_ctrl { assigned-clocks = <&clk IMX8MP_CLK_MEDIA_AXI>, <&clk IMX8MP_CLK_MEDIA_APB>, diff --git a/arch/arm64/boot/dts/freescale/imx95.dtsi b/arch/arm64/boot/dts/freescale/imx95.dtsi index 9bb26b466a06..59f057ba6fa7 100644 --- a/arch/arm64/boot/dts/freescale/imx95.dtsi +++ b/arch/arm64/boot/dts/freescale/imx95.dtsi @@ -1626,7 +1626,7 @@ reg = <0 0x4c300000 0 0x10000>, <0 0x60100000 0 0xfe00000>, <0 0x4c360000 0 0x10000>, - <0 0x4c340000 0 0x2000>; + <0 0x4c340000 0 0x4000>; reg-names = "dbi", "config", "atu", "app"; ranges = <0x81000000 0x0 0x00000000 0x0 0x6ff00000 0 0x00100000>, <0x82000000 0x0 0x10000000 0x9 0x10000000 0 0x10000000>; @@ -1673,7 +1673,7 @@ reg = <0 0x4c300000 0 0x10000>, <0 0x4c360000 0 0x1000>, <0 0x4c320000 0 0x1000>, - <0 0x4c340000 0 0x2000>, + <0 0x4c340000 0 0x4000>, <0 0x4c370000 0 0x10000>, <0x9 0 1 0>; reg-names = "dbi","atu", "dbi2", "app", "dma", "addr_space"; @@ -1700,7 +1700,7 @@ reg = <0 0x4c380000 0 0x10000>, <8 0x80100000 0 0xfe00000>, <0 0x4c3e0000 0 0x10000>, - <0 0x4c3c0000 0 0x2000>; + <0 0x4c3c0000 0 0x4000>; reg-names = "dbi", "config", "atu", "app"; ranges = <0x81000000 0 0x00000000 0x8 0x8ff00000 0 0x00100000>, <0x82000000 0 0x10000000 0xa 0x10000000 0 0x10000000>; @@ -1749,7 +1749,7 @@ reg = <0 0x4c380000 0 0x10000>, <0 0x4c3e0000 0 0x1000>, <0 0x4c3a0000 0 0x1000>, - <0 0x4c3c0000 0 0x2000>, + <0 0x4c3c0000 0 0x4000>, <0 0x4c3f0000 0 0x10000>, <0xa 0 1 0>; reg-names = "dbi", "atu", "dbi2", "app", "dma", "addr_space"; diff --git a/arch/arm64/boot/dts/st/stm32mp211.dtsi b/arch/arm64/boot/dts/st/stm32mp211.dtsi index 6dd1377f3e1d..bf888d60cd4f 100644 --- a/arch/arm64/boot/dts/st/stm32mp211.dtsi +++ b/arch/arm64/boot/dts/st/stm32mp211.dtsi @@ -116,11 +116,11 @@ }; intc: interrupt-controller@4ac10000 { - compatible = "arm,cortex-a7-gic"; + compatible = "arm,gic-400"; reg = <0x4ac10000 0x0 0x1000>, - <0x4ac20000 0x0 0x2000>, - <0x4ac40000 0x0 0x2000>, - <0x4ac60000 0x0 0x2000>; + <0x4ac20000 0x0 0x20000>, + <0x4ac40000 0x0 0x20000>, + <0x4ac60000 0x0 0x20000>; #interrupt-cells = <3>; interrupt-controller; }; diff --git a/arch/arm64/boot/dts/st/stm32mp231.dtsi b/arch/arm64/boot/dts/st/stm32mp231.dtsi index 8820d219a33e..75697acd1345 100644 --- a/arch/arm64/boot/dts/st/stm32mp231.dtsi +++ b/arch/arm64/boot/dts/st/stm32mp231.dtsi @@ -1201,13 +1201,12 @@ }; intc: interrupt-controller@4ac10000 { - compatible = "arm,cortex-a7-gic"; + compatible = "arm,gic-400"; reg = <0x4ac10000 0x1000>, - <0x4ac20000 0x2000>, - <0x4ac40000 0x2000>, - <0x4ac60000 0x2000>; + <0x4ac20000 0x20000>, + <0x4ac40000 0x20000>, + <0x4ac60000 0x20000>; #interrupt-cells = <3>; - #address-cells = <1>; interrupt-controller; }; }; diff --git a/arch/arm64/boot/dts/st/stm32mp251.dtsi b/arch/arm64/boot/dts/st/stm32mp251.dtsi index f3c6cdfd7008..87110f91e489 100644 --- a/arch/arm64/boot/dts/st/stm32mp251.dtsi +++ b/arch/arm64/boot/dts/st/stm32mp251.dtsi @@ -115,14 +115,13 @@ }; intc: interrupt-controller@4ac00000 { - compatible = "arm,cortex-a7-gic"; + compatible = "arm,gic-400"; #interrupt-cells = <3>; - #address-cells = <1>; interrupt-controller; reg = <0x0 0x4ac10000 0x0 0x1000>, - <0x0 0x4ac20000 0x0 0x2000>, - <0x0 0x4ac40000 0x0 0x2000>, - <0x0 0x4ac60000 0x0 0x2000>; + <0x0 0x4ac20000 0x0 0x20000>, + <0x0 0x4ac40000 0x0 0x20000>, + <0x0 0x4ac60000 0x0 0x20000>; }; psci { diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index ebceaae3c749..d40e427ddad9 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -52,7 +52,7 @@ mrs x0, id_aa64mmfr1_el1 ubfx x0, x0, #ID_AA64MMFR1_EL1_HCX_SHIFT, #4 cbz x0, .Lskip_hcrx_\@ - mov_q x0, HCRX_HOST_FLAGS + mov_q x0, (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM) /* Enable GCS if supported */ mrs_s x1, SYS_ID_AA64PFR1_EL1 diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 974d72b5905b..e9c8a581e16f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -100,9 +100,8 @@ HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID1) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) -#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H) +#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H | HCR_AMO | HCR_IMO | HCR_FMO) -#define HCRX_HOST_FLAGS (HCRX_EL2_MSCEn | HCRX_EL2_TCR2En | HCRX_EL2_EnFPM) #define MPAMHCR_HOST_FLAGS 0 /* TCR_EL2 Registers bits */ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e98cfe7855a6..08ba91e6fb03 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1588,4 +1588,9 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); #define kvm_has_s1poe(k) \ (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) +static inline bool kvm_arch_has_irq_bypass(void) +{ + return true; +} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 30a29e88994b..6e8aa8e72601 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -94,17 +94,6 @@ static inline bool kaslr_requires_kpti(void) return false; } - /* - * Systems affected by Cavium erratum 24756 are incompatible - * with KPTI. - */ - if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { - extern const struct midr_range cavium_erratum_27456_cpus[]; - - if (is_midr_in_range_list(cavium_erratum_27456_cpus)) - return false; - } - return true; } diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h index 92a2b59a9f3d..3322c7047d84 100644 --- a/arch/arm64/include/asm/vdso/gettimeofday.h +++ b/arch/arm64/include/asm/vdso/gettimeofday.h @@ -99,6 +99,19 @@ static __always_inline u64 __arch_get_hw_counter(s32 clock_mode, return res; } +#if IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) +static __always_inline const struct vdso_time_data *__arch_get_vdso_u_time_data(void) +{ + const struct vdso_time_data *ret = &vdso_u_time_data; + + /* Work around invalid absolute relocations */ + OPTIMIZER_HIDE_VAR(ret); + + return ret; +} +#define __arch_get_vdso_u_time_data __arch_get_vdso_u_time_data +#endif /* IS_ENABLED(CONFIG_CC_IS_GCC) && IS_ENABLED(CONFIG_PAGE_SIZE_64KB) */ + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index b55f5f705750..6b0ad5070d3e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -335,7 +335,7 @@ static const struct midr_range cavium_erratum_23154_cpus[] = { #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 -const struct midr_range cavium_erratum_27456_cpus[] = { +static const struct midr_range cavium_erratum_27456_cpus[] = { /* Cavium ThunderX, T88 pass 1.x - 2.1 */ MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), /* Cavium ThunderX, T81 pass 1.0 */ diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 9c4d6d552b25..4c46d80aa64b 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -114,7 +114,14 @@ static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NC DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS); -bool arm64_use_ng_mappings = false; +/* + * arm64_use_ng_mappings must be placed in the .data section, otherwise it + * ends up in the .bss section where it is initialized in early_map_kernel() + * after the MMU (with the idmap) was enabled. create_init_idmap() - which + * runs before early_map_kernel() and reads the variable via PTE_MAYBE_NG - + * may end up generating an incorrect idmap page table attributes. + */ +bool arm64_use_ng_mappings __read_mostly = false; EXPORT_SYMBOL(arm64_use_ng_mappings); DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors; diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5e3c4b58f279..2004b4f41ade 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -47,10 +47,6 @@ PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); -#ifdef CONFIG_CAVIUM_ERRATUM_27456 -PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); -PROVIDE(__pi_is_midr_in_range_list = is_midr_in_range_list); -#endif PROVIDE(__pi__ctype = _ctype); PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index e57b043f324b..c6650cfe706c 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -207,6 +207,29 @@ static void __init map_fdt(u64 fdt) dsb(ishst); } +/* + * PI version of the Cavium Eratum 27456 detection, which makes it + * impossible to use non-global mappings. + */ +static bool __init ng_mappings_allowed(void) +{ + static const struct midr_range cavium_erratum_27456_cpus[] __initconst = { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), + /* Cavium ThunderX, T81 pass 1.0 */ + MIDR_REV(MIDR_THUNDERX_81XX, 0, 0), + {}, + }; + + for (const struct midr_range *r = cavium_erratum_27456_cpus; r->model; r++) { + if (midr_is_cpu_model_range(read_cpuid_id(), r->model, + r->rv_min, r->rv_max)) + return false; + } + + return true; +} + asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) { static char const chosen_str[] __initconst = "/chosen"; @@ -246,7 +269,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) u64 kaslr_seed = kaslr_early_init(fdt, chosen); if (kaslr_seed && kaslr_requires_kpti()) - arm64_use_ng_mappings = true; + arm64_use_ng_mappings = ng_mappings_allowed(); kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); } diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index b198dde79e59..b607f6dfc5e6 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -879,10 +879,12 @@ static u8 spectre_bhb_loop_affected(void) static const struct midr_range spectre_bhb_k132_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_X3), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2), + {}, }; static const struct midr_range spectre_bhb_k38_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A715), MIDR_ALL_VERSIONS(MIDR_CORTEX_A720), + {}, }; static const struct midr_range spectre_bhb_k32_list[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A78), diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 68fec8c95fee..19ca57def629 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2743,11 +2743,6 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } -bool kvm_arch_has_irq_bypass(void) -{ - return true; -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index b741ea6aefa5..96f625dc7256 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -235,6 +235,8 @@ static inline void __deactivate_traps_mpam(void) static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); @@ -245,11 +247,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) * EL1 instead of being trapped to EL2. */ if (system_supports_pmuv3()) { - struct kvm_cpu_context *hctxt; - write_sysreg(0, pmselr_el0); - hctxt = host_data_ptr(host_ctxt); ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); vcpu_set_flag(vcpu, PMUSERENR_ON_CPU); @@ -269,6 +268,7 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) hcrx &= ~clr; } + ctxt_sys_reg(hctxt, HCRX_EL2) = read_sysreg_s(SYS_HCRX_EL2); write_sysreg_s(hcrx, SYS_HCRX_EL2); } @@ -278,19 +278,18 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { + struct kvm_cpu_context *hctxt = host_data_ptr(host_ctxt); + write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2); write_sysreg(0, hstr_el2); if (system_supports_pmuv3()) { - struct kvm_cpu_context *hctxt; - - hctxt = host_data_ptr(host_ctxt); write_sysreg(ctxt_sys_reg(hctxt, PMUSERENR_EL0), pmuserenr_el0); vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU); } if (cpus_have_final_cap(ARM64_HAS_HCX)) - write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2); + write_sysreg_s(ctxt_sys_reg(hctxt, HCRX_EL2), SYS_HCRX_EL2); __deactivate_traps_hfgxtr(vcpu); __deactivate_traps_mpam(); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2a5284f749b4..e80f3ebd3e2a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -503,7 +503,7 @@ int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) { int ret; - if (!addr_is_memory(addr)) + if (!range_is_memory(addr, addr + size)) return -EPERM; ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt, diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index ed363aa3027e..50aa8dbcae75 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -429,23 +429,27 @@ u64 __vgic_v3_get_gic_config(void) /* * To check whether we have a MMIO-based (GICv2 compatible) * CPU interface, we need to disable the system register - * view. To do that safely, we have to prevent any interrupt - * from firing (which would be deadly). + * view. * - * Note that this only makes sense on VHE, as interrupts are - * already masked for nVHE as part of the exception entry to - * EL2. - */ - if (has_vhe()) - flags = local_daif_save(); - - /* * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates * that to be able to set ICC_SRE_EL1.SRE to 0, all the * interrupt overrides must be set. You've got to love this. + * + * As we always run VHE with HCR_xMO set, no extra xMO + * manipulation is required in that case. + * + * To safely disable SRE, we have to prevent any interrupt + * from firing (which would be deadly). This only makes sense + * on VHE, as interrupts are already masked for nVHE as part + * of the exception entry to EL2. */ - sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); - isb(); + if (has_vhe()) { + flags = local_daif_save(); + } else { + sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); + } + write_gicreg(0, ICC_SRE_EL1); isb(); @@ -453,11 +457,13 @@ u64 __vgic_v3_get_gic_config(void) write_gicreg(sre, ICC_SRE_EL1); isb(); - sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); - isb(); - if (has_vhe()) + if (has_vhe()) { local_daif_restore(flags); + } else { + sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); + } val = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63); val |= read_gicreg(ICH_VTR_EL2); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 754f2fe0cc67..eeda92330ade 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1501,6 +1501,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } + if (!is_protected_kvm_enabled()) + memcache = &vcpu->arch.mmu_page_cache; + else + memcache = &vcpu->arch.pkvm_memcache; + /* * Permission faults just need to update the existing leaf entry, * and so normally don't require allocations from the memcache. The @@ -1510,13 +1515,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (!fault_is_perm || (logging_active && write_fault)) { int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu); - if (!is_protected_kvm_enabled()) { - memcache = &vcpu->arch.mmu_page_cache; + if (!is_protected_kvm_enabled()) ret = kvm_mmu_topup_memory_cache(memcache, min_pages); - } else { - memcache = &vcpu->arch.pkvm_memcache; + else ret = topup_hyp_memcache(memcache, min_pages); - } + if (ret) return ret; } diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 005ad28f7306..5dde9285afc8 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1945,6 +1945,12 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, if ((hw_val & mpam_mask) == (user_val & mpam_mask)) user_val &= ~ID_AA64PFR0_EL1_MPAM_MASK; + /* Fail the guest's request to disable the AA64 ISA at EL{0,1,2} */ + if (!FIELD_GET(ID_AA64PFR0_EL1_EL0, user_val) || + !FIELD_GET(ID_AA64PFR0_EL1_EL1, user_val) || + (vcpu_has_nv(vcpu) && !FIELD_GET(ID_AA64PFR0_EL1_EL2, user_val))) + return -EINVAL; + return set_id_reg(vcpu, rd, user_val); } diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 067c0b994648..1a2cf012b8f2 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -73,6 +73,7 @@ config LOONGARCH select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_BPF_JIT diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h index 3177674228f8..45514f314664 100644 --- a/arch/loongarch/include/asm/fpu.h +++ b/arch/loongarch/include/asm/fpu.h @@ -22,22 +22,29 @@ struct sigcontext; #define kernel_fpu_available() cpu_has_fpu -extern void kernel_fpu_begin(void); -extern void kernel_fpu_end(void); - -extern void _init_fpu(unsigned int); -extern void _save_fp(struct loongarch_fpu *); -extern void _restore_fp(struct loongarch_fpu *); - -extern void _save_lsx(struct loongarch_fpu *fpu); -extern void _restore_lsx(struct loongarch_fpu *fpu); -extern void _init_lsx_upper(void); -extern void _restore_lsx_upper(struct loongarch_fpu *fpu); - -extern void _save_lasx(struct loongarch_fpu *fpu); -extern void _restore_lasx(struct loongarch_fpu *fpu); -extern void _init_lasx_upper(void); -extern void _restore_lasx_upper(struct loongarch_fpu *fpu); + +void kernel_fpu_begin(void); +void kernel_fpu_end(void); + +asmlinkage void _init_fpu(unsigned int); +asmlinkage void _save_fp(struct loongarch_fpu *); +asmlinkage void _restore_fp(struct loongarch_fpu *); +asmlinkage int _save_fp_context(void __user *fpregs, void __user *fcc, void __user *csr); +asmlinkage int _restore_fp_context(void __user *fpregs, void __user *fcc, void __user *csr); + +asmlinkage void _save_lsx(struct loongarch_fpu *fpu); +asmlinkage void _restore_lsx(struct loongarch_fpu *fpu); +asmlinkage void _init_lsx_upper(void); +asmlinkage void _restore_lsx_upper(struct loongarch_fpu *fpu); +asmlinkage int _save_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); +asmlinkage int _restore_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); + +asmlinkage void _save_lasx(struct loongarch_fpu *fpu); +asmlinkage void _restore_lasx(struct loongarch_fpu *fpu); +asmlinkage void _init_lasx_upper(void); +asmlinkage void _restore_lasx_upper(struct loongarch_fpu *fpu); +asmlinkage int _save_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); +asmlinkage int _restore_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); static inline void enable_lsx(void); static inline void disable_lsx(void); diff --git a/arch/loongarch/include/asm/lbt.h b/arch/loongarch/include/asm/lbt.h index e671978bf552..38566574e562 100644 --- a/arch/loongarch/include/asm/lbt.h +++ b/arch/loongarch/include/asm/lbt.h @@ -12,9 +12,13 @@ #include <asm/loongarch.h> #include <asm/processor.h> -extern void _init_lbt(void); -extern void _save_lbt(struct loongarch_lbt *); -extern void _restore_lbt(struct loongarch_lbt *); +asmlinkage void _init_lbt(void); +asmlinkage void _save_lbt(struct loongarch_lbt *); +asmlinkage void _restore_lbt(struct loongarch_lbt *); +asmlinkage int _save_lbt_context(void __user *regs, void __user *eflags); +asmlinkage int _restore_lbt_context(void __user *regs, void __user *eflags); +asmlinkage int _save_ftop_context(void __user *ftop); +asmlinkage int _restore_ftop_context(void __user *ftop); static inline int is_lbt_enabled(void) { diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h index f3ddaed9ef7f..a5b63c84f854 100644 --- a/arch/loongarch/include/asm/ptrace.h +++ b/arch/loongarch/include/asm/ptrace.h @@ -33,9 +33,9 @@ struct pt_regs { unsigned long __last[]; } __aligned(8); -static inline int regs_irqs_disabled(struct pt_regs *regs) +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) { - return arch_irqs_disabled_flags(regs->csr_prmd); + return !(regs->csr_prmd & CSR_PRMD_PIE); } static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index 4853e8b04c6f..f9dcaa60033d 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -21,10 +21,10 @@ obj-$(CONFIG_CPU_HAS_LBT) += lbt.o obj-$(CONFIG_ARCH_STRICT_ALIGN) += unaligned.o -CFLAGS_module.o += $(call cc-option,-Wno-override-init,) -CFLAGS_syscall.o += $(call cc-option,-Wno-override-init,) -CFLAGS_traps.o += $(call cc-option,-Wno-override-init,) -CFLAGS_perf_event.o += $(call cc-option,-Wno-override-init,) +CFLAGS_module.o += $(call cc-disable-warning, override-init) +CFLAGS_syscall.o += $(call cc-disable-warning, override-init) +CFLAGS_traps.o += $(call cc-disable-warning, override-init) +CFLAGS_perf_event.o += $(call cc-disable-warning, override-init) ifdef CONFIG_FUNCTION_TRACER ifndef CONFIG_DYNAMIC_FTRACE diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index 6ab640101457..28caf416ae36 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -458,6 +458,7 @@ SYM_FUNC_START(_save_fp_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_fp_context) +EXPORT_SYMBOL_GPL(_save_fp_context) /* * a0: fpregs @@ -471,6 +472,7 @@ SYM_FUNC_START(_restore_fp_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_fp_context) +EXPORT_SYMBOL_GPL(_restore_fp_context) /* * a0: fpregs @@ -484,6 +486,7 @@ SYM_FUNC_START(_save_lsx_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_lsx_context) +EXPORT_SYMBOL_GPL(_save_lsx_context) /* * a0: fpregs @@ -497,6 +500,7 @@ SYM_FUNC_START(_restore_lsx_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_lsx_context) +EXPORT_SYMBOL_GPL(_restore_lsx_context) /* * a0: fpregs @@ -510,6 +514,7 @@ SYM_FUNC_START(_save_lasx_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_lasx_context) +EXPORT_SYMBOL_GPL(_save_lasx_context) /* * a0: fpregs @@ -523,6 +528,7 @@ SYM_FUNC_START(_restore_lasx_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_lasx_context) +EXPORT_SYMBOL_GPL(_restore_lasx_context) .L_fpu_fault: li.w a0, -EFAULT # failure diff --git a/arch/loongarch/kernel/lbt.S b/arch/loongarch/kernel/lbt.S index 001f061d226a..71678912d24c 100644 --- a/arch/loongarch/kernel/lbt.S +++ b/arch/loongarch/kernel/lbt.S @@ -90,6 +90,7 @@ SYM_FUNC_START(_save_lbt_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_lbt_context) +EXPORT_SYMBOL_GPL(_save_lbt_context) /* * a0: scr @@ -110,6 +111,7 @@ SYM_FUNC_START(_restore_lbt_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_lbt_context) +EXPORT_SYMBOL_GPL(_restore_lbt_context) /* * a0: ftop @@ -120,6 +122,7 @@ SYM_FUNC_START(_save_ftop_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_ftop_context) +EXPORT_SYMBOL_GPL(_save_ftop_context) /* * a0: ftop @@ -150,6 +153,7 @@ SYM_FUNC_START(_restore_ftop_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_ftop_context) +EXPORT_SYMBOL_GPL(_restore_ftop_context) .L_lbt_fault: li.w a0, -EFAULT # failure diff --git a/arch/loongarch/kernel/signal.c b/arch/loongarch/kernel/signal.c index 7a555b600171..4740cb5b2388 100644 --- a/arch/loongarch/kernel/signal.c +++ b/arch/loongarch/kernel/signal.c @@ -51,27 +51,6 @@ #define lock_lbt_owner() ({ preempt_disable(); pagefault_disable(); }) #define unlock_lbt_owner() ({ pagefault_enable(); preempt_enable(); }) -/* Assembly functions to move context to/from the FPU */ -extern asmlinkage int -_save_fp_context(void __user *fpregs, void __user *fcc, void __user *csr); -extern asmlinkage int -_restore_fp_context(void __user *fpregs, void __user *fcc, void __user *csr); -extern asmlinkage int -_save_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); -extern asmlinkage int -_restore_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); -extern asmlinkage int -_save_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); -extern asmlinkage int -_restore_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); - -#ifdef CONFIG_CPU_HAS_LBT -extern asmlinkage int _save_lbt_context(void __user *regs, void __user *eflags); -extern asmlinkage int _restore_lbt_context(void __user *regs, void __user *eflags); -extern asmlinkage int _save_ftop_context(void __user *ftop); -extern asmlinkage int _restore_ftop_context(void __user *ftop); -#endif - struct rt_sigframe { struct siginfo rs_info; struct ucontext rs_uctx; diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 2ec3106c0da3..47fc2de6d150 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -553,9 +553,10 @@ asmlinkage void noinstr do_ale(struct pt_regs *regs) die_if_kernel("Kernel ale access", regs); force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr); #else + bool pie = regs_irqs_disabled(regs); unsigned int *pc; - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_enable(); perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->csr_badvaddr); @@ -582,7 +583,7 @@ sigbus: die_if_kernel("Kernel ale access", regs); force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr); out: - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_disable(); #endif irqentry_exit(regs, state); @@ -621,12 +622,13 @@ static void bug_handler(struct pt_regs *regs) asmlinkage void noinstr do_bce(struct pt_regs *regs) { bool user = user_mode(regs); + bool pie = regs_irqs_disabled(regs); unsigned long era = exception_era(regs); u64 badv = 0, lower = 0, upper = ULONG_MAX; union loongarch_instruction insn; irqentry_state_t state = irqentry_enter(regs); - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_enable(); current->thread.trap_nr = read_csr_excode(); @@ -692,7 +694,7 @@ asmlinkage void noinstr do_bce(struct pt_regs *regs) force_sig_bnderr((void __user *)badv, (void __user *)lower, (void __user *)upper); out: - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_disable(); irqentry_exit(regs, state); @@ -710,11 +712,12 @@ bad_era: asmlinkage void noinstr do_bp(struct pt_regs *regs) { bool user = user_mode(regs); + bool pie = regs_irqs_disabled(regs); unsigned int opcode, bcode; unsigned long era = exception_era(regs); irqentry_state_t state = irqentry_enter(regs); - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_enable(); if (__get_inst(&opcode, (u32 *)era, user)) @@ -780,7 +783,7 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs) } out: - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_disable(); irqentry_exit(regs, state); @@ -1015,6 +1018,7 @@ static void init_restore_lbt(void) asmlinkage void noinstr do_lbt(struct pt_regs *regs) { + bool pie = regs_irqs_disabled(regs); irqentry_state_t state = irqentry_enter(regs); /* @@ -1024,7 +1028,7 @@ asmlinkage void noinstr do_lbt(struct pt_regs *regs) * (including the user using 'MOVGR2GCSR' to turn on TM, which * will not trigger the BTE), we need to check PRMD first. */ - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_enable(); if (!cpu_has_lbt) { @@ -1038,7 +1042,7 @@ asmlinkage void noinstr do_lbt(struct pt_regs *regs) preempt_enable(); out: - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_disable(); irqentry_exit(regs, state); diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile index f4c8e35c216a..cb41d9265662 100644 --- a/arch/loongarch/kvm/Makefile +++ b/arch/loongarch/kvm/Makefile @@ -21,4 +21,4 @@ kvm-y += intc/eiointc.o kvm-y += intc/pch_pic.o kvm-y += irqfd.o -CFLAGS_exit.o += $(call cc-option,-Wno-override-init,) +CFLAGS_exit.o += $(call cc-disable-warning, override-init) diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c index 93f4acd44523..fe734dc062ed 100644 --- a/arch/loongarch/kvm/intc/ipi.c +++ b/arch/loongarch/kvm/intc/ipi.c @@ -111,7 +111,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val); srcu_read_unlock(&vcpu->kvm->srcu, idx); if (unlikely(ret)) { - kvm_err("%s: : read date from addr %llx failed\n", __func__, addr); + kvm_err("%s: : read data from addr %llx failed\n", __func__, addr); return ret; } /* Construct the mask by scanning the bit 27-30 */ @@ -127,7 +127,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) ret = kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val); srcu_read_unlock(&vcpu->kvm->srcu, idx); if (unlikely(ret)) - kvm_err("%s: : write date to addr %llx failed\n", __func__, addr); + kvm_err("%s: : write data to addr %llx failed\n", __func__, addr); return ret; } diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index d165cd38c6bb..80ea63d465b8 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -296,10 +296,10 @@ int kvm_arch_enable_virtualization_cpu(void) /* * Enable virtualization features granting guest direct control of * certain features: - * GCI=2: Trap on init or unimplement cache instruction. + * GCI=2: Trap on init or unimplemented cache instruction. * TORU=0: Trap on Root Unimplement. * CACTRL=1: Root control cache. - * TOP=0: Trap on Previlege. + * TOP=0: Trap on Privilege. * TOE=0: Trap on Exception. * TIT=0: Trap on Timer. */ diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 8e427b379661..5af32ec62cb1 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -294,6 +294,7 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST; if (kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) { + kvm_lose_pmu(vcpu); /* make sure the vcpu mode has been written */ smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); @@ -902,6 +903,13 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu, vcpu->arch.st.guest_addr = 0; memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending)); memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear)); + + /* + * When vCPU reset, clear the ESTAT and GINTC registers + * Other CSR registers are cleared with function _kvm_setcsr(). + */ + kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_GINTC, 0); + kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_ESTAT, 0); break; default: ret = -EINVAL; diff --git a/arch/loongarch/mm/hugetlbpage.c b/arch/loongarch/mm/hugetlbpage.c index e4068906143b..cea84d7f2b91 100644 --- a/arch/loongarch/mm/hugetlbpage.c +++ b/arch/loongarch/mm/hugetlbpage.c @@ -47,7 +47,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, pmd = pmd_offset(pud, addr); } } - return (pte_t *) pmd; + return pmd_none(pmdp_get(pmd)) ? NULL : (pte_t *) pmd; } uint64_t pmd_to_entrylo(unsigned long pmd_val) diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index fdb7f73ad160..06f11d9e4ec1 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -65,9 +65,6 @@ void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -#endif #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; #endif diff --git a/arch/mips/include/asm/idle.h b/arch/mips/include/asm/idle.h index 0992cad9c632..c7d75807d13f 100644 --- a/arch/mips/include/asm/idle.h +++ b/arch/mips/include/asm/idle.h @@ -6,11 +6,10 @@ #include <linux/linkage.h> extern void (*cpu_wait)(void); -extern void r4k_wait(void); -extern asmlinkage void __r4k_wait(void); +extern asmlinkage void r4k_wait(void); extern void r4k_wait_irqoff(void); -static inline int using_rollback_handler(void) +static inline int using_skipover_handler(void) { return cpu_wait == r4k_wait; } diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 85fa9962266a..ef72c46b5568 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -65,7 +65,8 @@ static inline void instruction_pointer_set(struct pt_regs *regs, /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); -#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last)) +#define MAX_REG_OFFSET \ + (offsetof(struct pt_regs, __last) - sizeof(unsigned long)) /** * regs_get_register() - get register value from its offset diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S index a572ce36a24f..08c0a01d9a29 100644 --- a/arch/mips/kernel/genex.S +++ b/arch/mips/kernel/genex.S @@ -104,48 +104,59 @@ handle_vcei: __FINIT - .align 5 /* 32 byte rollback region */ -LEAF(__r4k_wait) - .set push - .set noreorder - /* start of rollback region */ - LONG_L t0, TI_FLAGS($28) - nop - andi t0, _TIF_NEED_RESCHED - bnez t0, 1f - nop - nop - nop -#ifdef CONFIG_CPU_MICROMIPS - nop - nop - nop - nop -#endif + .section .cpuidle.text,"ax" + /* Align to 32 bytes for the maximum idle interrupt region size. */ + .align 5 +LEAF(r4k_wait) + /* Keep the ISA bit clear for calculations on local labels here. */ +0: .fill 0 + /* Start of idle interrupt region. */ + local_irq_enable + /* + * If an interrupt lands here, before going idle on the next + * instruction, we must *NOT* go idle since the interrupt could + * have set TIF_NEED_RESCHED or caused a timer to need resched. + * Fall through -- see skipover_handler below -- and have the + * idle loop take care of things. + */ +1: .fill 0 + /* The R2 EI/EHB sequence takes 8 bytes, otherwise pad up. */ + .if 1b - 0b > 32 + .error "overlong idle interrupt region" + .elseif 1b - 0b > 8 + .align 4 + .endif +2: .fill 0 + .equ r4k_wait_idle_size, 2b - 0b + /* End of idle interrupt region; size has to be a power of 2. */ .set MIPS_ISA_ARCH_LEVEL_RAW +r4k_wait_insn: wait - /* end of rollback region (the region size must be power of two) */ -1: +r4k_wait_exit: + .set mips0 + local_irq_disable jr ra - nop - .set pop - END(__r4k_wait) + END(r4k_wait) + .previous - .macro BUILD_ROLLBACK_PROLOGUE handler - FEXPORT(rollback_\handler) + .macro BUILD_SKIPOVER_PROLOGUE handler + FEXPORT(skipover_\handler) .set push .set noat MFC0 k0, CP0_EPC - PTR_LA k1, __r4k_wait - ori k0, 0x1f /* 32 byte rollback region */ - xori k0, 0x1f + /* Subtract/add 2 to let the ISA bit propagate through the mask. */ + PTR_LA k1, r4k_wait_insn - 2 + ori k0, r4k_wait_idle_size - 2 + .set noreorder bne k0, k1, \handler + PTR_ADDIU k0, r4k_wait_exit - r4k_wait_insn + 2 + .set reorder MTC0 k0, CP0_EPC .set pop .endm .align 5 -BUILD_ROLLBACK_PROLOGUE handle_int +BUILD_SKIPOVER_PROLOGUE handle_int NESTED(handle_int, PT_SIZE, sp) .cfi_signal_frame #ifdef CONFIG_TRACE_IRQFLAGS @@ -265,7 +276,7 @@ NESTED(except_vec_ejtag_debug, 0, sp) * This prototype is copied to ebase + n*IntCtl.VS and patched * to invoke the handler */ -BUILD_ROLLBACK_PROLOGUE except_vec_vi +BUILD_SKIPOVER_PROLOGUE except_vec_vi NESTED(except_vec_vi, 0, sp) SAVE_SOME docfi=1 SAVE_AT docfi=1 diff --git a/arch/mips/kernel/idle.c b/arch/mips/kernel/idle.c index 5abc8b7340f8..80e8a04a642e 100644 --- a/arch/mips/kernel/idle.c +++ b/arch/mips/kernel/idle.c @@ -35,13 +35,6 @@ static void __cpuidle r3081_wait(void) write_c0_conf(cfg | R30XX_CONF_HALT); } -void __cpuidle r4k_wait(void) -{ - raw_local_irq_enable(); - __r4k_wait(); - raw_local_irq_disable(); -} - /* * This variant is preferable as it allows testing need_resched and going to * sleep depending on the outcome atomically. Unfortunately the "It is diff --git a/arch/mips/kernel/smp-cps.c b/arch/mips/kernel/smp-cps.c index e85bd087467e..cc26d56f3ab6 100644 --- a/arch/mips/kernel/smp-cps.c +++ b/arch/mips/kernel/smp-cps.c @@ -332,6 +332,8 @@ static void __init cps_prepare_cpus(unsigned int max_cpus) mips_cps_cluster_bootcfg = kcalloc(nclusters, sizeof(*mips_cps_cluster_bootcfg), GFP_KERNEL); + if (!mips_cps_cluster_bootcfg) + goto err_out; if (nclusters > 1) mips_cm_update_property(); @@ -348,6 +350,8 @@ static void __init cps_prepare_cpus(unsigned int max_cpus) mips_cps_cluster_bootcfg[cl].core_power = kcalloc(BITS_TO_LONGS(ncores), sizeof(unsigned long), GFP_KERNEL); + if (!mips_cps_cluster_bootcfg[cl].core_power) + goto err_out; /* Allocate VPE boot configuration structs */ for (c = 0; c < ncores; c++) { diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 39e248d0ed59..8ec1e185b35c 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -77,7 +77,7 @@ #include "access-helper.h" extern void check_wait(void); -extern asmlinkage void rollback_handle_int(void); +extern asmlinkage void skipover_handle_int(void); extern asmlinkage void handle_int(void); extern asmlinkage void handle_adel(void); extern asmlinkage void handle_ades(void); @@ -2066,7 +2066,7 @@ void *set_vi_handler(int n, vi_handler_t addr) { extern const u8 except_vec_vi[]; extern const u8 except_vec_vi_ori[], except_vec_vi_end[]; - extern const u8 rollback_except_vec_vi[]; + extern const u8 skipover_except_vec_vi[]; unsigned long handler; unsigned long old_handler = vi_handlers[n]; int srssets = current_cpu_data.srsets; @@ -2095,7 +2095,7 @@ void *set_vi_handler(int n, vi_handler_t addr) change_c0_srsmap(0xf << n*4, 0 << n*4); } - vec_start = using_rollback_handler() ? rollback_except_vec_vi : + vec_start = using_skipover_handler() ? skipover_except_vec_vi : except_vec_vi; #if defined(CONFIG_CPU_MICROMIPS) || defined(CONFIG_CPU_BIG_ENDIAN) ori_offset = except_vec_vi_ori - vec_start + 2; @@ -2426,8 +2426,8 @@ void __init trap_init(void) if (board_be_init) board_be_init(); - set_except_vector(EXCCODE_INT, using_rollback_handler() ? - rollback_handle_int : handle_int); + set_except_vector(EXCCODE_INT, using_skipover_handler() ? + skipover_handle_int : handle_int); set_except_vector(EXCCODE_MOD, handle_tlbm); set_except_vector(EXCCODE_TLBL, handle_tlbl); set_except_vector(EXCCODE_TLBS, handle_tlbs); diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 984c331ff5f4..0e60af486ec1 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -23,6 +23,9 @@ */ extern void local_dcache_page_flush(struct page *page); extern void local_icache_page_inv(struct page *page); +extern void local_dcache_range_flush(unsigned long start, unsigned long end); +extern void local_dcache_range_inv(unsigned long start, unsigned long end); +extern void local_icache_range_inv(unsigned long start, unsigned long end); /* * Data cache flushing always happen on the local cpu. Instruction cache @@ -39,6 +42,20 @@ extern void smp_icache_page_inv(struct page *page); #endif /* CONFIG_SMP */ /* + * Even if the actual block size is larger than L1_CACHE_BYTES, paddr + * can be incremented by L1_CACHE_BYTES. When paddr is written to the + * invalidate register, the entire cache line encompassing this address + * is invalidated. Each subsequent reference to the same cache line will + * not affect the invalidation process. + */ +#define local_dcache_block_flush(addr) \ + local_dcache_range_flush(addr, addr + L1_CACHE_BYTES) +#define local_dcache_block_inv(addr) \ + local_dcache_range_inv(addr, addr + L1_CACHE_BYTES) +#define local_icache_block_inv(addr) \ + local_icache_range_inv(addr, addr + L1_CACHE_BYTES) + +/* * Synchronizes caches. Whenever a cpu writes executable code to memory, this * should be called to make sure the processor sees the newly written code. */ diff --git a/arch/openrisc/include/asm/cpuinfo.h b/arch/openrisc/include/asm/cpuinfo.h index 5e4744153d0e..3cfc4cf0b019 100644 --- a/arch/openrisc/include/asm/cpuinfo.h +++ b/arch/openrisc/include/asm/cpuinfo.h @@ -15,16 +15,21 @@ #ifndef __ASM_OPENRISC_CPUINFO_H #define __ASM_OPENRISC_CPUINFO_H +#include <asm/spr.h> +#include <asm/spr_defs.h> + +struct cache_desc { + u32 size; + u32 sets; + u32 block_size; + u32 ways; +}; + struct cpuinfo_or1k { u32 clock_frequency; - u32 icache_size; - u32 icache_block_size; - u32 icache_ways; - - u32 dcache_size; - u32 dcache_block_size; - u32 dcache_ways; + struct cache_desc icache; + struct cache_desc dcache; u16 coreid; }; @@ -32,4 +37,9 @@ struct cpuinfo_or1k { extern struct cpuinfo_or1k cpuinfo_or1k[NR_CPUS]; extern void setup_cpuinfo(void); +/* + * Check if the cache component exists. + */ +extern bool cpu_cache_is_present(const unsigned int cache_type); + #endif /* __ASM_OPENRISC_CPUINFO_H */ diff --git a/arch/openrisc/kernel/Makefile b/arch/openrisc/kernel/Makefile index 79129161f3e0..e4c7d9bdd598 100644 --- a/arch/openrisc/kernel/Makefile +++ b/arch/openrisc/kernel/Makefile @@ -7,7 +7,7 @@ extra-y := vmlinux.lds obj-y := head.o setup.o or32_ksyms.o process.o dma.o \ traps.o time.o irq.o entry.o ptrace.o signal.o \ - sys_call_table.o unwinder.o + sys_call_table.o unwinder.o cacheinfo.o obj-$(CONFIG_SMP) += smp.o sync-timer.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/openrisc/kernel/cacheinfo.c b/arch/openrisc/kernel/cacheinfo.c new file mode 100644 index 000000000000..61230545e4ff --- /dev/null +++ b/arch/openrisc/kernel/cacheinfo.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * OpenRISC cacheinfo support + * + * Based on work done for MIPS and LoongArch. All original copyrights + * apply as per the original source declaration. + * + * OpenRISC implementation: + * Copyright (C) 2025 Sahil Siddiq <sahilcdq@proton.me> + */ + +#include <linux/cacheinfo.h> +#include <asm/cpuinfo.h> +#include <asm/spr.h> +#include <asm/spr_defs.h> + +static inline void ci_leaf_init(struct cacheinfo *this_leaf, enum cache_type type, + unsigned int level, struct cache_desc *cache, int cpu) +{ + this_leaf->type = type; + this_leaf->level = level; + this_leaf->coherency_line_size = cache->block_size; + this_leaf->number_of_sets = cache->sets; + this_leaf->ways_of_associativity = cache->ways; + this_leaf->size = cache->size; + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); +} + +int init_cache_level(unsigned int cpu) +{ + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + int leaves = 0, levels = 0; + unsigned long upr = mfspr(SPR_UPR); + unsigned long iccfgr, dccfgr; + + if (!(upr & SPR_UPR_UP)) { + printk(KERN_INFO + "-- no UPR register... unable to detect configuration\n"); + return -ENOENT; + } + + if (cpu_cache_is_present(SPR_UPR_DCP)) { + dccfgr = mfspr(SPR_DCCFGR); + cpuinfo->dcache.ways = 1 << (dccfgr & SPR_DCCFGR_NCW); + cpuinfo->dcache.sets = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3); + cpuinfo->dcache.block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7); + cpuinfo->dcache.size = + cpuinfo->dcache.sets * cpuinfo->dcache.ways * cpuinfo->dcache.block_size; + leaves += 1; + printk(KERN_INFO + "-- dcache: %d bytes total, %d bytes/line, %d set(s), %d way(s)\n", + cpuinfo->dcache.size, cpuinfo->dcache.block_size, + cpuinfo->dcache.sets, cpuinfo->dcache.ways); + } else + printk(KERN_INFO "-- dcache disabled\n"); + + if (cpu_cache_is_present(SPR_UPR_ICP)) { + iccfgr = mfspr(SPR_ICCFGR); + cpuinfo->icache.ways = 1 << (iccfgr & SPR_ICCFGR_NCW); + cpuinfo->icache.sets = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3); + cpuinfo->icache.block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7); + cpuinfo->icache.size = + cpuinfo->icache.sets * cpuinfo->icache.ways * cpuinfo->icache.block_size; + leaves += 1; + printk(KERN_INFO + "-- icache: %d bytes total, %d bytes/line, %d set(s), %d way(s)\n", + cpuinfo->icache.size, cpuinfo->icache.block_size, + cpuinfo->icache.sets, cpuinfo->icache.ways); + } else + printk(KERN_INFO "-- icache disabled\n"); + + if (!leaves) + return -ENOENT; + + levels = 1; + + this_cpu_ci->num_leaves = leaves; + this_cpu_ci->num_levels = levels; + + return 0; +} + +int populate_cache_leaves(unsigned int cpu) +{ + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + int level = 1; + + if (cpu_cache_is_present(SPR_UPR_DCP)) { + ci_leaf_init(this_leaf, CACHE_TYPE_DATA, level, &cpuinfo->dcache, cpu); + this_leaf->attributes = ((mfspr(SPR_DCCFGR) & SPR_DCCFGR_CWS) >> 8) ? + CACHE_WRITE_BACK : CACHE_WRITE_THROUGH; + this_leaf++; + } + + if (cpu_cache_is_present(SPR_UPR_ICP)) + ci_leaf_init(this_leaf, CACHE_TYPE_INST, level, &cpuinfo->icache, cpu); + + this_cpu_ci->cpu_map_populated = true; + + return 0; +} diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index b3edbb33b621..3a7b5baaa450 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -17,6 +17,7 @@ #include <linux/pagewalk.h> #include <asm/cpuinfo.h> +#include <asm/cacheflush.h> #include <asm/spr_defs.h> #include <asm/tlbflush.h> @@ -24,9 +25,6 @@ static int page_set_nocache(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - unsigned long cl; - struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; - pte_val(*pte) |= _PAGE_CI; /* @@ -36,8 +34,7 @@ page_set_nocache(pte_t *pte, unsigned long addr, flush_tlb_kernel_range(addr, addr + PAGE_SIZE); /* Flush page out of dcache */ - for (cl = __pa(addr); cl < __pa(next); cl += cpuinfo->dcache_block_size) - mtspr(SPR_DCBFR, cl); + local_dcache_range_flush(__pa(addr), __pa(next)); return 0; } @@ -98,21 +95,14 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size) void arch_sync_dma_for_device(phys_addr_t addr, size_t size, enum dma_data_direction dir) { - unsigned long cl; - struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; - switch (dir) { case DMA_TO_DEVICE: /* Flush the dcache for the requested range */ - for (cl = addr; cl < addr + size; - cl += cpuinfo->dcache_block_size) - mtspr(SPR_DCBFR, cl); + local_dcache_range_flush(addr, addr + size); break; case DMA_FROM_DEVICE: /* Invalidate the dcache for the requested range */ - for (cl = addr; cl < addr + size; - cl += cpuinfo->dcache_block_size) - mtspr(SPR_DCBIR, cl); + local_dcache_range_inv(addr, addr + size); break; default: /* diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index be56eaafc8b9..a9fb9cc6779e 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -113,21 +113,6 @@ static void print_cpuinfo(void) return; } - if (upr & SPR_UPR_DCP) - printk(KERN_INFO - "-- dcache: %4d bytes total, %2d bytes/line, %d way(s)\n", - cpuinfo->dcache_size, cpuinfo->dcache_block_size, - cpuinfo->dcache_ways); - else - printk(KERN_INFO "-- dcache disabled\n"); - if (upr & SPR_UPR_ICP) - printk(KERN_INFO - "-- icache: %4d bytes total, %2d bytes/line, %d way(s)\n", - cpuinfo->icache_size, cpuinfo->icache_block_size, - cpuinfo->icache_ways); - else - printk(KERN_INFO "-- icache disabled\n"); - if (upr & SPR_UPR_DMP) printk(KERN_INFO "-- dmmu: %4d entries, %lu way(s)\n", 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2), @@ -155,8 +140,6 @@ static void print_cpuinfo(void) void __init setup_cpuinfo(void) { struct device_node *cpu; - unsigned long iccfgr, dccfgr; - unsigned long cache_set_size; int cpu_id = smp_processor_id(); struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[cpu_id]; @@ -164,20 +147,6 @@ void __init setup_cpuinfo(void) if (!cpu) panic("Couldn't find CPU%d in device tree...\n", cpu_id); - iccfgr = mfspr(SPR_ICCFGR); - cpuinfo->icache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW); - cache_set_size = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3); - cpuinfo->icache_block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7); - cpuinfo->icache_size = - cache_set_size * cpuinfo->icache_ways * cpuinfo->icache_block_size; - - dccfgr = mfspr(SPR_DCCFGR); - cpuinfo->dcache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW); - cache_set_size = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3); - cpuinfo->dcache_block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7); - cpuinfo->dcache_size = - cache_set_size * cpuinfo->dcache_ways * cpuinfo->dcache_block_size; - if (of_property_read_u32(cpu, "clock-frequency", &cpuinfo->clock_frequency)) { printk(KERN_WARNING @@ -294,14 +263,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) unsigned int vr, cpucfgr; unsigned int avr; unsigned int version; +#ifdef CONFIG_SMP struct cpuinfo_or1k *cpuinfo = v; + seq_printf(m, "processor\t\t: %d\n", cpuinfo->coreid); +#endif vr = mfspr(SPR_VR); cpucfgr = mfspr(SPR_CPUCFGR); -#ifdef CONFIG_SMP - seq_printf(m, "processor\t\t: %d\n", cpuinfo->coreid); -#endif if (vr & SPR_VR_UVRP) { vr = mfspr(SPR_VR2); version = vr & SPR_VR2_VER; @@ -320,14 +289,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "revision\t\t: %d\n", vr & SPR_VR_REV); } seq_printf(m, "frequency\t\t: %ld\n", loops_per_jiffy * HZ); - seq_printf(m, "dcache size\t\t: %d bytes\n", cpuinfo->dcache_size); - seq_printf(m, "dcache block size\t: %d bytes\n", - cpuinfo->dcache_block_size); - seq_printf(m, "dcache ways\t\t: %d\n", cpuinfo->dcache_ways); - seq_printf(m, "icache size\t\t: %d bytes\n", cpuinfo->icache_size); - seq_printf(m, "icache block size\t: %d bytes\n", - cpuinfo->icache_block_size); - seq_printf(m, "icache ways\t\t: %d\n", cpuinfo->icache_ways); seq_printf(m, "immu\t\t\t: %d entries, %lu ways\n", 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2), 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW)); diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c index eb43b73f3855..0f265b8e73ec 100644 --- a/arch/openrisc/mm/cache.c +++ b/arch/openrisc/mm/cache.c @@ -14,31 +14,70 @@ #include <asm/spr_defs.h> #include <asm/cache.h> #include <asm/cacheflush.h> +#include <asm/cpuinfo.h> #include <asm/tlbflush.h> -static __always_inline void cache_loop(struct page *page, const unsigned int reg) +/* + * Check if the cache component exists. + */ +bool cpu_cache_is_present(const unsigned int cache_type) { - unsigned long paddr = page_to_pfn(page) << PAGE_SHIFT; - unsigned long line = paddr & ~(L1_CACHE_BYTES - 1); + unsigned long upr = mfspr(SPR_UPR); + unsigned long mask = SPR_UPR_UP | cache_type; + + return !((upr & mask) ^ mask); +} + +static __always_inline void cache_loop(unsigned long paddr, unsigned long end, + const unsigned short reg, const unsigned int cache_type) +{ + if (!cpu_cache_is_present(cache_type)) + return; - while (line < paddr + PAGE_SIZE) { - mtspr(reg, line); - line += L1_CACHE_BYTES; + while (paddr < end) { + mtspr(reg, paddr); + paddr += L1_CACHE_BYTES; } } +static __always_inline void cache_loop_page(struct page *page, const unsigned short reg, + const unsigned int cache_type) +{ + unsigned long paddr = page_to_pfn(page) << PAGE_SHIFT; + unsigned long end = paddr + PAGE_SIZE; + + paddr &= ~(L1_CACHE_BYTES - 1); + + cache_loop(paddr, end, reg, cache_type); +} + void local_dcache_page_flush(struct page *page) { - cache_loop(page, SPR_DCBFR); + cache_loop_page(page, SPR_DCBFR, SPR_UPR_DCP); } EXPORT_SYMBOL(local_dcache_page_flush); void local_icache_page_inv(struct page *page) { - cache_loop(page, SPR_ICBIR); + cache_loop_page(page, SPR_ICBIR, SPR_UPR_ICP); } EXPORT_SYMBOL(local_icache_page_inv); +void local_dcache_range_flush(unsigned long start, unsigned long end) +{ + cache_loop(start, end, SPR_DCBFR, SPR_UPR_DCP); +} + +void local_dcache_range_inv(unsigned long start, unsigned long end) +{ + cache_loop(start, end, SPR_DCBIR, SPR_UPR_DCP); +} + +void local_icache_range_inv(unsigned long start, unsigned long end) +{ + cache_loop(start, end, SPR_ICBIR, SPR_UPR_ICP); +} + void update_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { @@ -58,4 +97,3 @@ void update_cache(struct vm_area_struct *vma, unsigned long address, sync_icache_dcache(folio_page(folio, nr)); } } - diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index be1c2eb8bb94..e4904ca6f0a0 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -35,6 +35,7 @@ #include <asm/fixmap.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/cacheflush.h> int mem_init_done; @@ -176,8 +177,8 @@ void __init paging_init(void) barrier(); /* Invalidate instruction caches after code modification */ - mtspr(SPR_ICBIR, 0x900); - mtspr(SPR_ICBIR, 0xa00); + local_icache_block_inv(0x900); + local_icache_block_inv(0xa00); /* New TLB miss handlers and kernel page tables are in now place. * Make sure that page flags get updated for all pages in TLB by diff --git a/arch/parisc/math-emu/driver.c b/arch/parisc/math-emu/driver.c index 34495446e051..71829cb7bc81 100644 --- a/arch/parisc/math-emu/driver.c +++ b/arch/parisc/math-emu/driver.c @@ -97,9 +97,19 @@ handle_fpe(struct pt_regs *regs) memcpy(regs->fr, frcopy, sizeof regs->fr); if (signalcode != 0) { - force_sig_fault(signalcode >> 24, signalcode & 0xffffff, - (void __user *) regs->iaoq[0]); - return -1; + int sig = signalcode >> 24; + + if (sig == SIGFPE) { + /* + * Clear floating point trap bit to avoid trapping + * again on the first floating-point instruction in + * the userspace signal handler. + */ + regs->fr[0] &= ~(1ULL << 38); + } + force_sig_fault(sig, signalcode & 0xffffff, + (void __user *) regs->iaoq[0]); + return -1; } return signalcode ? -1 : 0; diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 1db60fe13802..3d8dc822282a 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -234,10 +234,8 @@ fi # suppress some warnings in recent ld versions nowarn="-z noexecstack" -if ! ld_is_lld; then - if [ "$LD_VERSION" -ge "$(echo 2.39 | ld_version)" ]; then - nowarn="$nowarn --no-warn-rwx-segments" - fi +if "${CROSS}ld" -v --no-warn-rwx-segments >/dev/null 2>&1; then + nowarn="$nowarn --no-warn-rwx-segments" fi platformo=$object/"$platform".o diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 34a5aec4908f..126bf3b06ab7 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -258,10 +258,6 @@ static unsigned long get_stubs_size(const Elf64_Ehdr *hdr, break; } } - if (i == hdr->e_shnum) { - pr_err("%s: doesn't contain __patchable_function_entries.\n", me->name); - return -ENOEXEC; - } #endif pr_debug("Looks like a total of %lu stubs, max\n", relocs); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 311e2112d782..9f764bc42b8c 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -976,7 +976,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return 0; } - +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { if (radix_enabled()) @@ -984,6 +984,7 @@ bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) return false; } +#endif int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr, unsigned long next) @@ -1120,6 +1121,19 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in pmd_t *pmd; pte_t *pte; + /* + * Make sure we align the start vmemmap addr so that we calculate + * the correct start_pfn in altmap boundary check to decided whether + * we should use altmap or RAM based backing memory allocation. Also + * the address need to be aligned for set_pte operation. + + * If the start addr is already PMD_SIZE aligned we will try to use + * a pmd mapping. We don't want to be too aggressive here beacause + * that will cause more allocations in RAM. So only if the namespace + * vmemmap start addr is PMD_SIZE aligned we will use PMD mapping. + */ + + start = ALIGN_DOWN(start, PAGE_SIZE); for (addr = start; addr < end; addr = next) { next = pmd_addr_end(addr, end); @@ -1145,8 +1159,8 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in * in altmap block allocation failures, in which case * we fallback to RAM for vmemmap allocation. */ - if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || - altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + if (!IS_ALIGNED(addr, PMD_SIZE) || (altmap && + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { /* * make sure we don't create altmap mappings * covering things outside the device. diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 3fbe0295ce14..95d7ba73d43d 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -17,7 +17,7 @@ config PPC_POWERNV select MMU_NOTIFIER select FORCE_SMP select ARCH_SUPPORTS_PER_VMA_LOCK - select PPC_RADIX_BROADCAST_TLBIE + select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU default y config OPAL_PRD diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index a934c2a262f6..fa3c2fff082a 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -23,7 +23,7 @@ config PPC_PSERIES select FORCE_SMP select SWIOTLB select ARCH_SUPPORTS_PER_VMA_LOCK - select PPC_RADIX_BROADCAST_TLBIE + select PPC_RADIX_BROADCAST_TLBIE if PPC_RADIX_MMU default y config PARAVIRT diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 98a3ecdc65f6..36061f4732b7 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -70,6 +70,7 @@ config RISCV # LLD >= 14: https://github.com/llvm/llvm-project/issues/50505 select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000 select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000 + select ARCH_SUPPORTS_MSEAL_SYSTEM_MAPPINGS if 64BIT && MMU select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU select ARCH_SUPPORTS_PER_VMA_LOCK if MMU select ARCH_SUPPORTS_RT @@ -222,6 +223,7 @@ config RISCV select THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT select UACCESS_MEMCPY if !MMU + select VDSO_GETRANDOM if HAVE_GENERIC_VDSO select USER_STACKTRACE_SUPPORT select ZONE_DMA32 if 64BIT diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index effa02c2e682..6086b38d5427 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -34,11 +34,6 @@ static inline void flush_dcache_page(struct page *page) flush_dcache_folio(page_folio(page)); } -/* - * RISC-V doesn't have an instruction to flush parts of the instruction cache, - * so instead we just flush the whole thing. - */ -#define flush_icache_range(start, end) flush_icache_all() #define flush_icache_user_page(vma, pg, addr, len) \ do { \ if (vma->vm_flags & VM_EXEC) \ @@ -78,6 +73,16 @@ void flush_icache_mm(struct mm_struct *mm, bool local); #endif /* CONFIG_SMP */ +/* + * RISC-V doesn't have an instruction to flush parts of the instruction cache, + * so instead we just flush the whole thing. + */ +#define flush_icache_range flush_icache_range +static inline void flush_icache_range(unsigned long start, unsigned long end) +{ + flush_icache_all(); +} + extern unsigned int riscv_cbom_block_size; extern unsigned int riscv_cboz_block_size; extern unsigned int riscv_cbop_block_size; diff --git a/arch/riscv/include/asm/vdso/getrandom.h b/arch/riscv/include/asm/vdso/getrandom.h new file mode 100644 index 000000000000..8dc92441702a --- /dev/null +++ b/arch/riscv/include/asm/vdso/getrandom.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. + */ +#ifndef __ASM_VDSO_GETRANDOM_H +#define __ASM_VDSO_GETRANDOM_H + +#ifndef __ASSEMBLY__ + +#include <asm/unistd.h> + +static __always_inline ssize_t getrandom_syscall(void *_buffer, size_t _len, unsigned int _flags) +{ + register long ret asm("a0"); + register long nr asm("a7") = __NR_getrandom; + register void *buffer asm("a0") = _buffer; + register size_t len asm("a1") = _len; + register unsigned int flags asm("a2") = _flags; + + asm volatile ("ecall\n" + : "+r" (ret) + : "r" (nr), "r" (buffer), "r" (len), "r" (flags) + : "memory"); + + return ret; +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* __ASM_VDSO_GETRANDOM_H */ diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 0ead29826419..7ce2307738c2 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -9,8 +9,8 @@ CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) endif -CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) -CFLAGS_compat_syscall_table.o += $(call cc-option,-Wno-override-init,) +CFLAGS_syscall_table.o += $(call cc-disable-warning, override-init) +CFLAGS_compat_syscall_table.o += $(call cc-disable-warning, override-init) ifdef CONFIG_KEXEC_CORE AFLAGS_kexec_relocate.o := -mcmodel=medany $(call cc-option,-mno-relax) diff --git a/arch/riscv/kernel/probes/uprobes.c b/arch/riscv/kernel/probes/uprobes.c index 4b3dc8beaf77..cc15f7ca6cc1 100644 --- a/arch/riscv/kernel/probes/uprobes.c +++ b/arch/riscv/kernel/probes/uprobes.c @@ -167,6 +167,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, /* Initialize the slot */ void *kaddr = kmap_atomic(page); void *dst = kaddr + (vaddr & ~PAGE_MASK); + unsigned long start = (unsigned long)dst; memcpy(dst, src, len); @@ -176,13 +177,6 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, *(uprobe_opcode_t *)dst = __BUG_INSN_32; } + flush_icache_range(start, start + len); kunmap_atomic(kaddr); - - /* - * We probably need flush_icache_user_page() but it needs vma. - * This should work on most of architectures by default. If - * architecture needs to do something different it can define - * its own version of the function. - */ - flush_dcache_page(page); } diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 7c244de77180..15d8f75902f8 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -275,6 +275,9 @@ long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg) unsigned long pmm; u8 pmlen; + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + return -EINVAL; + if (is_compat_thread(ti)) return -EINVAL; @@ -330,6 +333,9 @@ long get_tagged_addr_ctrl(struct task_struct *task) struct thread_info *ti = task_thread_info(task); long ret = 0; + if (!riscv_has_extension_unlikely(RISCV_ISA_EXT_SUPM)) + return -EINVAL; + if (is_compat_thread(ti)) return -EINVAL; diff --git a/arch/riscv/kernel/traps.c b/arch/riscv/kernel/traps.c index 8ff8e8b36524..9c83848797a7 100644 --- a/arch/riscv/kernel/traps.c +++ b/arch/riscv/kernel/traps.c @@ -198,47 +198,57 @@ asmlinkage __visible __trap_section void do_trap_insn_illegal(struct pt_regs *re DO_ERROR_INFO(do_trap_load_fault, SIGSEGV, SEGV_ACCERR, "load access fault"); -asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs) +enum misaligned_access_type { + MISALIGNED_STORE, + MISALIGNED_LOAD, +}; +static const struct { + const char *type_str; + int (*handler)(struct pt_regs *regs); +} misaligned_handler[] = { + [MISALIGNED_STORE] = { + .type_str = "Oops - store (or AMO) address misaligned", + .handler = handle_misaligned_store, + }, + [MISALIGNED_LOAD] = { + .type_str = "Oops - load address misaligned", + .handler = handle_misaligned_load, + }, +}; + +static void do_trap_misaligned(struct pt_regs *regs, enum misaligned_access_type type) { + irqentry_state_t state; + if (user_mode(regs)) { irqentry_enter_from_user_mode(regs); + local_irq_enable(); + } else { + state = irqentry_nmi_enter(regs); + } - if (handle_misaligned_load(regs)) - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - load address misaligned"); + if (misaligned_handler[type].handler(regs)) + do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, + misaligned_handler[type].type_str); + if (user_mode(regs)) { + local_irq_disable(); irqentry_exit_to_user_mode(regs); } else { - irqentry_state_t state = irqentry_nmi_enter(regs); - - if (handle_misaligned_load(regs)) - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - load address misaligned"); - irqentry_nmi_exit(regs, state); } } -asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs) +asmlinkage __visible __trap_section void do_trap_load_misaligned(struct pt_regs *regs) { - if (user_mode(regs)) { - irqentry_enter_from_user_mode(regs); - - if (handle_misaligned_store(regs)) - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - store (or AMO) address misaligned"); - - irqentry_exit_to_user_mode(regs); - } else { - irqentry_state_t state = irqentry_nmi_enter(regs); - - if (handle_misaligned_store(regs)) - do_trap_error(regs, SIGBUS, BUS_ADRALN, regs->epc, - "Oops - store (or AMO) address misaligned"); + do_trap_misaligned(regs, MISALIGNED_LOAD); +} - irqentry_nmi_exit(regs, state); - } +asmlinkage __visible __trap_section void do_trap_store_misaligned(struct pt_regs *regs) +{ + do_trap_misaligned(regs, MISALIGNED_STORE); } + DO_ERROR_INFO(do_trap_store_fault, SIGSEGV, SEGV_ACCERR, "store (or AMO) access fault"); DO_ERROR_INFO(do_trap_ecall_s, diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c index 7443632445d4..56f06a27d45f 100644 --- a/arch/riscv/kernel/traps_misaligned.c +++ b/arch/riscv/kernel/traps_misaligned.c @@ -275,7 +275,7 @@ static unsigned long get_f32_rs(unsigned long insn, u8 fp_reg_offset, int __ret; \ \ if (user_mode(regs)) { \ - __ret = __get_user(insn, (type __user *) insn_addr); \ + __ret = get_user(insn, (type __user *) insn_addr); \ } else { \ insn = *(type *)insn_addr; \ __ret = 0; \ diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index cc2895d1fbc2..3a8e038b10a2 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -136,7 +136,7 @@ static int __setup_additional_pages(struct mm_struct *mm, ret = _install_special_mapping(mm, vdso_base, vdso_text_len, - (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), + (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_SEALED_SYSMAP), vdso_info->cm); if (IS_ERR(ret)) diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile index 4a5d131506fc..9ebb5e590f93 100644 --- a/arch/riscv/kernel/vdso/Makefile +++ b/arch/riscv/kernel/vdso/Makefile @@ -13,9 +13,17 @@ vdso-syms += flush_icache vdso-syms += hwprobe vdso-syms += sys_hwprobe +ifdef CONFIG_VDSO_GETRANDOM +vdso-syms += getrandom +endif + # Files to link into the vdso obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o +ifdef CONFIG_VDSO_GETRANDOM +obj-vdso += vgetrandom-chacha.o +endif + ccflags-y := -fno-stack-protector ccflags-y += -DDISABLE_BRANCH_PROFILING ccflags-y += -fno-builtin @@ -24,6 +32,10 @@ ifneq ($(c-gettimeofday-y),) CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y) endif +ifneq ($(c-getrandom-y),) + CFLAGS_getrandom.o += -fPIC -include $(c-getrandom-y) +endif + CFLAGS_hwprobe.o += -fPIC # Build rules @@ -38,6 +50,7 @@ endif # Disable -pg to prevent insert call site CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) +CFLAGS_REMOVE_getrandom.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) CFLAGS_REMOVE_hwprobe.o = $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) # Force dependency diff --git a/arch/riscv/kernel/vdso/getrandom.c b/arch/riscv/kernel/vdso/getrandom.c new file mode 100644 index 000000000000..f21922e8cebd --- /dev/null +++ b/arch/riscv/kernel/vdso/getrandom.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. + */ +#include <linux/types.h> + +ssize_t __vdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len) +{ + return __cvdso_getrandom(buffer, len, flags, opaque_state, opaque_len); +} diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index 8e86965a8aae..7c15b0f4ee3b 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -80,6 +80,9 @@ VERSION #ifndef COMPAT_VDSO __vdso_riscv_hwprobe; #endif +#if defined(CONFIG_VDSO_GETRANDOM) && !defined(COMPAT_VDSO) + __vdso_getrandom; +#endif local: *; }; } diff --git a/arch/riscv/kernel/vdso/vgetrandom-chacha.S b/arch/riscv/kernel/vdso/vgetrandom-chacha.S new file mode 100644 index 000000000000..5f0dad8f2373 --- /dev/null +++ b/arch/riscv/kernel/vdso/vgetrandom-chacha.S @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2025 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved. + * + * Based on arch/loongarch/vdso/vgetrandom-chacha.S. + */ + +#include <asm/asm.h> +#include <linux/linkage.h> + +.text + +.macro ROTRI rd rs imm + slliw t0, \rs, 32 - \imm + srliw \rd, \rs, \imm + or \rd, \rd, t0 +.endm + +.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3 + \op \d0, \d0, \s0 + \op \d1, \d1, \s1 + \op \d2, \d2, \s2 + \op \d3, \d3, \s3 +.endm + +/* + * a0: output bytes + * a1: 32-byte key input + * a2: 8-byte counter input/output + * a3: number of 64-byte blocks to write to output + */ +SYM_FUNC_START(__arch_chacha20_blocks_nostack) + +#define output a0 +#define key a1 +#define counter a2 +#define nblocks a3 +#define i a4 +#define state0 s0 +#define state1 s1 +#define state2 s2 +#define state3 s3 +#define state4 s4 +#define state5 s5 +#define state6 s6 +#define state7 s7 +#define state8 s8 +#define state9 s9 +#define state10 s10 +#define state11 s11 +#define state12 a5 +#define state13 a6 +#define state14 a7 +#define state15 t1 +#define cnt t2 +#define copy0 t3 +#define copy1 t4 +#define copy2 t5 +#define copy3 t6 + +/* Packs to be used with OP_4REG */ +#define line0 state0, state1, state2, state3 +#define line1 state4, state5, state6, state7 +#define line2 state8, state9, state10, state11 +#define line3 state12, state13, state14, state15 + +#define line1_perm state5, state6, state7, state4 +#define line2_perm state10, state11, state8, state9 +#define line3_perm state15, state12, state13, state14 + +#define copy copy0, copy1, copy2, copy3 + +#define _16 16, 16, 16, 16 +#define _20 20, 20, 20, 20 +#define _24 24, 24, 24, 24 +#define _25 25, 25, 25, 25 + + /* + * The ABI requires s0-s9 saved. + * This does not violate the stack-less requirement: no sensitive data + * is spilled onto the stack. + */ + addi sp, sp, -12*SZREG + REG_S s0, (sp) + REG_S s1, SZREG(sp) + REG_S s2, 2*SZREG(sp) + REG_S s3, 3*SZREG(sp) + REG_S s4, 4*SZREG(sp) + REG_S s5, 5*SZREG(sp) + REG_S s6, 6*SZREG(sp) + REG_S s7, 7*SZREG(sp) + REG_S s8, 8*SZREG(sp) + REG_S s9, 9*SZREG(sp) + REG_S s10, 10*SZREG(sp) + REG_S s11, 11*SZREG(sp) + + ld cnt, (counter) + + li copy0, 0x61707865 + li copy1, 0x3320646e + li copy2, 0x79622d32 + li copy3, 0x6b206574 + +.Lblock: + /* state[0,1,2,3] = "expand 32-byte k" */ + mv state0, copy0 + mv state1, copy1 + mv state2, copy2 + mv state3, copy3 + + /* state[4,5,..,11] = key */ + lw state4, (key) + lw state5, 4(key) + lw state6, 8(key) + lw state7, 12(key) + lw state8, 16(key) + lw state9, 20(key) + lw state10, 24(key) + lw state11, 28(key) + + /* state[12,13] = counter */ + mv state12, cnt + srli state13, cnt, 32 + + /* state[14,15] = 0 */ + mv state14, zero + mv state15, zero + + li i, 10 +.Lpermute: + /* odd round */ + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _16 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _20 + + OP_4REG addw line0, line1 + OP_4REG xor line3, line0 + OP_4REG ROTRI line3, _24 + + OP_4REG addw line2, line3 + OP_4REG xor line1, line2 + OP_4REG ROTRI line1, _25 + + /* even round */ + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _16 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _20 + + OP_4REG addw line0, line1_perm + OP_4REG xor line3_perm, line0 + OP_4REG ROTRI line3_perm, _24 + + OP_4REG addw line2_perm, line3_perm + OP_4REG xor line1_perm, line2_perm + OP_4REG ROTRI line1_perm, _25 + + addi i, i, -1 + bnez i, .Lpermute + + /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */ + OP_4REG addw line0, copy + sw state0, (output) + sw state1, 4(output) + sw state2, 8(output) + sw state3, 12(output) + + /* from now on state[0,1,2,3] are scratch registers */ + + /* state[0,1,2,3] = lo(key) */ + lw state0, (key) + lw state1, 4(key) + lw state2, 8(key) + lw state3, 12(key) + + /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */ + OP_4REG addw line1, line0 + sw state4, 16(output) + sw state5, 20(output) + sw state6, 24(output) + sw state7, 28(output) + + /* state[0,1,2,3] = hi(key) */ + lw state0, 16(key) + lw state1, 20(key) + lw state2, 24(key) + lw state3, 28(key) + + /* output[8,9,10,11] = tmp[0,1,2,3] + state[8,9,10,11] */ + OP_4REG addw line2, line0 + sw state8, 32(output) + sw state9, 36(output) + sw state10, 40(output) + sw state11, 44(output) + + /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */ + addw state12, state12, cnt + srli state0, cnt, 32 + addw state13, state13, state0 + sw state12, 48(output) + sw state13, 52(output) + sw state14, 56(output) + sw state15, 60(output) + + /* ++counter */ + addi cnt, cnt, 1 + + /* output += 64 */ + addi output, output, 64 + /* --nblocks */ + addi nblocks, nblocks, -1 + bnez nblocks, .Lblock + + /* counter = [cnt_lo, cnt_hi] */ + sd cnt, (counter) + + /* Zero out the potentially sensitive regs, in case nothing uses these + * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and + * state[0,...,11] are s0-s11 those we'll restore in the epilogue, we + * only need to zero state[12,...,15]. + */ + mv state12, zero + mv state13, zero + mv state14, zero + mv state15, zero + + REG_L s0, (sp) + REG_L s1, SZREG(sp) + REG_L s2, 2*SZREG(sp) + REG_L s3, 3*SZREG(sp) + REG_L s4, 4*SZREG(sp) + REG_L s5, 5*SZREG(sp) + REG_L s6, 6*SZREG(sp) + REG_L s7, 7*SZREG(sp) + REG_L s8, 8*SZREG(sp) + REG_L s9, 9*SZREG(sp) + REG_L s10, 10*SZREG(sp) + REG_L s11, 11*SZREG(sp) + addi sp, sp, 12*SZREG + + ret +SYM_FUNC_END(__arch_chacha20_blocks_nostack) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 60d684c76c58..02635bac91f1 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -77,6 +77,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) memcpy(cntx, reset_cntx, sizeof(*cntx)); spin_unlock(&vcpu->arch.reset_cntx_lock); + memset(&vcpu->arch.smstateen_csr, 0, sizeof(vcpu->arch.smstateen_csr)); + kvm_riscv_vcpu_fp_reset(vcpu); kvm_riscv_vcpu_vector_reset(vcpu); diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c index 97c8fde3cbfe..e737ba7949b1 100644 --- a/arch/riscv/mm/tlbflush.c +++ b/arch/riscv/mm/tlbflush.c @@ -7,6 +7,27 @@ #include <linux/mmu_notifier.h> #include <asm/sbi.h> #include <asm/mmu_context.h> +#include <asm/cpufeature.h> + +#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL) + +static inline void local_sfence_inval_ir(void) +{ + asm volatile(SFENCE_INVAL_IR() ::: "memory"); +} + +static inline void local_sfence_w_inval(void) +{ + asm volatile(SFENCE_W_INVAL() ::: "memory"); +} + +static inline void local_sinval_vma(unsigned long vma, unsigned long asid) +{ + if (asid != FLUSH_TLB_NO_ASID) + asm volatile(SINVAL_VMA(%0, %1) : : "r" (vma), "r" (asid) : "memory"); + else + asm volatile(SINVAL_VMA(%0, zero) : : "r" (vma) : "memory"); +} /* * Flush entire TLB if number of entries to be flushed is greater @@ -27,6 +48,16 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start, return; } + if (has_svinval()) { + local_sfence_w_inval(); + for (i = 0; i < nr_ptes_in_range; ++i) { + local_sinval_vma(start, asid); + start += stride; + } + local_sfence_inval_ir(); + return; + } + for (i = 0; i < nr_ptes_in_range; ++i) { local_flush_tlb_page_asid(start, asid); start += stride; diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 6f2c9ce1b154..24b22f6a9e99 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -38,7 +38,6 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y @@ -92,7 +91,6 @@ CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y CONFIG_BINFMT_MISC=m CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=y CONFIG_ZSMALLOC_STAT=y CONFIG_SLAB_BUCKETS=y CONFIG_SLUB_STATS=y @@ -395,6 +393,9 @@ CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m @@ -405,6 +406,9 @@ CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_GATE=m CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y @@ -628,8 +632,16 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_MEM=m CONFIG_VIRTIO_INPUT=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m +CONFIG_MLX5_VDPA_NET=m +CONFIG_VP_VDPA=m CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -654,7 +666,6 @@ CONFIG_NILFS2_FS=m CONFIG_BCACHEFS_FS=y CONFIG_BCACHEFS_QUOTA=y CONFIG_BCACHEFS_POSIX_ACL=y -CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y CONFIG_FS_VERITY=y @@ -724,11 +735,10 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_LOCKDOWN_LSM=y @@ -741,6 +751,8 @@ CONFIG_IMA=y CONFIG_IMA_DEFAULT_HASH_SHA256=y CONFIG_IMA_WRITE_POLICY=y CONFIG_IMA_APPRAISE=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_HARDENED_USERCOPY=y CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_CRYPTO_USER=m # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set @@ -756,7 +768,6 @@ CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m @@ -801,7 +812,6 @@ CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_CHACHA_S390=m CONFIG_CRYPTO_HMAC_S390=m CONFIG_ZCRYPT=m CONFIG_PKEY=m @@ -812,9 +822,9 @@ CONFIG_PKEY_UV=m CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CRYPTO_KRB5=m +CONFIG_CRYPTO_KRB5_SELFTESTS=y CONFIG_CORDIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_RANDOM32_SELFTEST=y CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index f18a7d97ac21..2b8b42d569bc 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -36,7 +36,6 @@ CONFIG_USER_NS=y CONFIG_CHECKPOINT_RESTORE=y CONFIG_SCHED_AUTOGROUP=y CONFIG_EXPERT=y -# CONFIG_SYSFS_SYSCALL is not set CONFIG_PROFILING=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y @@ -86,7 +85,6 @@ CONFIG_UNIXWARE_DISKLABEL=y CONFIG_IOSCHED_BFQ=y CONFIG_BINFMT_MISC=m CONFIG_ZSWAP=y -CONFIG_ZSMALLOC=y CONFIG_ZSMALLOC_STAT=y CONFIG_SLAB_BUCKETS=y # CONFIG_COMPAT_BRK is not set @@ -385,6 +383,9 @@ CONFIG_CLS_U32_MARK=y CONFIG_NET_CLS_FLOW=m CONFIG_NET_CLS_CGROUP=y CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y CONFIG_NET_CLS_ACT=y CONFIG_NET_ACT_POLICE=m CONFIG_NET_ACT_GACT=m @@ -395,6 +396,9 @@ CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m CONFIG_NET_ACT_SKBEDIT=m CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m CONFIG_NET_ACT_GATE=m CONFIG_NET_TC_SKB_EXT=y CONFIG_DNS_RESOLVER=y @@ -618,8 +622,16 @@ CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_MEM=m CONFIG_VIRTIO_INPUT=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_VDPA_SIM_NET=m +CONFIG_VDPA_SIM_BLOCK=m +CONFIG_VDPA_USER=m +CONFIG_MLX5_VDPA_NET=m +CONFIG_VP_VDPA=m CONFIG_VHOST_NET=m CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y @@ -641,7 +653,6 @@ CONFIG_NILFS2_FS=m CONFIG_BCACHEFS_FS=m CONFIG_BCACHEFS_QUOTA=y CONFIG_BCACHEFS_POSIX_ACL=y -CONFIG_FS_DAX=y CONFIG_EXPORTFS_BLOCK_OPS=y CONFIG_FS_ENCRYPTION=y CONFIG_FS_VERITY=y @@ -711,6 +722,7 @@ CONFIG_NLS_UTF8=m CONFIG_DLM=m CONFIG_UNICODE=y CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_KEY_NOTIFICATIONS=y CONFIG_SECURITY=y @@ -742,7 +754,6 @@ CONFIG_CRYPTO_AES_TI=m CONFIG_CRYPTO_ANUBIS=m CONFIG_CRYPTO_ARIA=m CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_CAMELLIA=m CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_DES=m @@ -788,7 +799,6 @@ CONFIG_CRYPTO_SHA3_512_S390=m CONFIG_CRYPTO_GHASH_S390=m CONFIG_CRYPTO_AES_S390=m CONFIG_CRYPTO_DES_S390=m -CONFIG_CRYPTO_CHACHA_S390=m CONFIG_CRYPTO_HMAC_S390=m CONFIG_ZCRYPT=m CONFIG_PKEY=m @@ -799,10 +809,10 @@ CONFIG_PKEY_UV=m CONFIG_CRYPTO_PAES_S390=m CONFIG_CRYPTO_DEV_VIRTIO=m CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_CRYPTO_KRB5=m +CONFIG_CRYPTO_KRB5_SELFTESTS=y CONFIG_CORDIC=m CONFIG_PRIME_NUMBERS=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m CONFIG_XZ_DEC_MICROLZMA=y CONFIG_DMA_CMA=y CONFIG_CMA_SIZE_MBYTES=0 diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 853b2326a171..8163c1702720 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -70,7 +70,6 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_INFO_DWARF4=y CONFIG_DEBUG_FS=y CONFIG_PANIC_ON_OOPS=y -# CONFIG_SCHED_DEBUG is not set CONFIG_RCU_CPU_STALL_TIMEOUT=60 # CONFIG_RCU_TRACE is not set # CONFIG_FTRACE is not set diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index dd291c9ad6a6..9980c17ba22d 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -602,7 +602,8 @@ SYM_CODE_START(stack_invalid) stmg %r0,%r7,__PT_R0(%r11) stmg %r8,%r9,__PT_PSW(%r11) mvc __PT_R8(64,%r11),0(%r14) - stg %r10,__PT_ORIG_GPR2(%r11) # store last break to orig_gpr2 + GET_LC %r2 + mvc __PT_ORIG_GPR2(8,%r11),__LC_PGM_LAST_BREAK(%r2) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs jg kernel_stack_invalid diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 9a929bbcc397..241f7251c873 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -428,6 +428,8 @@ static void __clp_add(struct clp_fh_list_entry *entry, void *data) return; } zdev = zpci_create_device(entry->fid, entry->fh, entry->config_state); + if (IS_ERR(zdev)) + return; list_add_tail(&zdev->entry, scan_list); } diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 3a08f9029a3f..1c6e0ae41b0c 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -55,6 +55,7 @@ do { \ goto err_label; \ } \ *((type *)dst) = get_unaligned((type *)(src)); \ + barrier(); \ current->thread.segv_continue = NULL; \ } while (0) @@ -66,6 +67,7 @@ do { \ if (__faulted) \ goto err_label; \ put_unaligned(*((type *)src), (type *)(dst)); \ + barrier(); \ current->thread.segv_continue = NULL; \ } while (0) diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index ce073150dc20..ef2272e92a43 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -225,20 +225,20 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, panic("Failed to sync kernel TLBs: %d", err); goto out; } - else if (current->mm == NULL) { - if (current->pagefault_disabled) { - if (!mc) { - show_regs(container_of(regs, struct pt_regs, regs)); - panic("Segfault with pagefaults disabled but no mcontext"); - } - if (!current->thread.segv_continue) { - show_regs(container_of(regs, struct pt_regs, regs)); - panic("Segfault without recovery target"); - } - mc_set_rip(mc, current->thread.segv_continue); - current->thread.segv_continue = NULL; - goto out; + else if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; + } + else if (current->mm == NULL) { show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4b9f378e05f6..5873c9e39919 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2368,6 +2368,7 @@ config STRICT_SIGALTSTACK_SIZE config CFI_AUTO_DEFAULT bool "Attempt to use FineIBT by default at boot time" depends on FINEIBT + depends on !RUST || RUSTC_VERSION >= 108800 default y help Attempt to use FineIBT by default at boot time. If enabled, diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 81f55da81967..640fcac3af74 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -59,7 +59,7 @@ KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH) $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ - cmd_image = cp $< $@; truncate -s %4K $@; cat $(obj)/vmlinux.bin >>$@ + cmd_image = (dd if=$< bs=4k conv=sync status=none; cat $(filter-out $<,$(real-prereqs))) >$@ $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE $(call if_changed,image) diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c index f676156d9f3d..0e9f84ab4bdc 100644 --- a/arch/x86/boot/compressed/mem.c +++ b/arch/x86/boot/compressed/mem.c @@ -34,14 +34,11 @@ static bool early_is_tdx_guest(void) void arch_accept_memory(phys_addr_t start, phys_addr_t end) { - static bool sevsnp; - /* Platform-specific memory-acceptance call goes here */ if (early_is_tdx_guest()) { if (!tdx_accept_memory(start, end)) panic("TDX: Failed to accept memory\n"); - } else if (sevsnp || (sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED)) { - sevsnp = true; + } else if (early_is_sevsnp_guest()) { snp_accept_memory(start, end); } else { error("Cannot accept memory: unknown platform\n"); diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 89ba168f4f0f..0003e4416efd 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -645,3 +645,43 @@ void sev_prep_identity_maps(unsigned long top_level_pgt) sev_verify_cbit(top_level_pgt); } + +bool early_is_sevsnp_guest(void) +{ + static bool sevsnp; + + if (sevsnp) + return true; + + if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED)) + return false; + + sevsnp = true; + + if (!snp_vmpl) { + unsigned int eax, ebx, ecx, edx; + + /* + * CPUID Fn8000_001F_EAX[28] - SVSM support + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax & BIT(28)) { + struct msr m; + + /* Obtain the address of the calling area to use */ + boot_rdmsr(MSR_SVSM_CAA, &m); + boot_svsm_caa = (void *)m.q; + boot_svsm_caa_pa = m.q; + + /* + * The real VMPL level cannot be discovered, but the + * memory acceptance routines make no use of that so + * any non-zero value suffices here. + */ + snp_vmpl = U8_MAX; + } + } + return true; +} diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h index 4e463f33186d..d3900384b8ab 100644 --- a/arch/x86/boot/compressed/sev.h +++ b/arch/x86/boot/compressed/sev.h @@ -13,12 +13,14 @@ bool sev_snp_enabled(void); void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 sev_get_status(void); +bool early_is_sevsnp_guest(void); #else static inline bool sev_snp_enabled(void) { return false; } static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 sev_get_status(void) { return 0; } +static inline bool early_is_sevsnp_guest(void) { return false; } #endif diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6866cc5acb0b..139ad80d1df3 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -629,7 +629,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); - if (!event->attr.freq && x86_pmu.limit_period) { + if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) @@ -754,7 +754,7 @@ void x86_pmu_enable_all(int added) } } -static inline int is_x86_event(struct perf_event *event) +int is_x86_event(struct perf_event *event) { int i; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 00dfe487bd00..c5f385413392 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4395,7 +4395,7 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) arr[pebs_enable] = (struct perf_guest_switch_msr){ .msr = MSR_IA32_PEBS_ENABLE, .host = cpuc->pebs_enabled & ~cpuc->intel_ctrl_guest_mask, - .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask, + .guest = pebs_mask & ~cpuc->intel_ctrl_host_mask & kvm_pmu->pebs_enable, }; if (arr[pebs_enable].host) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 18c3ab579b8b..9b20acc0e932 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -2379,8 +2379,25 @@ __intel_pmu_pebs_last_event(struct perf_event *event, */ intel_pmu_save_and_restart_reload(event, count); } - } else - intel_pmu_save_and_restart(event); + } else { + /* + * For a non-precise event, it's possible the + * counters-snapshotting records a positive value for the + * overflowed event. Then the HW auto-reload mechanism + * reset the counter to 0 immediately, because the + * pebs_event_reset is cleared if the PERF_X86_EVENT_AUTO_RELOAD + * is not set. The counter backwards may be observed in a + * PMI handler. + * + * Since the event value has been updated when processing the + * counters-snapshotting record, only needs to set the new + * period for the counter. + */ + if (is_pebs_counter_event_group(event)) + static_call(x86_pmu_set_period)(event); + else + intel_pmu_save_and_restart(event); + } } static __always_inline void diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 2c0ce0e9545e..46d120597bab 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -110,14 +110,21 @@ static inline bool is_topdown_event(struct perf_event *event) return is_metric_event(event) || is_slots_event(event); } +int is_x86_event(struct perf_event *event); + +static inline bool check_leader_group(struct perf_event *leader, int flags) +{ + return is_x86_event(leader) ? !!(leader->hw.flags & flags) : false; +} + static inline bool is_branch_counters_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_BRANCH_COUNTERS; + return check_leader_group(event->group_leader, PERF_X86_EVENT_BRANCH_COUNTERS); } static inline bool is_pebs_counter_event_group(struct perf_event *event) { - return event->group_leader->hw.flags & PERF_X86_EVENT_PEBS_CNTR; + return check_leader_group(event->group_leader, PERF_X86_EVENT_PEBS_CNTR); } struct amd_nb { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3bdae454a959..7bc174a1f1cb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,7 @@ #include <asm/mtrr.h> #include <asm/msr-index.h> #include <asm/asm.h> +#include <asm/irq_remapping.h> #include <asm/kvm_page_track.h> #include <asm/kvm_vcpu_regs.h> #include <asm/reboot.h> @@ -2423,4 +2424,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); */ #define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) +static inline bool kvm_arch_has_irq_bypass(void) +{ + return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); +} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 695e569159c1..be7cddc414e4 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -17,10 +17,12 @@ struct ucode_cpu_info { void load_ucode_bsp(void); void load_ucode_ap(void); void microcode_bsp_resume(void); +bool __init microcode_loader_disabled(void); #else static inline void load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void microcode_bsp_resume(void) { } +static inline bool __init microcode_loader_disabled(void) { return false; } #endif extern unsigned long initrd_start_early; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index a33147520044..c88691b15f3c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -6,6 +6,8 @@ #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> +#include <asm/cpufeature.h> + #define __HAVE_ARCH_PTE_ALLOC_ONE #define __HAVE_ARCH_PGD_FREE #include <asm-generic/pgalloc.h> @@ -29,16 +31,17 @@ static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * Instead of one PGD, we acquire two PGDs. Being order-1, it is - * both 8k in size and 8k-aligned. That lets us just flip bit 12 - * in a pointer to swap between the two 4k halves. + * In case of Page Table Isolation active, we acquire two PGDs instead of one. + * Being order-1, it is both 8k in size and 8k-aligned. That lets us just + * flip bit 12 in a pointer to swap between the two 4k halves. */ -#define PGD_ALLOCATION_ORDER 1 -#else -#define PGD_ALLOCATION_ORDER 0 -#endif +static inline unsigned int pgd_allocation_order(void) +{ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + return 1; + return 0; +} /* * Allocate and free page tables. diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 4a10d35e70aa..96cb992d50ef 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -1098,15 +1098,17 @@ static enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t siz static int __init save_microcode_in_initrd(void) { - unsigned int cpuid_1_eax = native_cpuid_eax(1); struct cpuinfo_x86 *c = &boot_cpu_data; struct cont_desc desc = { 0 }; + unsigned int cpuid_1_eax; enum ucode_state ret; struct cpio_data cp; - if (dis_ucode_ldr || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) + if (microcode_loader_disabled() || c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) return 0; + cpuid_1_eax = native_cpuid_eax(1); + if (!find_blobs_in_containers(&cp)) return -EINVAL; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b3658d11e7b6..079f046ee26d 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -41,8 +41,8 @@ #include "internal.h" -static struct microcode_ops *microcode_ops; -bool dis_ucode_ldr = true; +static struct microcode_ops *microcode_ops; +static bool dis_ucode_ldr = false; bool force_minrev = IS_ENABLED(CONFIG_MICROCODE_LATE_FORCE_MINREV); module_param(force_minrev, bool, S_IRUSR | S_IWUSR); @@ -84,6 +84,9 @@ static bool amd_check_current_patch_level(void) u32 lvl, dummy, i; u32 *levels; + if (x86_cpuid_vendor() != X86_VENDOR_AMD) + return false; + native_rdmsr(MSR_AMD64_PATCH_LEVEL, lvl, dummy); levels = final_levels; @@ -95,27 +98,29 @@ static bool amd_check_current_patch_level(void) return false; } -static bool __init check_loader_disabled_bsp(void) +bool __init microcode_loader_disabled(void) { - static const char *__dis_opt_str = "dis_ucode_ldr"; - const char *cmdline = boot_command_line; - const char *option = __dis_opt_str; + if (dis_ucode_ldr) + return true; /* - * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not - * completely accurate as xen pv guests don't see that CPUID bit set but - * that's good enough as they don't land on the BSP path anyway. + * Disable when: + * + * 1) The CPU does not support CPUID. + * + * 2) Bit 31 in CPUID[1]:ECX is clear + * The bit is reserved for hypervisor use. This is still not + * completely accurate as XEN PV guests don't see that CPUID bit + * set, but that's good enough as they don't land on the BSP + * path anyway. + * + * 3) Certain AMD patch levels are not allowed to be + * overwritten. */ - if (native_cpuid_ecx(1) & BIT(31)) - return true; - - if (x86_cpuid_vendor() == X86_VENDOR_AMD) { - if (amd_check_current_patch_level()) - return true; - } - - if (cmdline_find_option_bool(cmdline, option) <= 0) - dis_ucode_ldr = false; + if (!have_cpuid_p() || + native_cpuid_ecx(1) & BIT(31) || + amd_check_current_patch_level()) + dis_ucode_ldr = true; return dis_ucode_ldr; } @@ -125,7 +130,10 @@ void __init load_ucode_bsp(void) unsigned int cpuid_1_eax; bool intel = true; - if (!have_cpuid_p()) + if (cmdline_find_option_bool(boot_command_line, "dis_ucode_ldr") > 0) + dis_ucode_ldr = true; + + if (microcode_loader_disabled()) return; cpuid_1_eax = native_cpuid_eax(1); @@ -146,9 +154,6 @@ void __init load_ucode_bsp(void) return; } - if (check_loader_disabled_bsp()) - return; - if (intel) load_ucode_intel_bsp(&early_data); else @@ -159,6 +164,11 @@ void load_ucode_ap(void) { unsigned int cpuid_1_eax; + /* + * Can't use microcode_loader_disabled() here - .init section + * hell. It doesn't have to either - the BSP variant must've + * parsed cmdline already anyway. + */ if (dis_ucode_ldr) return; @@ -810,7 +820,7 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &boot_cpu_data; int error; - if (dis_ucode_ldr) + if (microcode_loader_disabled()) return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 819199bc0119..2a397da43923 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -389,7 +389,7 @@ static int __init save_builtin_microcode(void) if (xchg(&ucode_patch_va, NULL) != UCODE_BSP_LOADED) return 0; - if (dis_ucode_ldr || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + if (microcode_loader_disabled() || boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return 0; uci.mc = get_microcode_blob(&uci, true); diff --git a/arch/x86/kernel/cpu/microcode/internal.h b/arch/x86/kernel/cpu/microcode/internal.h index 5df621752fef..50a9702ae4e2 100644 --- a/arch/x86/kernel/cpu/microcode/internal.h +++ b/arch/x86/kernel/cpu/microcode/internal.h @@ -94,7 +94,6 @@ static inline unsigned int x86_cpuid_family(void) return x86_family(eax); } -extern bool dis_ucode_ldr; extern bool force_minrev; #ifdef CONFIG_CPU_SUP_AMD diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9d8dd8deb2a7..9920122018a0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1299,6 +1299,14 @@ void __init e820__memblock_setup(void) memblock_add(entry->addr, entry->size); } + /* + * 32-bit systems are limited to 4BG of memory even with HIGHMEM and + * to even less without it. + * Discard memory after max_pfn - the actual limit detected at runtime. + */ + if (IS_ENABLED(CONFIG_X86_32)) + memblock_remove(PFN_PHYS(max_pfn), -1); + /* Throw away partial pages: */ memblock_trim_memory(PAGE_SIZE); diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index de001b2146ab..375f2d7f1762 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -145,10 +145,6 @@ void __init __no_stack_protector mk_early_pgtbl_32(void) *ptr = (unsigned long)ptep + PAGE_OFFSET; #ifdef CONFIG_MICROCODE_INITRD32 - /* Running on a hypervisor? */ - if (native_cpuid_ecx(1) & BIT(31)) - return; - params = (struct boot_params *)__pa_nodebug(&boot_params); if (!params->hdr.ramdisk_size || !params->hdr.ramdisk_image) return; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 80265162aeff..1f325304c4a8 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -42,7 +42,7 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { - free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); + free_pages((unsigned long)image->arch.pgd, pgd_allocation_order()); image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); @@ -59,7 +59,7 @@ static void machine_kexec_free_page_tables(struct kimage *image) static int machine_kexec_alloc_page_tables(struct kimage *image) { image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PGD_ALLOCATION_ORDER); + pgd_allocation_order()); #ifdef CONFIG_X86_PAE image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ccdc45e5b759..aa4d0221583c 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -466,10 +466,18 @@ SECTIONS } /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + * COMPILE_TEST kernels can be large - CONFIG_KASAN, for example, can cause + * this. Let's assume that nobody will be running a COMPILE_TEST kernel and + * let's assert that fuller build coverage is more valuable than being able to + * run a COMPILE_TEST kernel. + */ +#ifndef CONFIG_COMPILE_TEST +/* + * The ASSERT() sync to . is intentional, for binutils 2.14 compatibility: */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); +#endif /* needed for Clang - see arch/x86/entry/entry.S */ PROVIDE(__ref_stack_chk_guard = __stack_chk_guard); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 050a0e229a4d..f2b36d32ef40 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -104,6 +104,9 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { + if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) + kvm_mmu_free_obsolete_roots(vcpu); + /* * Checking root.hpa is sufficient even when KVM has mirror root. * We can have either: diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 63bb77ee1bb1..8d1b632e33d2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5974,6 +5974,7 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.root_mmu); __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } +EXPORT_SYMBOL_GPL(kvm_mmu_free_obsolete_roots); static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) @@ -7669,9 +7670,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_memory_slot *slot = range->slot; + int level; + /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM @@ -7686,6 +7708,38 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; + if (WARN_ON_ONCE(range->end <= range->start)) + return false; + + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); + + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); + } + /* Unmap the old attribute page. */ if (range->arg.attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE) range->attr_filter = KVM_FILTER_SHARED; @@ -7695,23 +7749,7 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, return kvm_unmap_gfn_range(kvm, range); } -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; -} - -static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; -} -static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; -} static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) diff --git a/arch/x86/kvm/smm.c b/arch/x86/kvm/smm.c index 699e551ec93b..9864c057187d 100644 --- a/arch/x86/kvm/smm.c +++ b/arch/x86/kvm/smm.c @@ -131,6 +131,7 @@ void kvm_smm_changed(struct kvm_vcpu *vcpu, bool entering_smm) kvm_mmu_reset_context(vcpu); } +EXPORT_SYMBOL_GPL(kvm_smm_changed); void process_smi(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 65fd245a9953..7338879d1c0c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -796,12 +796,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; u64 entry; + if (WARN_ON_ONCE(!pi->ir_data)) + return -EINVAL; + /** * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { + if (pi->prev_ga_tag) { struct kvm *kvm = svm->vcpu.kvm; u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); @@ -820,7 +823,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; @@ -896,10 +899,10 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", @@ -933,6 +936,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -951,33 +956,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -993,6 +971,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. + */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0bc708ee2788..a7a7dc507336 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3173,9 +3173,14 @@ skip_vmsa_free: kvfree(svm->sev_es.ghcb_sa); } +static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) +{ + return (((u64)control->exit_code_hi) << 32) | control->exit_code; +} + static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->sev_es.ghcb; + struct vmcb_control_area *control = &svm->vmcb->control; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -3184,18 +3189,24 @@ static void dump_ghcb(struct vcpu_svm *svm) return; } - nbits = sizeof(ghcb->save.valid_bitmap) * 8; + nbits = sizeof(svm->sev_es.valid_bitmap) * 8; - pr_err("GHCB (GPA=%016llx):\n", svm->vmcb->control.ghcb_gpa); + /* + * Print KVM's snapshot of the GHCB values that were (unsuccessfully) + * used to handle the exit. If the guest has since modified the GHCB + * itself, dumping the raw GHCB won't help debug why KVM was unable to + * handle the VMGEXIT that KVM observed. + */ + pr_err("GHCB (GPA=%016llx) snapshot:\n", svm->vmcb->control.ghcb_gpa); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_code", - ghcb->save.sw_exit_code, ghcb_sw_exit_code_is_valid(ghcb)); + kvm_ghcb_get_sw_exit_code(control), kvm_ghcb_sw_exit_code_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_1", - ghcb->save.sw_exit_info_1, ghcb_sw_exit_info_1_is_valid(ghcb)); + control->exit_info_1, kvm_ghcb_sw_exit_info_1_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_exit_info_2", - ghcb->save.sw_exit_info_2, ghcb_sw_exit_info_2_is_valid(ghcb)); + control->exit_info_2, kvm_ghcb_sw_exit_info_2_is_valid(svm)); pr_err("%-20s%016llx is_valid: %u\n", "sw_scratch", - ghcb->save.sw_scratch, ghcb_sw_scratch_is_valid(ghcb)); - pr_err("%-20s%*pb\n", "valid_bitmap", nbits, ghcb->save.valid_bitmap); + svm->sev_es.sw_scratch, kvm_ghcb_sw_scratch_is_valid(svm)); + pr_err("%-20s%*pb\n", "valid_bitmap", nbits, svm->sev_es.valid_bitmap); } static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) @@ -3266,11 +3277,6 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) memset(ghcb->save.valid_bitmap, 0, sizeof(ghcb->save.valid_bitmap)); } -static u64 kvm_ghcb_get_sw_exit_code(struct vmcb_control_area *control) -{ - return (((u64)control->exit_code_hi) << 32) | control->exit_code; -} - static int sev_es_validate_vmgexit(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d5d0c5c3300b..a89c271a1951 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -607,9 +607,6 @@ static void svm_disable_virtualization_cpu(void) kvm_cpu_svm_disable(); amd_pmu_disable_virt(); - - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); } static int svm_enable_virtualization_cpu(void) @@ -687,9 +684,6 @@ static int svm_enable_virtualization_cpu(void) rdmsr(MSR_TSC_AUX, sev_es_host_save_area(sd)->tsc_aux, msr_hi); } - if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) - msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); - return 0; } @@ -1518,6 +1512,63 @@ static void svm_vcpu_free(struct kvm_vcpu *vcpu) __free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE)); } +#ifdef CONFIG_CPU_MITIGATIONS +static DEFINE_SPINLOCK(srso_lock); +static atomic_t srso_nr_vms; + +static void svm_srso_clear_bp_spec_reduce(void *ign) +{ + struct svm_cpu_data *sd = this_cpu_ptr(&svm_data); + + if (!sd->bp_spec_reduce_set) + return; + + msr_clear_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + sd->bp_spec_reduce_set = false; +} + +static void svm_srso_vm_destroy(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + if (atomic_dec_return(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + /* + * Verify a new VM didn't come along, acquire the lock, and increment + * the count before this task acquired the lock. + */ + if (atomic_read(&srso_nr_vms)) + return; + + on_each_cpu(svm_srso_clear_bp_spec_reduce, NULL, 1); +} + +static void svm_srso_vm_init(void) +{ + if (!cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) + return; + + /* + * Acquire the lock on 0 => 1 transitions to ensure a potential 1 => 0 + * transition, i.e. destroying the last VM, is fully complete, e.g. so + * that a delayed IPI doesn't clear BP_SPEC_REDUCE after a vCPU runs. + */ + if (atomic_inc_not_zero(&srso_nr_vms)) + return; + + guard(spinlock)(&srso_lock); + + atomic_inc(&srso_nr_vms); +} +#else +static void svm_srso_vm_init(void) { } +static void svm_srso_vm_destroy(void) { } +#endif + static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1550,6 +1601,11 @@ static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(vcpu->kvm))) kvm_set_user_return_msr(tsc_aux_uret_slot, svm->tsc_aux, -1ull); + if (cpu_feature_enabled(X86_FEATURE_SRSO_BP_SPEC_REDUCE) && + !sd->bp_spec_reduce_set) { + sd->bp_spec_reduce_set = true; + msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_BP_SPEC_REDUCE_BIT); + } svm->guest_state_loaded = true; } @@ -2231,6 +2287,10 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) */ if (!sev_es_guest(vcpu->kvm)) { clear_page(svm->vmcb); +#ifdef CONFIG_KVM_SMM + if (is_smm(vcpu)) + kvm_smm_changed(vcpu, false); +#endif kvm_vcpu_reset(vcpu, true); } @@ -5036,6 +5096,8 @@ static void svm_vm_destroy(struct kvm *kvm) { avic_vm_destroy(kvm); sev_vm_destroy(kvm); + + svm_srso_vm_destroy(); } static int svm_vm_init(struct kvm *kvm) @@ -5061,6 +5123,7 @@ static int svm_vm_init(struct kvm *kvm) return ret; } + svm_srso_vm_init(); return 0; } diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index d4490eaed55d..f16b068c4228 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -335,6 +335,8 @@ struct svm_cpu_data { u32 next_asid; u32 min_asid; + bool bp_spec_reduce_set; + struct vmcb *save_area; unsigned long save_area_pa; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ccda95e53f62..ba736cbb0587 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -11,6 +11,13 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm +#ifdef CREATE_TRACE_POINTS +#define tracing_kvm_rip_read(vcpu) ({ \ + typeof(vcpu) __vcpu = vcpu; \ + __vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu); \ + }) +#endif + /* * Tracepoint for guest mode entry. */ @@ -28,7 +35,7 @@ TRACE_EVENT(kvm_entry, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->rip = kvm_rip_read(vcpu); + __entry->rip = tracing_kvm_rip_read(vcpu); __entry->immediate_exit = force_immediate_exit; kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info, @@ -319,7 +326,7 @@ TRACE_EVENT(name, \ ), \ \ TP_fast_assign( \ - __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->guest_rip = tracing_kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ __entry->requests = READ_ONCE(vcpu->requests); \ @@ -423,7 +430,7 @@ TRACE_EVENT(kvm_page_fault, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->guest_rip = kvm_rip_read(vcpu); + __entry->guest_rip = tracing_kvm_rip_read(vcpu); __entry->fault_address = fault_address; __entry->error_code = error_code; ), diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 51116fe69a50..d70e5b90087d 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -297,6 +297,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; @@ -335,21 +336,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - + !kvm_irq_is_postable(&irq)) continue; - } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; @@ -357,11 +345,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); + if (!set) + continue; + enable_remapped_mode = false; + + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", __func__); @@ -369,6 +358,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } } + if (enable_remapped_mode) + ret = irq_set_vcpu_affinity(host_irq, NULL); + ret = 0; out: srcu_read_unlock(&kvm->irq_srcu, idx); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3712dde0bf9d..9896fd574bfc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4597,7 +4597,7 @@ static bool kvm_is_vm_type_supported(unsigned long type) return type < 32 && (kvm_caps.supported_vm_types & BIT(type)); } -static inline u32 kvm_sync_valid_fields(struct kvm *kvm) +static inline u64 kvm_sync_valid_fields(struct kvm *kvm) { return kvm && kvm->arch.has_protected_state ? 0 : KVM_SYNC_X86_VALID_FIELDS; } @@ -11098,7 +11098,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * Profile KVM exit RIPs: */ - if (unlikely(prof_on == KVM_PROFILING)) { + if (unlikely(prof_on == KVM_PROFILING && + !vcpu->arch.guest_state_protected)) { unsigned long rip = kvm_rip_read(vcpu); profile_hit(KVM_PROFILING, (void *)rip); } @@ -11492,7 +11493,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { struct kvm_queued_exception *ex = &vcpu->arch.exception; struct kvm_run *kvm_run = vcpu->run; - u32 sync_valid_fields; + u64 sync_valid_fields; int r; r = kvm_mmu_post_init_vm(vcpu->kvm); @@ -13556,25 +13557,27 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -bool kvm_arch_has_irq_bypass(void) -{ - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; int ret; - irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); + + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) kvm_arch_end_assignment(irqfd->kvm); + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; } @@ -13584,9 +13587,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; /* * When producer of consumer is unregistered, we change back to @@ -13594,12 +13597,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. */ + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = NULL; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + spin_unlock_irq(&kvm->irqfds.lock); + + kvm_arch_end_assignment(irqfd->kvm); } @@ -13612,7 +13621,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index caedb3ef6688..f5dd84eb55dc 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -996,8 +996,8 @@ AVXcode: 4 83: Grp1 Ev,Ib (1A),(es) # CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, # CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ -84: CTESTSCC (ev) -85: CTESTSCC (es) | CTESTSCC (66),(es) +84: CTESTSCC Eb,Gb (ev) +85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es) 88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) 8f: POP2 Bq,Rq (000),(11B),(ev) a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a05fcddfc811..f7ae44d3dd9e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -360,7 +360,7 @@ static inline pgd_t *_pgd_alloc(struct mm_struct *mm) * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, pgd_allocation_order()); /* * Now PAE kernel is not running as a Xen domain. We can allocate @@ -380,7 +380,7 @@ static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index eb83348f9305..b6d6750e4bd1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -899,8 +899,9 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, cond_mitigation(tsk); /* - * Let nmi_uaccess_okay() and finish_asid_transition() - * know that CR3 is changing. + * Indicate that CR3 is about to change. nmi_uaccess_okay() + * and others are sensitive to the window where mm_cpumask(), + * CR3 and cpu_tlbstate.loaded_mm are not all in sync. */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); @@ -1204,8 +1205,16 @@ done: static bool should_flush_tlb(int cpu, void *data) { + struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); struct flush_tlb_info *info = data; + /* + * Order the 'loaded_mm' and 'is_lazy' against their + * write ordering in switch_mm_irqs_off(). Ensure + * 'is_lazy' is at least as new as 'loaded_mm'. + */ + smp_rmb(); + /* Lazy TLB will get flushed at the next context switch. */ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) return false; @@ -1214,8 +1223,15 @@ static bool should_flush_tlb(int cpu, void *data) if (!info->mm) return true; + /* + * While switching, the remote CPU could have state from + * either the prev or next mm. Assume the worst and flush. + */ + if (loaded_mm == LOADED_MM_SWITCHING) + return true; + /* The target mm is loaded, and the CPU is not lazy. */ - if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + if (loaded_mm == info->mm) return true; /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac57259a432b..a4b4ebd41b8f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -73,7 +73,7 @@ int __init efi_alloc_page_tables(void) gfp_t gfp_mask; gfp_mask = GFP_KERNEL | __GFP_ZERO; - efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, pgd_allocation_order()); if (!efi_pgd) goto fail; @@ -96,7 +96,7 @@ free_p4d: if (pgtable_l5_enabled()) free_page((unsigned long)pgd_page_vaddr(*pgd)); free_pgd: - free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER); + free_pages((unsigned long)efi_pgd, pgd_allocation_order()); fail: return -ENOMEM; } diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index ab5c8e47049c..9193a7790a71 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -31,8 +31,8 @@ struct faultinfo { #define ___backtrack_faulted(_faulted) \ asm volatile ( \ - "mov $0, %0\n" \ "movl $__get_kernel_nofault_faulted_%=,%1\n" \ + "mov $0, %0\n" \ "jmp _end_%=\n" \ "__get_kernel_nofault_faulted_%=:\n" \ "mov $1, %0;" \ diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index 26fb4835d3e9..61e4ca1e0ab5 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -31,8 +31,8 @@ struct faultinfo { #define ___backtrack_faulted(_faulted) \ asm volatile ( \ - "mov $0, %0\n" \ "movq $__get_kernel_nofault_faulted_%=,%1\n" \ + "mov $0, %0\n" \ "jmp _end_%=\n" \ "__get_kernel_nofault_faulted_%=:\n" \ "mov $1, %0;" \ |