diff options
Diffstat (limited to 'arch/x86/xen')
33 files changed, 1741 insertions, 906 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 85246dd9faa1..98d8a50d2aed 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,6 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the @@ -80,7 +81,6 @@ config XEN_PVH bool "Xen PVH guest support" depends on XEN && XEN_PVHVM && ACPI select PVH - def_bool n help Support for running as a Xen PVH guest. @@ -92,3 +92,12 @@ config XEN_DOM0 select X86_X2APIC if XEN_PVH && X86_64 help Support running as a Xen Dom0 guest. + +config XEN_PV_MSR_SAFE + bool "Always use safe MSR accesses in PV guests" + default y + depends on XEN_PV + help + Use safe (not faulting) MSR access functions even if the MSR access + should not fault anyway. + The default can be changed by using the "xen_msr_safe" boot parameter. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3c5b52fbe4a7..a9ec8c9f5c5d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -45,6 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_PV_DOM0) += vga.o +obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 62d34b6611c5..bb0f3f368446 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -10,8 +10,6 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> #include "xen-ops.h" -#include "pmu.h" -#include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { @@ -33,13 +31,7 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) return 0xfd; } -static u32 xen_set_apic_id(unsigned int x) -{ - WARN_ON(1); - return x; -} - -static unsigned int xen_get_apic_id(unsigned long x) +static u32 xen_get_apic_id(u32 x) { return ((x)>>24) & 0xFFu; } @@ -49,20 +41,20 @@ static u32 xen_apic_read(u32 reg) struct xen_platform_op op = { .cmd = XENPF_get_cpuinfo, .interface_version = XENPF_INTERFACE_VERSION, - .u.pcpu_info.xen_cpuid = 0, }; - int ret; - - /* Shouldn't need this as APIC is turned off for PV, and we only - * get called on the bootup processor. But just in case. */ - if (!xen_initial_domain() || smp_processor_id()) - return 0; + int ret, cpu; if (reg == APIC_LVR) return 0x14; if (reg != APIC_ID) return 0; + cpu = smp_processor_id(); + if (!xen_initial_domain()) + return cpu ? cpuid_to_apicid[cpu] << 24 : 0; + + op.u.pcpu_info.xen_cpuid = cpu; + ret = HYPERVISOR_platform_op(&op); if (ret) op.u.pcpu_info.apic_id = BAD_APICID; @@ -81,6 +73,11 @@ static void xen_apic_write(u32 reg, u32 val) WARN(1,"register: %x, value: %x\n", reg, val); } +static void xen_apic_eoi(void) +{ + WARN_ON_ONCE(1); +} + static u64 xen_apic_icr_read(void) { return 0; @@ -92,11 +89,6 @@ static void xen_apic_icr_write(u32 low, u32 id) WARN_ON(1); } -static u32 xen_safe_apic_wait_icr_idle(void) -{ - return 0; -} - static int xen_apic_probe_pv(void) { if (xen_pv_domain()) @@ -110,99 +102,47 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id) return xen_pv_domain(); } -static int xen_id_always_valid(u32 apicid) -{ - return 1; -} - -static int xen_id_always_registered(void) -{ - return 1; -} - -static int xen_phys_pkg_id(int initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - -static void xen_noop(void) -{ -} - -static void xen_silent_inquire(int apicid) -{ -} - -static int xen_cpu_present_to_apicid(int cpu) +static u32 xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) - return cpu_data(cpu).apicid; + return cpu_data(cpu).topo.apicid; else return BAD_APICID; } -static struct apic xen_pv_apic = { - .name = "Xen PV", - .probe = xen_apic_probe_pv, +static struct apic xen_pv_apic __ro_after_init = { + .name = "Xen PV", + .probe = xen_apic_probe_pv, .acpi_madt_oem_check = xen_madt_oem_check, - .apic_id_valid = xen_id_always_valid, - .apic_id_registered = xen_id_always_registered, /* .delivery_mode and .dest_mode_logical not used by XENPV */ .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ - .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ - .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ - .setup_apic_routing = NULL, .cpu_present_to_apicid = xen_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */ - .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */ - .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */ - .get_apic_id = xen_get_apic_id, - .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ + .max_apic_id = UINT_MAX, + .get_apic_id = xen_get_apic_id, .calc_dest_apicid = apic_flat_calc_apicid, #ifdef CONFIG_SMP - .send_IPI_mask = xen_send_IPI_mask, - .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, - .send_IPI_allbutself = xen_send_IPI_allbutself, - .send_IPI_all = xen_send_IPI_all, - .send_IPI_self = xen_send_IPI_self, + .send_IPI_mask = xen_send_IPI_mask, + .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_self = xen_send_IPI_self, #endif - /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */ - .inquire_remote_apic = xen_silent_inquire, - .read = xen_apic_read, .write = xen_apic_write, - .eoi_write = xen_apic_write, + .eoi = xen_apic_eoi, - .icr_read = xen_apic_icr_read, - .icr_write = xen_apic_icr_write, - .wait_icr_idle = xen_noop, - .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, }; +apic_driver(xen_pv_apic); -static void __init xen_apic_check(void) -{ - if (apic == &xen_pv_apic) - return; - - pr_info("Switched APIC routing from %s to %s.\n", apic->name, - xen_pv_apic.name); - apic = &xen_pv_apic; -} void __init xen_init_apic(void) { x86_apic_ops.io_apic_read = xen_io_apic_read; - /* On PV guests the APIC CPUID bit is disabled so none of the - * routines end up executing. */ - if (!xen_initial_domain()) - apic = &xen_pv_apic; - - x86_platform.apic_post_init = xen_apic_check; } -apic_driver(xen_pv_apic); diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 532410998684..b8c9f2a7d9b6 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -3,7 +3,7 @@ #include <linux/debugfs.h> #include <linux/slab.h> -#include "debugfs.h" +#include "xen-ops.h" static struct dentry *d_xen_debug; diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h deleted file mode 100644 index 6b813ad1091c..000000000000 --- a/arch/x86/xen/debugfs.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_DEBUGFS_H -#define _XEN_DEBUGFS_H - -struct dentry * __init xen_init_debugfs(void); - -#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 7d7ffb9c826a..7250d0e0e1a9 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -16,6 +16,8 @@ #include <asm/setup.h> #include <asm/xen/hypercall.h> +#include "xen-ops.h" + static efi_char16_t vendor[100] __initdata; static efi_system_table_t efi_systab_xen __initdata = { @@ -136,7 +138,7 @@ void __init xen_efi_init(struct boot_params *boot_params) if (efi_systab_xen == NULL) return; - strncpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen", + strscpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen", sizeof(boot_params->efi_info.efi_loader_signature)); boot_params->efi_info.efi_systab = (__u32)__pa(efi_systab_xen); boot_params->efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 30c6e986a6cd..53282dc7d5ac 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1,11 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG -#include <linux/memblock.h> -#endif #include <linux/console.h> #include <linux/cpu.h> +#include <linux/instrumentation.h> #include <linux/kexec.h> +#include <linux/memblock.h> #include <linux/slab.h> #include <linux/panic_notifier.h> @@ -22,20 +21,22 @@ #include <asm/setup.h> #include "xen-ops.h" -#include "smp.h" -#include "pmu.h" -EXPORT_SYMBOL_GPL(hypercall_page); +DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm); +EXPORT_STATIC_CALL_TRAMP(xen_hypercall); /* * Pointer to the xen_vcpu_info structure or * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info * but during boot it is switched to point to xen_vcpu_info. - * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events. + * The pointer is used in xen_evtchn_do_upcall to acknowledge pending events. + * Make sure that xen_vcpu_info doesn't cross a page boundary by making it + * cache-line aligned (the struct is guaranteed to have a size of 64 bytes, + * which matches the cache line size of 64-bit x86 processors). */ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); -DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ DEFINE_PER_CPU(uint32_t, xen_vcpu_id); @@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(xen_start_info); struct shared_info xen_dummy_shared_info; -__read_mostly int xen_have_vector_callback; +__read_mostly bool xen_have_vector_callback = true; EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* @@ -69,6 +70,65 @@ EXPORT_SYMBOL(xen_start_flags); */ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; +/* Number of pages released from the initial allocation. */ +unsigned long xen_released_pages; + +static __ref void xen_get_vendor(void) +{ + init_cpu_devs(); + cpu_detect(&boot_cpu_data); + get_cpu_vendor(&boot_cpu_data); +} + +void xen_hypercall_setfunc(void) +{ + if (static_call_query(xen_hypercall) != xen_hypercall_hvm) + return; + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + static_call_update(xen_hypercall, xen_hypercall_amd); + else + static_call_update(xen_hypercall, xen_hypercall_intel); +} + +/* + * Evaluate processor vendor in order to select the correct hypercall + * function for HVM/PVH guests. + * Might be called very early in boot before vendor has been set by + * early_cpu_init(). + */ +noinstr void *__xen_hypercall_setfunc(void) +{ + void (*func)(void); + + /* + * Note that __xen_hypercall_setfunc() is noinstr only due to a nasty + * dependency chain: it is being called via the xen_hypercall static + * call when running as a PVH or HVM guest. Hypercalls need to be + * noinstr due to PV guests using hypercalls in noinstr code. So we + * can safely tag the function body as "instrumentation ok", since + * the PV guest requirement is not of interest here (xen_get_vendor() + * calls noinstr functions, and static_call_update_early() might do + * so, too). + */ + instrumentation_begin(); + + xen_get_vendor(); + + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) + func = xen_hypercall_amd; + else + func = xen_hypercall_intel; + + static_call_update_early(xen_hypercall, func); + + instrumentation_end(); + + return func; +} + static int xen_cpu_up_online(unsigned int cpu) { xen_init_lock_cpu(cpu); @@ -160,6 +220,7 @@ void xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; + BUILD_BUG_ON(sizeof(*vcpup) > SMP_CACHE_BYTES); BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); /* @@ -346,3 +407,74 @@ void xen_arch_unregister_cpu(int num) } EXPORT_SYMBOL(xen_arch_unregister_cpu); #endif + +/* Amount of extra memory space we add to the e820 ranges */ +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; + +void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns) +{ + unsigned int i; + + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + /* Add new region. */ + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; + break; + } + /* Append to existing region. */ + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; + break; + } + } + if (i == XEN_EXTRA_MEM_MAX_REGIONS) + printk(KERN_WARNING "Warning: not enough extra memory regions\n"); + + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); +} + +#ifdef CONFIG_XEN_UNPOPULATED_ALLOC +int __init arch_xen_unpopulated_init(struct resource **res) +{ + unsigned int i; + + if (!xen_domain()) + return -ENODEV; + + /* Must be set strictly before calling xen_free_unpopulated_pages(). */ + *res = &iomem_resource; + + /* + * Initialize with pages from the extra memory regions (see + * arch/x86/xen/setup.c). + */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + unsigned int j; + + for (j = 0; j < xen_extra_mem[i].n_pfns; j++) { + struct page *pg = + pfn_to_page(xen_extra_mem[i].start_pfn + j); + + xen_free_unpopulated_pages(1, &pg); + } + + /* + * Account for the region being in the physmap but unpopulated. + * The value in xen_released_pages is used by the balloon + * driver to know how much of the physmap is unpopulated and + * set an accurate initial memory target. + */ + xen_released_pages += xen_extra_mem[i].n_pfns; + /* Zero so region is not also added to the balloon driver. */ + xen_extra_mem[i].n_pfns = 0; + } + + return 0; +} +#endif diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 8b71b1dd7639..fe57ff85d004 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -4,9 +4,12 @@ #include <linux/cpu.h> #include <linux/kexec.h> #include <linux/memblock.h> +#include <linux/virtio_anchor.h> #include <xen/features.h> #include <xen/events.h> +#include <xen/hvm.h> +#include <xen/interface/hvm/hvm_op.h> #include <xen/interface/memory.h> #include <asm/apic.h> @@ -25,11 +28,12 @@ #include <asm/xen/page.h> #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" static unsigned long shared_info_pfn; +__ro_after_init bool xen_percpu_upcall; +EXPORT_SYMBOL_GPL(xen_percpu_upcall); + void xen_hvm_init_shared_info(void) { struct xen_add_to_physmap xatp; @@ -102,15 +106,8 @@ static void __init init_hvm_pv_info(void) /* PVH set up hypercall page in xen_prepare_pvh(). */ if (xen_pvh_domain()) pv_info.name = "Xen PVH"; - else { - u64 pfn; - uint32_t msr; - + else pv_info.name = "Xen HVM"; - msr = cpuid_ebx(base + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - } xen_setup_features(); @@ -125,9 +122,12 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback) { struct pt_regs *old_regs = set_irq_regs(regs); + if (xen_percpu_upcall) + apic_eoi(); + inc_irq_stat(irq_hv_callback_count); - xen_hvm_evtchn_do_upcall(); + xen_evtchn_do_upcall(); set_irq_regs(old_regs); } @@ -139,7 +139,9 @@ static void xen_hvm_shutdown(void) if (kexec_in_progress) xen_reboot(SHUTDOWN_soft_reset); } +#endif +#ifdef CONFIG_CRASH_DUMP static void xen_hvm_crash_shutdown(struct pt_regs *regs) { native_machine_crash_shutdown(regs); @@ -152,15 +154,14 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) int rc = 0; /* - * This can happen if CPU was offlined earlier and - * offlining timed out in common_cpu_die(). + * If a CPU was offlined earlier and offlining timed out then the + * lock mechanism is still initialized. Uninit it unconditionally + * as it's safe to call even if already uninited. Interrupts and + * timer have already been handled in xen_cpu_dead_hvm(). */ - if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - } + xen_uninit_lock_cpu(cpu); - if (cpu_acpi_id(cpu) != U32_MAX) + if (cpu_acpi_id(cpu) != CPU_ACPIID_INVALID) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; @@ -168,6 +169,15 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) if (!xen_have_vector_callback) return 0; + if (xen_percpu_upcall) { + rc = xen_set_upcall_vector(cpu); + if (rc) { + WARN(1, "HVMOP_set_evtchn_upcall_vector" + " for CPU %d failed: %d\n", cpu, rc); + return rc; + } + } + if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); @@ -188,14 +198,13 @@ static int xen_cpu_dead_hvm(unsigned int cpu) return 0; } -static bool no_vector_callback __initdata; - static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) return; - xen_set_restricted_virtio_memory_access(); + if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) + virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc); init_hvm_pv_info(); @@ -211,9 +220,6 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); - if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector)) - xen_have_vector_callback = 1; - xen_hvm_smp_init(); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); xen_unplug_emulated_devices(); @@ -223,6 +229,8 @@ static void __init xen_hvm_guest_init(void) #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; +#endif +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = xen_hvm_crash_shutdown; #endif } @@ -239,7 +247,7 @@ early_param("xen_nopv", xen_parse_nopv); static __init int xen_parse_no_vector_callback(char *arg) { - no_vector_callback = true; + xen_have_vector_callback = false; return 0; } early_param("xen_no_vector_callback", xen_parse_no_vector_callback); @@ -285,6 +293,10 @@ static uint32_t __init xen_platform_hvm(void) if (xen_pv_domain()) return 0; + /* Set correct hypercall function. */ + if (xen_domain) + xen_hypercall_setfunc(); + if (xen_pvh_domain() && nopv) { /* Guest booting via the Xen-PVH boot entry goes here */ pr_info("\"nopv\" parameter is ignored in PVH guest\n"); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 70fb2ea85e90..26bbaf4b7330 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -23,6 +23,7 @@ #include <linux/start_kernel.h> #include <linux/sched.h> #include <linux/kprobes.h> +#include <linux/kstrtox.h> #include <linux/memblock.h> #include <linux/export.h> #include <linux/mm.h> @@ -31,6 +32,8 @@ #include <linux/gfp.h> #include <linux/edd.h> #include <linux/reboot.h> +#include <linux/virtio_anchor.h> +#include <linux/stackprotector.h> #include <xen/xen.h> #include <xen/events.h> @@ -46,6 +49,7 @@ #include <xen/hvc-console.h> #include <xen/acpi.h> +#include <asm/cpuid/api.h> #include <asm/paravirt.h> #include <asm/apic.h> #include <asm/page.h> @@ -57,18 +61,20 @@ #include <asm/processor.h> #include <asm/proto.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/traps.h> #include <asm/setup.h> #include <asm/desc.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/reboot.h> -#include <asm/stackprotector.h> #include <asm/hypervisor.h> #include <asm/mach_traps.h> +#include <asm/mtrr.h> #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/cpu.h> +#include <asm/irq_stack.h> #ifdef CONFIG_X86_IOPL_IOPERM #include <asm/io_bitmap.h> #endif @@ -76,16 +82,12 @@ #ifdef CONFIG_ACPI #include <linux/acpi.h> #include <asm/acpi.h> -#include <acpi/pdc_intel.h> +#include <acpi/proc_cap_intel.h> #include <acpi/processor.h> #include <xen/interface/platform.h> #endif #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" -#include "multicalls.h" -#include "pmu.h" #include "../kernel/cpu/cpu.h" /* get_cpu_cap() */ @@ -94,10 +96,58 @@ void *xen_initial_gdt; static int xen_cpu_up_prepare_pv(unsigned int cpu); static int xen_cpu_dead_pv(unsigned int cpu); +#ifndef CONFIG_PREEMPTION +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. + */ +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +/* + * In case of scheduling the flag must be cleared and restored after + * returning from schedule as the task might move to a different CPU. + */ +static __always_inline bool get_and_clear_inhcall(void) +{ + bool inhcall = __this_cpu_read(xen_in_preemptible_hcall); + + __this_cpu_write(xen_in_preemptible_hcall, false); + return inhcall; +} + +static __always_inline void restore_inhcall(bool inhcall) +{ + __this_cpu_write(xen_in_preemptible_hcall, inhcall); +} + +#else + +static __always_inline bool get_and_clear_inhcall(void) { return false; } +static __always_inline void restore_inhcall(bool inhcall) { } + +#endif + struct tls_descs { struct desc_struct desc[3]; }; +DEFINE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode) = XEN_LAZY_NONE; + +enum xen_lazy_mode xen_get_lazy_mode(void) +{ + if (in_interrupt()) + return XEN_LAZY_NONE; + + return this_cpu_read(xen_lazy_mode); +} + /* * Updating the 3 TLS descriptors in the GDT on every task switch is * surprisingly expensive so we avoid updating them if they haven't @@ -107,9 +157,69 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE); + +static int __init parse_xen_msr_safe(char *str) +{ + if (str) + return kstrtobool(str, &xen_msr_safe); + return -EINVAL; +} +early_param("xen_msr_safe", parse_xen_msr_safe); + +/* Get MTRR settings from Xen and put them into mtrr_state. */ +static void __init xen_set_mtrr_data(void) +{ +#ifdef CONFIG_MTRR + struct xen_platform_op op = { + .cmd = XENPF_read_memtype, + .interface_version = XENPF_INTERFACE_VERSION, + }; + unsigned int reg; + unsigned long mask; + uint32_t eax, width; + static struct mtrr_var_range var[MTRR_MAX_VAR_RANGES] __initdata; + + /* Get physical address width (only 64-bit cpus supported). */ + width = 36; + eax = cpuid_eax(0x80000000); + if ((eax >> 16) == 0x8000 && eax >= 0x80000008) { + eax = cpuid_eax(0x80000008); + width = eax & 0xff; + } + + for (reg = 0; reg < MTRR_MAX_VAR_RANGES; reg++) { + op.u.read_memtype.reg = reg; + if (HYPERVISOR_platform_op(&op)) + break; + + /* + * Only called in dom0, which has all RAM PFNs mapped at + * RAM MFNs, and all PCI space etc. is identity mapped. + * This means we can treat MFN == PFN regarding MTRR settings. + */ + var[reg].base_lo = op.u.read_memtype.type; + var[reg].base_lo |= op.u.read_memtype.mfn << PAGE_SHIFT; + var[reg].base_hi = op.u.read_memtype.mfn >> (32 - PAGE_SHIFT); + mask = ~((op.u.read_memtype.nr_mfns << PAGE_SHIFT) - 1); + mask &= (1UL << width) - 1; + if (mask) + mask |= MTRR_PHYSMASK_V; + var[reg].mask_lo = mask; + var[reg].mask_hi = mask >> 32; + } + + /* Only overwrite MTRR state if any MTRR could be got from Xen. */ + if (reg) + guest_force_mtrr_state(var, reg, MTRR_TYPE_UNCACHABLE); +#endif +} + static void __init xen_pv_init_platform(void) { - xen_set_restricted_virtio_memory_access(); + /* PV guests can't operate virtio devices without grants. */ + if (IS_ENABLED(CONFIG_XEN_VIRTIO)) + virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc); populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); @@ -121,6 +231,14 @@ static void __init xen_pv_init_platform(void) /* pvclock is in shared info area */ xen_init_time_ops(); + + if (xen_initial_domain()) + xen_set_mtrr_data(); + else + guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK); + + /* Adjust nr_cpu_ids before "enumeration" happens */ + xen_smp_count_cpus(); } static void __init xen_pv_guest_late_init(void) @@ -137,14 +255,22 @@ static __read_mostly unsigned int cpuid_leaf5_edx_val; static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { - unsigned maskebx = ~0; + unsigned int maskebx = ~0; + unsigned int or_ebx = 0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ switch (*ax) { - case CPUID_MWAIT_LEAF: + case 0x1: + /* Replace initial APIC ID in bits 24-31 of EBX. */ + /* See xen_pv_smp_config() for related topology preparations. */ + maskebx = 0x00ffffff; + or_ebx = smp_processor_id() << 24; + break; + + case CPUID_LEAF_MWAIT: /* Synthesize the values.. */ *ax = 0; *bx = 0; @@ -166,6 +292,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, : "0" (*ax), "2" (*cx)); *bx &= maskebx; + *bx |= or_ebx; } static bool __init xen_check_mwait(void) @@ -213,24 +340,24 @@ static bool __init xen_check_mwait(void) * ecx and edx. The hypercall provides only partial information. */ - ax = CPUID_MWAIT_LEAF; + ax = CPUID_LEAF_MWAIT; bx = 0; cx = 0; dx = 0; native_cpuid(&ax, &bx, &cx, &dx); - /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + /* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so, * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. */ buf[0] = ACPI_PDC_REVISION_ID; buf[1] = 1; - buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP); set_xen_guest_handle(op.u.set_pminfo.pdc, buf); if ((HYPERVISOR_platform_op(&op) == 0) && - (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + (buf[2] & (ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH))) { cpuid_leaf5_ecx_val = cx; cpuid_leaf5_edx_val = dx; } @@ -262,6 +389,7 @@ static void __init xen_init_capabilities(void) setup_clear_cpu_cap(X86_FEATURE_ACC); setup_clear_cpu_cap(X86_FEATURE_X2APIC); setup_clear_cpu_cap(X86_FEATURE_SME); + setup_clear_cpu_cap(X86_FEATURE_LKGS); /* * Xen PV would need some work to support PCID: CR3 handling as well @@ -293,10 +421,25 @@ static noinstr unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } +static void xen_start_context_switch(struct task_struct *prev) +{ + BUG_ON(preemptible()); + + if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } + enter_lazy(XEN_LAZY_CPU); +} + static void xen_end_context_switch(struct task_struct *next) { + BUG_ON(preemptible()); + xen_mc_flush(); - paravirt_end_context_switch(next); + leave_lazy(XEN_LAZY_CPU); + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } static unsigned long xen_store_tr(void) @@ -403,7 +546,7 @@ static void xen_set_ldt(const void *addr, unsigned entries) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_load_gdt(const struct desc_ptr *dtr) @@ -454,7 +597,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG_ON(size > PAGE_SIZE); BUG_ON(va & ~PAGE_MASK); - pfn = virt_to_pfn(va); + pfn = virt_to_pfn((void *)va); mfn = pfn_to_mfn(pfn); pte = pfn_pte(pfn, PAGE_KERNEL_RO); @@ -499,7 +642,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * exception between the new %fs descriptor being loaded and * %fs being effectively cleared at __switch_to(). */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (xen_get_lazy_mode() == XEN_LAZY_CPU) loadsegment(fs, 0); xen_mc_batch(); @@ -508,7 +651,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) load_TLS_descriptor(t, cpu, 1); load_TLS_descriptor(t, cpu, 2); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_load_gs_index(unsigned int idx) @@ -583,6 +726,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check) } #endif +static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + inc_irq_stat(irq_hv_callback_count); + + xen_evtchn_do_upcall(); + + set_irq_regs(old_regs); +} + +__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + bool inhcall; + + instrumentation_begin(); + run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs); + + inhcall = get_and_clear_inhcall(); + if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) { + irqentry_exit_cond_resched(); + instrumentation_end(); + restore_inhcall(inhcall); + } else { + instrumentation_end(); + irqentry_exit(regs, state); + } +} + struct trap_array_entry { void (*orig)(void); void (*xen)(void); @@ -609,7 +782,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION - { entry_INT80_compat, xen_entry_INT80_compat, false }, + TRAP_ENTRY(int80_emulation, false ), #endif TRAP_ENTRY(exc_page_fault, false ), TRAP_ENTRY(exc_divide_error, false ), @@ -625,7 +798,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_coprocessor_error, false ), TRAP_ENTRY(exc_alignment_check, false ), TRAP_ENTRY(exc_simd_coprocessor_error, false ), -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET TRAP_ENTRY(exc_control_protection, false ), #endif }; @@ -762,6 +935,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + static const struct trap_info zero = { }; unsigned out; trace_xen_cpu_load_idt(desc); @@ -771,7 +945,7 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); out = xen_convert_trap_info(desc, traps, false); - memset(&traps[out], 0, sizeof(traps[0])); + traps[out] = zero; xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) @@ -839,7 +1013,7 @@ static void xen_load_sp0(unsigned long sp0) mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } @@ -903,7 +1077,7 @@ static void xen_write_cr0(unsigned long cr0) MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_write_cr4(unsigned long cr4) @@ -913,39 +1087,54 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } -static u64 xen_read_msr_safe(unsigned int msr, int *err) +static u64 xen_do_read_msr(u32 msr, int *err) { - u64 val; + u64 val = 0; /* Avoid uninitialized value for safe variant. */ - if (pmu_msr_read(msr, &val, err)) + if (pmu_msr_chk_emulated(msr, &val, true)) return val; - val = native_read_msr_safe(msr, err); + if (err) + *err = native_read_msr_safe(msr, &val); + else + val = native_read_msr(msr); + switch (msr) { case MSR_IA32_APICBASE: val &= ~X2APIC_ENABLE; + if (smp_processor_id() == 0) + val |= MSR_IA32_APICBASE_BSP; + else + val &= ~MSR_IA32_APICBASE_BSP; break; } return val; } -static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +static void set_seg(u32 which, u64 base) { - int ret; - unsigned int which; - u64 base; - - ret = 0; + if (HYPERVISOR_set_segment_base(which, base)) + WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base); +} +/* + * Support write_msr_safe() and write_msr() semantics. + * With err == NULL write_msr() semantics are selected. + * Supplying an err pointer requires err to be pre-initialized with 0. + */ +static void xen_do_write_msr(u32 msr, u64 val, int *err) +{ switch (msr) { - case MSR_FS_BASE: which = SEGBASE_FS; goto set; - case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; - case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; - - set: - base = ((u64)high << 32) | low; - if (HYPERVISOR_set_segment_base(which, base) != 0) - ret = -EIO; + case MSR_FS_BASE: + set_seg(SEGBASE_FS, val); + break; + + case MSR_KERNEL_GS_BASE: + set_seg(SEGBASE_GS_USER, val); + break; + + case MSR_GS_BASE: + set_seg(SEGBASE_GS_KERNEL, val); break; case MSR_STAR: @@ -961,31 +1150,45 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) break; default: - if (!pmu_msr_write(msr, low, high, &ret)) - ret = native_write_msr_safe(msr, low, high); + if (pmu_msr_chk_emulated(msr, &val, false)) + return; + + if (err) + *err = native_write_msr_safe(msr, val); + else + native_write_msr(msr, val); } +} - return ret; +static int xen_read_msr_safe(u32 msr, u64 *val) +{ + int err = 0; + + *val = xen_do_read_msr(msr, &err); + return err; } -static u64 xen_read_msr(unsigned int msr) +static int xen_write_msr_safe(u32 msr, u64 val) { - /* - * This will silently swallow a #GP from RDMSR. It may be worth - * changing that. - */ - int err; + int err = 0; + + xen_do_write_msr(msr, val, &err); - return xen_read_msr_safe(msr, &err); + return err; } -static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) +static u64 xen_read_msr(u32 msr) { - /* - * This will silently swallow a #GP from WRMSR. It may be worth - * changing that. - */ - xen_write_msr_safe(msr, low, high); + int err = 0; + + return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL); +} + +static void xen_write_msr(u32 msr, u64 val) +{ + int err; + + xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL); } /* This is called once we have the cpu_possible_mask */ @@ -1022,8 +1225,6 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { .write_cr4 = xen_write_cr4, - .wbinvd = native_wbinvd, - .read_msr = xen_read_msr, .write_msr = xen_write_msr, @@ -1055,7 +1256,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { #endif .io_delay = xen_io_delay, - .start_context_switch = paravirt_start_context_switch, + .start_context_switch = xen_start_context_switch, .end_context_switch = xen_end_context_switch, }, }; @@ -1164,7 +1365,7 @@ static void __init xen_setup_gdt(int cpu) pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; pv_ops.cpu.load_gdt = xen_load_gdt_boot; - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; pv_ops.cpu.load_gdt = xen_load_gdt; @@ -1202,6 +1403,9 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_domain_type = XEN_PV_DOMAIN; xen_start_flags = xen_start_info->flags; + /* Interrupts are guaranteed to be off initially. */ + early_boot_irqs_disabled = true; + static_call_update_early(xen_hypercall, xen_hypercall_pv); xen_setup_features(); @@ -1220,10 +1424,12 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_vcpu_info_reset(0); x86_platform.get_nmi_reason = xen_get_nmi_reason; + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; x86_init.resources.memory_setup = xen_memory_setup; x86_init.irqs.intr_mode_select = x86_init_noop; - x86_init.irqs.intr_mode_init = x86_init_noop; + x86_init.irqs.intr_mode_init = x86_64_probe_apic; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; x86_init.hyper.init_platform = xen_pv_init_platform; @@ -1263,12 +1469,10 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_init_capabilities(); -#ifdef CONFIG_X86_LOCAL_APIC /* * set up the basic apic ops. */ xen_init_apic(); -#endif machine_ops = xen_machine_ops; @@ -1292,7 +1496,6 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv)); local_irq_disable(); - early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, @@ -1341,7 +1544,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) x86_platform.set_legacy_features = xen_dom0_set_legacy_features; - xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_init_vga(info, xen_start_info->console.dom0.info_size, + &boot_params.screen_info); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index bcae606bbc5c..9d25d9373945 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,12 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> +#include <linux/cpufreq.h> +#include <linux/cpuidle.h> #include <linux/export.h> +#include <linux/mm.h> #include <xen/hvc-console.h> +#include <xen/acpi.h> +#include <asm/bootparam.h> #include <asm/io_apic.h> #include <asm/hypervisor.h> #include <asm/e820/api.h> +#include <asm/setup.h> #include <xen/xen.h> #include <asm/xen/interface.h> @@ -25,24 +31,142 @@ bool __ro_after_init xen_pvh; EXPORT_SYMBOL_GPL(xen_pvh); -void __init xen_pvh_init(struct boot_params *boot_params) +#ifdef CONFIG_XEN_DOM0 +int xen_pvh_setup_gsi(int gsi, int trigger, int polarity) +{ + int ret; + struct physdev_setup_gsi setup_gsi; + + setup_gsi.gsi = gsi; + setup_gsi.triggering = (trigger == ACPI_EDGE_SENSITIVE ? 0 : 1); + setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + + ret = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); + if (ret == -EEXIST) { + xen_raw_printk("Already setup the GSI :%d\n", gsi); + ret = 0; + } else if (ret) + xen_raw_printk("Fail to setup GSI (%d)!\n", gsi); + + return ret; +} +EXPORT_SYMBOL_GPL(xen_pvh_setup_gsi); +#endif + +/* + * Reserve e820 UNUSABLE regions to inflate the memory balloon. + * + * On PVH dom0 the host memory map is used, RAM regions available to dom0 are + * located as the same place as in the native memory map, but since dom0 gets + * less memory than the total amount of host RAM the ranges that can't be + * populated are converted from RAM -> UNUSABLE. Use such regions (up to the + * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at + * boot. Doing so prevents the guest (even if just temporary) from using holes + * in the memory map in order to map grants or foreign addresses, and + * hopefully limits the risk of a clash with a device MMIO region. Ideally the + * hypervisor should notify us which memory ranges are suitable for creating + * foreign mappings, but that's not yet implemented. + */ +static void __init pvh_reserve_extra_memory(void) { - u32 msr; - u64 pfn; + struct boot_params *bootp = &boot_params; + unsigned int i, ram_pages = 0, extra_pages; + + for (i = 0; i < bootp->e820_entries; i++) { + struct boot_e820_entry *e = &bootp->e820_table[i]; + + if (e->type != E820_TYPE_RAM) + continue; + ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr); + } + + /* Max amount of extra memory. */ + extra_pages = EXTRA_MEM_RATIO * ram_pages; + + /* + * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping + * purposes. + */ + for (i = 0; i < bootp->e820_entries && extra_pages; i++) { + struct boot_e820_entry *e = &bootp->e820_table[i]; + unsigned long pages; + + if (e->type != E820_TYPE_UNUSABLE) + continue; + + pages = min(extra_pages, + PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr)); + + if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) { + struct boot_e820_entry *next; + if (bootp->e820_entries == + ARRAY_SIZE(bootp->e820_table)) + /* No space left to split - skip region. */ + continue; + + /* Split entry. */ + next = e + 1; + memmove(next, e, + (bootp->e820_entries - i) * sizeof(*e)); + bootp->e820_entries++; + next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages); + e->size = next->addr - e->addr; + next->size -= e->size; + } + e->type = E820_TYPE_RAM; + extra_pages -= pages; + + xen_add_extra_mem(PFN_UP(e->addr), pages); + } +} + +static void __init pvh_arch_setup(void) +{ + pvh_reserve_extra_memory(); + + if (xen_initial_domain()) { + xen_add_preferred_consoles(); + + /* + * Disable usage of CPU idle and frequency drivers: when + * running as hardware domain the exposed native ACPI tables + * causes idle and/or frequency drivers to attach and + * malfunction. It's Xen the entity that controls the idle and + * frequency states. + * + * For unprivileged domains the exposed ACPI tables are + * fabricated and don't contain such data. + */ + disable_cpuidle(); + disable_cpufreq(); + WARN_ON(xen_set_default_idle()); + } +} + +void __init xen_pvh_init(struct boot_params *boot_params) +{ xen_pvh = 1; xen_domain_type = XEN_HVM_DOMAIN; xen_start_flags = pvh_start_info.flags; - msr = cpuid_ebx(xen_cpuid_base() + 2); - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - if (xen_initial_domain()) - x86_init.oem.arch_setup = xen_add_preferred_consoles; + x86_init.oem.arch_setup = pvh_arch_setup; x86_init.oem.banner = xen_banner; xen_efi_init(boot_params); + + if (xen_initial_domain()) { + struct xen_platform_op op = { + .cmd = XENPF_get_dom0_console, + }; + int ret = HYPERVISOR_platform_op(&op); + + if (ret > 0) + xen_init_vga(&op.u.dom0_console, + min(ret * sizeof(char), + sizeof(op.u.dom0_console)), + &boot_params->screen_info); + } } void __init mem_map_via_hcall(struct boot_params *boot_params_p) diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 06c3c2fb4b06..39982f955cfe 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -24,7 +24,7 @@ noinstr void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -static void xen_safe_halt(void) +static noinstr void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) @@ -45,7 +45,7 @@ static const typeof(pv_ops) xen_irq_ops __initconst = { /* Initial interrupt flag handling only called while interrupts off. */ .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0), .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop), - .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG), + .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func), .safe_halt = xen_safe_halt, .halt = xen_halt, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 60e9c37fd79f..c4c479373249 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -5,8 +5,7 @@ #include <asm/xen/hypercall.h> #include <xen/interface/memory.h> -#include "multicalls.h" -#include "mmu.h" +#include "xen-ops.h" unsigned long arbitrary_virt_to_mfn(void *vaddr) { diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h deleted file mode 100644 index 6e4c6bd62203..000000000000 --- a/arch/x86/xen/mmu.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_MMU_H - -#include <linux/linkage.h> -#include <asm/page.h> - -enum pt_level { - PT_PGD, - PT_P4D, - PT_PUD, - PT_PMD, - PT_PTE -}; - - -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); - -void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); -void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte); - -unsigned long xen_read_cr2_direct(void); - -extern void xen_init_mmu_ops(void); -extern void xen_hvm_init_mmu_ops(void); -#endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index 509bdee3ab90..337955652202 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -5,7 +5,7 @@ #include <xen/interface/xen.h> #include <xen/hvm.h> -#include "mmu.h" +#include "xen-ops.h" #ifdef CONFIG_PROC_VMCORE /* diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index ee29fb558f2e..2a4a8deaf612 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -34,7 +34,7 @@ * would need to validate the whole pagetable before going on. * Naturally, this is quite slow. The solution is to "pin" a * pagetable, which enforces all the constraints on the pagetable even - * when it is not actively in use. This menas that Xen can be assured + * when it is not actively in use. This means that Xen can be assured * that it is still valid when you do load it into %cr3, and doesn't * need to revalidate it. * @@ -82,9 +82,23 @@ #include <xen/hvc-console.h> #include <xen/swiotlb-xen.h> -#include "multicalls.h" -#include "mmu.h" -#include "debugfs.h" +#include "xen-ops.h" + +/* + * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order + * to avoid warnings with "-Wmissing-prototypes". + */ +pteval_t xen_pte_val(pte_t pte); +pgdval_t xen_pgd_val(pgd_t pgd); +pmdval_t xen_pmd_val(pmd_t pmd); +pudval_t xen_pud_val(pud_t pud); +p4dval_t xen_p4d_val(p4d_t p4d); +pte_t xen_make_pte(pteval_t pte); +pgd_t xen_make_pgd(pgdval_t pgd); +pmd_t xen_make_pmd(pmdval_t pmd); +pud_t xen_make_pud(pudval_t pud); +p4d_t xen_make_p4d(p4dval_t p4d); +pte_t xen_make_pte_init(pteval_t pte); #ifdef CONFIG_X86_VSYSCALL_EMULATION /* l3 pud for userspace vsyscall mapping */ @@ -97,6 +111,51 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; */ static DEFINE_SPINLOCK(xen_reservation_lock); +/* Protected by xen_reservation_lock. */ +#define MIN_CONTIG_ORDER 9 /* 2MB */ +static unsigned int discontig_frames_order = MIN_CONTIG_ORDER; +static unsigned long discontig_frames_early[1UL << MIN_CONTIG_ORDER] __initdata; +static unsigned long *discontig_frames __refdata = discontig_frames_early; +static bool discontig_frames_dyn; + +static int alloc_discontig_frames(unsigned int order) +{ + unsigned long *new_array, *old_array; + unsigned int old_order; + unsigned long flags; + + BUG_ON(order < MIN_CONTIG_ORDER); + BUILD_BUG_ON(sizeof(discontig_frames_early) != PAGE_SIZE); + + new_array = (unsigned long *)__get_free_pages(GFP_KERNEL, + order - MIN_CONTIG_ORDER); + if (!new_array) + return -ENOMEM; + + spin_lock_irqsave(&xen_reservation_lock, flags); + + old_order = discontig_frames_order; + + if (order > discontig_frames_order || !discontig_frames_dyn) { + if (!discontig_frames_dyn) + old_array = NULL; + else + old_array = discontig_frames; + + discontig_frames = new_array; + discontig_frames_order = order; + discontig_frames_dyn = true; + } else { + old_array = new_array; + } + + spin_unlock_irqrestore(&xen_reservation_lock, flags); + + free_pages((unsigned long)old_array, old_order - MIN_CONTIG_ORDER); + + return 0; +} + /* * Note about cr3 (pagetable base) values: * @@ -112,7 +171,7 @@ static DEFINE_SPINLOCK(xen_reservation_lock); * looking at another vcpu's cr3 value, it should use this variable. */ DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ static phys_addr_t xen_pt_base, xen_pt_size __initdata; @@ -150,7 +209,7 @@ void make_lowmem_page_readwrite(void *vaddr) if (pte == NULL) return; /* vaddr missing */ - ptev = pte_mkwrite(*pte); + ptev = pte_mkwrite_novma(*pte); if (HYPERVISOR_update_va_mapping(address, ptev, 0)) BUG(); @@ -220,7 +279,7 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -254,7 +313,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) { struct mmu_update u; - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) + if (xen_get_lazy_mode() != XEN_LAZY_MMU) return false; xen_mc_batch(); @@ -263,7 +322,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) u.val = pte_val_ma(pteval); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); return true; } @@ -289,16 +348,17 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) __xen_set_pte(ptep, pteval); } -pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. We preserve the bits on commit */ trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); return *ptep; } -void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte) +static void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, + pte_t *ptep, pte_t pte) { struct mmu_update u; @@ -309,7 +369,7 @@ void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } /* Assume pteval_t is equivalent to all the other *val_t types. */ @@ -403,7 +463,7 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -483,7 +543,7 @@ static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) __xen_set_p4d_hyper(ptr, val); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -515,10 +575,9 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) if (user_ptr) __xen_set_p4d_hyper((p4d_t *)user_ptr, val); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } -#if CONFIG_PGTABLE_LEVELS >= 5 __visible p4dval_t xen_p4d_val(p4d_t p4d) { return pte_mfn_to_pfn(p4d.p4d); @@ -532,7 +591,6 @@ __visible p4d_t xen_make_p4d(p4dval_t p4d) return native_make_p4d(p4d); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d); -#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, void (*func)(struct mm_struct *mm, struct page *, @@ -650,8 +708,8 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; -#if USE_SPLIT_PTE_PTLOCKS - ptl = ptlock_ptr(page); +#if defined(CONFIG_SPLIT_PTE_PTLOCKS) + ptl = ptlock_ptr(page_ptdesc(page)); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -766,6 +824,7 @@ void xen_mm_pin_all(void) { struct page *page; + spin_lock(&init_mm.page_table_lock); spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { @@ -776,6 +835,7 @@ void xen_mm_pin_all(void) } spin_unlock(&pgd_lock); + spin_unlock(&init_mm.page_table_lock); } static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page, @@ -797,6 +857,9 @@ static void __init xen_after_bootmem(void) SetPagePinned(virt_to_page(level3_user_vsyscall)); #endif xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); + + if (alloc_discontig_frames(MIN_CONTIG_ORDER)) + BUG(); } static void xen_unpin_page(struct mm_struct *mm, struct page *page, @@ -872,6 +935,7 @@ void xen_mm_unpin_all(void) { struct page *page; + spin_lock(&init_mm.page_table_lock); spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { @@ -883,16 +947,10 @@ void xen_mm_unpin_all(void) } spin_unlock(&pgd_lock); + spin_unlock(&init_mm.page_table_lock); } -static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) -{ - spin_lock(&next->page_table_lock); - xen_pgd_pin(next); - spin_unlock(&next->page_table_lock); -} - -static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static void xen_enter_mmap(struct mm_struct *mm) { spin_lock(&mm->page_table_lock); xen_pgd_pin(mm); @@ -904,7 +962,7 @@ static void drop_mm_ref_this_cpu(void *info) struct mm_struct *mm = info; if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) - leave_mm(smp_processor_id()); + leave_mm(); /* * If this cpu still has a stale cr3 reference, then make sure @@ -1050,7 +1108,7 @@ static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) pte_t *pte_tbl; int i; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, PMD_SIZE); return; @@ -1073,7 +1131,7 @@ static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) pmd_t *pmd_tbl; int i; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, PUD_SIZE); return; @@ -1095,7 +1153,7 @@ static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) pud_t *pud_tbl; int i; - if (p4d_large(*p4d)) { + if (p4d_leaf(*p4d)) { pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, P4D_SIZE); return; @@ -1236,7 +1294,7 @@ static noinline void xen_flush_tlb(void) op->cmd = MMUEXT_TLB_FLUSH_LOCAL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -1256,7 +1314,7 @@ static void xen_flush_tlb_one_user(unsigned long addr) op->arg1.linear_addr = addr & PAGE_MASK; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -1293,7 +1351,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus, MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } static unsigned long xen_read_cr3(void) @@ -1352,7 +1410,7 @@ static void xen_write_cr3(unsigned long cr3) else __xen_write_cr3(false, 0); - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ + xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */ } /* @@ -1387,7 +1445,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) __xen_write_cr3(true, cr3); - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ + xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */ } static int xen_pgd_alloc(struct mm_struct *mm) @@ -1545,10 +1603,11 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, __set_pfn_prot(pfn, PAGE_KERNEL_RO); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) && + !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } } @@ -1573,12 +1632,12 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) if (pinned) { xen_mc_batch(); - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) + if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS)) __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); __set_pfn_prot(pfn, PAGE_KERNEL); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); ClearPagePinned(page); } @@ -1795,7 +1854,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) */ xen_mc_batch(); __xen_write_cr3(true, __pa(init_top_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); /* We can't that easily rip out L3 and L2, as the Xen pagetables are * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for @@ -1854,7 +1913,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) if (!pud_present(pud)) return 0; pa = pud_val(pud) & PTE_PFN_MASK; - if (pud_large(pud)) + if (pud_leaf(pud)) return pa + (vaddr & ~PUD_MASK); pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * @@ -1862,7 +1921,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) if (!pmd_present(pmd)) return 0; pa = pmd_val(pmd) & PTE_PFN_MASK; - if (pmd_large(pmd)) + if (pmd_leaf(pmd)) return pa + (vaddr & ~PMD_MASK); pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * @@ -2010,10 +2069,7 @@ void __init xen_reserve_special_pages(void) void __init xen_pt_check_e820(void) { - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table"); } static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; @@ -2074,6 +2130,23 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } +static void xen_enter_lazy_mmu(void) +{ + enter_lazy(XEN_LAZY_MMU); +} + +static void xen_flush_lazy_mmu(void) +{ + preempt_disable(); + + if (xen_get_lazy_mode() == XEN_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } + + preempt_enable(); +} + static void __init xen_post_allocator_init(void) { pv_ops.mmu.set_pte = xen_set_pte; @@ -2098,7 +2171,7 @@ static void xen_leave_lazy_mmu(void) { preempt_disable(); xen_mc_flush(); - paravirt_leave_lazy_mmu(); + leave_lazy(XEN_LAZY_MMU); preempt_enable(); } @@ -2114,7 +2187,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .flush_tlb_kernel = xen_flush_tlb, .flush_tlb_one_user = xen_flush_tlb_one_user, .flush_tlb_multi = xen_flush_tlb_multi, - .tlb_remove_table = tlb_remove_table, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, @@ -2148,19 +2220,16 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, -#if CONFIG_PGTABLE_LEVELS >= 5 .p4d_val = PV_CALLEE_SAVE(xen_p4d_val), .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), -#endif - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, + .enter_mmap = xen_enter_mmap, .exit_mmap = xen_exit_mmap, .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, + .enter = xen_enter_lazy_mmu, .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, + .flush = xen_flush_lazy_mmu, }, .set_fixmap = xen_set_fixmap, @@ -2177,10 +2246,6 @@ void __init xen_init_mmu_ops(void) memset(dummy_mapping, 0xff, PAGE_SIZE); } -/* Protected by xen_reservation_lock. */ -#define MAX_CONTIG_ORDER 9 /* 2MB */ -static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; - #define VOID_PTE (mfn_pte(0, __pgprot(0))) static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, unsigned long *in_frames, @@ -2194,13 +2259,13 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, mcs = __xen_mc_entry(0); if (in_frames) - in_frames[i] = virt_to_mfn(vaddr); + in_frames[i] = virt_to_mfn((void *)vaddr); MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + __set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY); if (out_frames) - out_frames[i] = virt_to_pfn(vaddr); + out_frames[i] = virt_to_pfn((void *)vaddr); } xen_mc_issue(0); } @@ -2242,7 +2307,7 @@ static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, MULTI_update_va_mapping(mcs.mc, vaddr, mfn_pte(mfn, PAGE_KERNEL), flags); - set_phys_to_machine(virt_to_pfn(vaddr), mfn); + set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn); } xen_mc_issue(0); @@ -2297,29 +2362,30 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, dma_addr_t *dma_handle) { - unsigned long *in_frames = discontig_frames, out_frame; + unsigned long *in_frames, out_frame; unsigned long flags; int success; unsigned long vstart = (unsigned long)phys_to_virt(pstart); - /* - * Currently an auto-translated guest will not perform I/O, nor will - * it require PAE page directories below 4GB. Therefore any calls to - * this function are redundant and can be ignored. - */ + if (unlikely(order > discontig_frames_order)) { + if (!discontig_frames_dyn) + return -ENOMEM; - if (unlikely(order > MAX_CONTIG_ORDER)) - return -ENOMEM; + if (alloc_discontig_frames(order)) + return -ENOMEM; + } memset((void *) vstart, 0, PAGE_SIZE << order); spin_lock_irqsave(&xen_reservation_lock, flags); + in_frames = discontig_frames; + /* 1. Zap current PTEs, remembering MFNs. */ xen_zap_pfn_range(vstart, order, in_frames, NULL); /* 2. Get a new contiguous memory extent. */ - out_frame = virt_to_pfn(vstart); + out_frame = virt_to_pfn((void *)vstart); success = xen_exchange_memory(1UL << order, 0, in_frames, 1, order, &out_frame, address_bits); @@ -2338,12 +2404,12 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) { - unsigned long *out_frames = discontig_frames, in_frame; + unsigned long *out_frames, in_frame; unsigned long flags; int success; unsigned long vstart; - if (unlikely(order > MAX_CONTIG_ORDER)) + if (unlikely(order > discontig_frames_order)) return; vstart = (unsigned long)phys_to_virt(pstart); @@ -2351,8 +2417,10 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); + out_frames = discontig_frames; + /* 1. Find start MFN of contiguous extent. */ - in_frame = virt_to_mfn(vstart); + in_frame = virt_to_mfn((void *)vstart); /* 2. Zap current PTEs. */ xen_zap_pfn_range(vstart, order, NULL, out_frames); @@ -2383,7 +2451,7 @@ static noinline void xen_flush_tlb_all(void) op->cmd = MMUEXT_TLB_FLUSH_ALL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -2501,7 +2569,7 @@ out: } EXPORT_SYMBOL_GPL(xen_remap_pfn); -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_VMCORE_INFO phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 07054572297f..7237d56a9d3f 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -23,26 +23,21 @@ #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/debugfs.h> +#include <linux/jump_label.h> +#include <linux/printk.h> #include <asm/xen/hypercall.h> -#include "multicalls.h" -#include "debugfs.h" +#include "xen-ops.h" #define MC_BATCH 32 -#define MC_DEBUG 0 - #define MC_ARGS (MC_BATCH * 16) struct mc_buffer { unsigned mcidx, argidx, cbidx; struct multicall_entry entries[MC_BATCH]; -#if MC_DEBUG - struct multicall_entry debug[MC_BATCH]; - void *caller[MC_BATCH]; -#endif unsigned char args[MC_ARGS]; struct callback { void (*fn)(void *); @@ -50,13 +45,105 @@ struct mc_buffer { } callbacks[MC_BATCH]; }; +struct mc_debug_data { + struct multicall_entry entries[MC_BATCH]; + void *caller[MC_BATCH]; + size_t argsz[MC_BATCH]; + unsigned long *args[MC_BATCH]; +}; + static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); +static struct mc_debug_data mc_debug_data_early __initdata; +static struct mc_debug_data __percpu *mc_debug_data_ptr; DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +static struct static_key mc_debug __ro_after_init; +static bool mc_debug_enabled __initdata; + +static struct mc_debug_data * __ref get_mc_debug(void) +{ + if (!mc_debug_data_ptr) + return &mc_debug_data_early; + + return this_cpu_ptr(mc_debug_data_ptr); +} + +static int __init xen_parse_mc_debug(char *arg) +{ + mc_debug_enabled = true; + static_key_slow_inc(&mc_debug); + + return 0; +} +early_param("xen_mc_debug", xen_parse_mc_debug); + +static int __init mc_debug_enable(void) +{ + unsigned long flags; + struct mc_debug_data __percpu *mcdb; + + if (!mc_debug_enabled) + return 0; + + mcdb = alloc_percpu(struct mc_debug_data); + if (!mcdb) { + pr_err("xen_mc_debug inactive\n"); + static_key_slow_dec(&mc_debug); + return -ENOMEM; + } + + /* Be careful when switching to percpu debug data. */ + local_irq_save(flags); + xen_mc_flush(); + mc_debug_data_ptr = mcdb; + local_irq_restore(flags); + + pr_info("xen_mc_debug active\n"); + + return 0; +} +early_initcall(mc_debug_enable); + +/* Number of parameters of hypercalls used via multicalls. */ +static const uint8_t hpcpars[] = { + [__HYPERVISOR_mmu_update] = 4, + [__HYPERVISOR_stack_switch] = 2, + [__HYPERVISOR_fpu_taskswitch] = 1, + [__HYPERVISOR_update_descriptor] = 2, + [__HYPERVISOR_update_va_mapping] = 3, + [__HYPERVISOR_mmuext_op] = 4, +}; + +static void print_debug_data(struct mc_buffer *b, struct mc_debug_data *mcdb, + int idx) +{ + unsigned int arg; + unsigned int opidx = mcdb->entries[idx].op & 0xff; + unsigned int pars = 0; + + pr_err(" call %2d: op=%lu result=%ld caller=%pS ", idx + 1, + mcdb->entries[idx].op, b->entries[idx].result, + mcdb->caller[idx]); + if (opidx < ARRAY_SIZE(hpcpars)) + pars = hpcpars[opidx]; + if (pars) { + pr_cont("pars="); + for (arg = 0; arg < pars; arg++) + pr_cont("%lx ", mcdb->entries[idx].args[arg]); + } + if (mcdb->argsz[idx]) { + pr_cont("args="); + for (arg = 0; arg < mcdb->argsz[idx] / 8; arg++) + pr_cont("%lx ", mcdb->args[idx][arg]); + } + pr_cont("\n"); +} + void xen_mc_flush(void) { struct mc_buffer *b = this_cpu_ptr(&mc_buffer); struct multicall_entry *mc; + struct mc_debug_data *mcdb = NULL; int ret = 0; unsigned long flags; int i; @@ -69,10 +156,11 @@ void xen_mc_flush(void) trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); -#if MC_DEBUG - memcpy(b->debug, b->entries, - b->mcidx * sizeof(struct multicall_entry)); -#endif + if (static_key_false(&mc_debug)) { + mcdb = get_mc_debug(); + memcpy(mcdb->entries, b->entries, + b->mcidx * sizeof(struct multicall_entry)); + } switch (b->mcidx) { case 0: @@ -103,21 +191,14 @@ void xen_mc_flush(void) pr_err("%d of %d multicall(s) failed: cpu %d\n", ret, b->mcidx, smp_processor_id()); for (i = 0; i < b->mcidx; i++) { - if (b->entries[i].result < 0) { -#if MC_DEBUG - pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pS\n", - i + 1, - b->debug[i].op, - b->debug[i].args[0], - b->entries[i].result, - b->caller[i]); -#else + if (static_key_false(&mc_debug)) { + print_debug_data(b, mcdb, i); + } else if (b->entries[i].result < 0) { pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n", i + 1, b->entries[i].op, b->entries[i].args[0], b->entries[i].result); -#endif } } } @@ -155,9 +236,13 @@ struct multicall_space __xen_mc_entry(size_t args) } ret.mc = &b->entries[b->mcidx]; -#if MC_DEBUG - b->caller[b->mcidx] = __builtin_return_address(0); -#endif + if (static_key_false(&mc_debug)) { + struct mc_debug_data *mcdb = get_mc_debug(); + + mcdb->caller[b->mcidx] = __builtin_return_address(0); + mcdb->argsz[b->mcidx] = args; + mcdb->args[b->mcidx] = (unsigned long *)(&b->args[argidx]); + } b->mcidx++; ret.args = &b->args[argidx]; b->argidx = argidx + args; diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h deleted file mode 100644 index 1c51b2c87f30..000000000000 --- a/arch/x86/xen/multicalls.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_MULTICALLS_H -#define _XEN_MULTICALLS_H - -#include <trace/events/xen.h> - -#include "xen-ops.h" - -/* Multicalls */ -struct multicall_space -{ - struct multicall_entry *mc; - void *args; -}; - -/* Allocate room for a multicall and its args */ -struct multicall_space __xen_mc_entry(size_t args); - -DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); - -/* Call to start a batch of multiple __xen_mc_entry()s. Must be - paired with xen_mc_issue() */ -static inline void xen_mc_batch(void) -{ - unsigned long flags; - - /* need to disable interrupts until this entry is complete */ - local_irq_save(flags); - trace_xen_mc_batch(paravirt_get_lazy_mode()); - __this_cpu_write(xen_mc_irq_flags, flags); -} - -static inline struct multicall_space xen_mc_entry(size_t args) -{ - xen_mc_batch(); - return __xen_mc_entry(args); -} - -/* Flush all pending multicalls */ -void xen_mc_flush(void); - -/* Issue a multicall if we're not in a lazy mode */ -static inline void xen_mc_issue(unsigned mode) -{ - trace_xen_mc_issue(mode); - - if ((paravirt_get_lazy_mode() & mode) == 0) - xen_mc_flush(); - - /* restore flags saved in xen_mc_batch */ - local_irq_restore(this_cpu_read(xen_mc_irq_flags)); -} - -/* Set up a callback to be called when the current batch is flushed */ -void xen_mc_callback(void (*fn)(void *), void *data); - -/* - * Try to extend the arguments of the previous multicall command. The - * previous command's op must match. If it does, then it attempts to - * extend the argument space allocated to the multicall entry by - * arg_size bytes. - * - * The returned multicall_space will return with mc pointing to the - * command on success, or NULL on failure, and args pointing to the - * newly allocated space. - */ -struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); - -#endif /* _XEN_MULTICALLS_H */ diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 58db86f7b384..56914e21e303 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -70,6 +70,7 @@ #include <linux/memblock.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/acpi.h> #include <asm/cache.h> #include <asm/setup.h> @@ -80,8 +81,8 @@ #include <asm/xen/hypervisor.h> #include <xen/balloon.h> #include <xen/grant_table.h> +#include <xen/hvc-console.h> -#include "multicalls.h" #include "xen-ops.h" #define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) @@ -134,11 +135,6 @@ static inline unsigned p2m_mid_index(unsigned long pfn) return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } -static inline unsigned p2m_index(unsigned long pfn) -{ - return pfn % P2M_PER_PAGE; -} - static void p2m_top_mfn_init(unsigned long *top) { unsigned i; @@ -182,13 +178,7 @@ static void p2m_init_identity(unsigned long *p2m, unsigned long pfn) static void * __ref alloc_p2m_page(void) { if (unlikely(!slab_is_available())) { - void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - return ptr; + return memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } return (void *)__get_free_page(GFP_KERNEL); @@ -560,7 +550,6 @@ int xen_alloc_p2m_entry(unsigned long pfn) /* Separately check the mid mfn level */ unsigned long missing_mfn; unsigned long mid_mfn_mfn; - unsigned long old_mfn; mid_mfn = alloc_p2m_page(); if (!mid_mfn) @@ -570,12 +559,12 @@ int xen_alloc_p2m_entry(unsigned long pfn) missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); - old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); - if (old_mfn != missing_mfn) { - free_p2m_page(mid_mfn); - mid_mfn = mfn_to_virt(old_mfn); - } else { + /* try_cmpxchg() updates missing_mfn on failure. */ + if (try_cmpxchg(top_mfn_p, &missing_mfn, mid_mfn_mfn)) { p2m_top_mfn_p[topidx] = mid_mfn; + } else { + free_p2m_page(mid_mfn); + mid_mfn = mfn_to_virt(missing_mfn); } } } else { @@ -736,7 +725,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, * immediate unmapping. */ map_ops[i].status = GNTST_general_error; - unmap[0].host_addr = map_ops[i].host_addr, + unmap[0].host_addr = map_ops[i].host_addr; unmap[0].handle = map_ops[i].handle; map_ops[i].handle = INVALID_GRANT_HANDLE; if (map_ops[i].flags & GNTMAP_device_map) @@ -746,7 +735,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, if (kmap_ops) { kmap_ops[i].status = GNTST_general_error; - unmap[1].host_addr = kmap_ops[i].host_addr, + unmap[1].host_addr = kmap_ops[i].host_addr; unmap[1].handle = kmap_ops[i].handle; kmap_ops[i].handle = INVALID_GRANT_HANDLE; if (kmap_ops[i].flags & GNTMAP_device_map) @@ -799,9 +788,104 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } +/* Remapped non-RAM areas */ +#define NR_NONRAM_REMAP 4 +static struct nonram_remap { + phys_addr_t maddr; + phys_addr_t paddr; + size_t size; +} xen_nonram_remap[NR_NONRAM_REMAP] __ro_after_init; +static unsigned int nr_nonram_remap __ro_after_init; + +/* + * Do the real remapping of non-RAM regions as specified in the + * xen_nonram_remap[] array. + * In case of an error just crash the system. + */ +void __init xen_do_remap_nonram(void) +{ + unsigned int i; + unsigned int remapped = 0; + const struct nonram_remap *remap = xen_nonram_remap; + unsigned long pfn, mfn, end_pfn; + + for (i = 0; i < nr_nonram_remap; i++) { + end_pfn = PFN_UP(remap->paddr + remap->size); + pfn = PFN_DOWN(remap->paddr); + mfn = PFN_DOWN(remap->maddr); + while (pfn < end_pfn) { + if (!set_phys_to_machine(pfn, mfn)) + panic("Failed to set p2m mapping for pfn=%lx mfn=%lx\n", + pfn, mfn); + + pfn++; + mfn++; + remapped++; + } + + remap++; + } + + pr_info("Remapped %u non-RAM page(s)\n", remapped); +} + +#ifdef CONFIG_ACPI +/* + * Xen variant of acpi_os_ioremap() taking potentially remapped non-RAM + * regions into account. + * Any attempt to map an area crossing a remap boundary will produce a + * WARN() splat. + * phys is related to remap->maddr on input and will be rebased to remap->paddr. + */ +static void __iomem *xen_acpi_os_ioremap(acpi_physical_address phys, + acpi_size size) +{ + unsigned int i; + const struct nonram_remap *remap = xen_nonram_remap; + + for (i = 0; i < nr_nonram_remap; i++) { + if (phys + size > remap->maddr && + phys < remap->maddr + remap->size) { + WARN_ON(phys < remap->maddr || + phys + size > remap->maddr + remap->size); + phys += remap->paddr - remap->maddr; + break; + } + } + + return x86_acpi_os_ioremap(phys, size); +} +#endif /* CONFIG_ACPI */ + +/* + * Add a new non-RAM remap entry. + * In case of no free entry found, just crash the system. + */ +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, + unsigned long size) +{ + BUG_ON((maddr & ~PAGE_MASK) != (paddr & ~PAGE_MASK)); + + if (nr_nonram_remap == NR_NONRAM_REMAP) { + xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n"); + BUG(); + } + +#ifdef CONFIG_ACPI + /* Switch to the Xen acpi_os_ioremap() variant. */ + if (nr_nonram_remap == 0) + acpi_os_ioremap = xen_acpi_os_ioremap; +#endif + + xen_nonram_remap[nr_nonram_remap].maddr = maddr; + xen_nonram_remap[nr_nonram_remap].paddr = paddr; + xen_nonram_remap[nr_nonram_remap].size = size; + + nr_nonram_remap++; +} + #ifdef CONFIG_XEN_DEBUG_FS #include <linux/debugfs.h> -#include "debugfs.h" static int p2m_dump_show(struct seq_file *m, void *v) { static const char * const type_name[] = { diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 21ecbe754cb2..8f89ce0b67e3 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -2,6 +2,7 @@ #include <linux/types.h> #include <linux/interrupt.h> +#include <asm/msr.h> #include <asm/xen/hypercall.h> #include <xen/xen.h> #include <xen/page.h> @@ -10,7 +11,6 @@ #include <xen/interface/xenpmu.h> #include "xen-ops.h" -#include "pmu.h" /* x86_pmu.handle_irq definition */ #include "../events/perf_event.h" @@ -129,8 +129,12 @@ static inline uint32_t get_fam15h_addr(u32 addr) return addr; } -static inline bool is_amd_pmu_msr(unsigned int msr) +static bool is_amd_pmu_msr(u32 msr) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return false; + if ((msr >= MSR_F15H_PERF_CTL && msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || (msr >= MSR_K7_EVNTSEL0 && @@ -140,10 +144,15 @@ static inline bool is_amd_pmu_msr(unsigned int msr) return false; } -static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index) { u32 msr_index_pmc; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) + return false; + switch (msr_index) { case MSR_CORE_PERF_FIXED_CTR_CTRL: case MSR_IA32_DS_AREA: @@ -186,8 +195,7 @@ static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) } } -static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, - int index, bool is_read) +static bool xen_intel_pmu_emulate(u32 msr, u64 *val, int type, int index, bool is_read) { uint64_t *reg = NULL; struct xen_pmu_intel_ctxt *ctxt; @@ -249,7 +257,7 @@ static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, return false; } -static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) +static bool xen_amd_pmu_emulate(u32 msr, u64 *val, bool is_read) { uint64_t *reg = NULL; int i, off = 0; @@ -290,51 +298,20 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) return false; } -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) -{ - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - if (is_amd_pmu_msr(msr)) { - if (!xen_amd_pmu_emulate(msr, val, 1)) - *val = native_read_msr_safe(msr, err); - return true; - } - } else { - int type, index; - - if (is_intel_pmu_msr(msr, &type, &index)) { - if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) - *val = native_read_msr_safe(msr, err); - return true; - } - } - - return false; -} - -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) +bool pmu_msr_chk_emulated(u32 msr, u64 *val, bool is_read) { - uint64_t val = ((uint64_t)high << 32) | low; + int type, index = 0; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - if (is_amd_pmu_msr(msr)) { - if (!xen_amd_pmu_emulate(msr, &val, 0)) - *err = native_write_msr_safe(msr, low, high); - return true; - } - } else { - int type, index; + if (is_amd_pmu_msr(msr)) + return xen_amd_pmu_emulate(msr, val, is_read); - if (is_intel_pmu_msr(msr, &type, &index)) { - if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) - *err = native_write_msr_safe(msr, low, high); - return true; - } - } + if (is_intel_pmu_msr(msr, &type, &index)) + return xen_intel_pmu_emulate(msr, val, type, index, is_read); return false; } -static unsigned long long xen_amd_read_pmc(int counter) +static u64 xen_amd_read_pmc(int counter) { struct xen_pmu_amd_ctxt *ctxt; uint64_t *counter_regs; @@ -342,11 +319,12 @@ static unsigned long long xen_amd_read_pmc(int counter) uint8_t xenpmu_flags = get_xenpmu_flags(); if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { - uint32_t msr; - int err; + u32 msr; + u64 val; msr = amd_counters_base + (counter * amd_msr_step); - return native_read_msr_safe(msr, &err); + native_read_msr_safe(msr, &val); + return val; } ctxt = &xenpmu_data->pmu.c.amd; @@ -354,7 +332,7 @@ static unsigned long long xen_amd_read_pmc(int counter) return counter_regs[counter]; } -static unsigned long long xen_intel_read_pmc(int counter) +static u64 xen_intel_read_pmc(int counter) { struct xen_pmu_intel_ctxt *ctxt; uint64_t *fixed_counters; @@ -363,15 +341,16 @@ static unsigned long long xen_intel_read_pmc(int counter) uint8_t xenpmu_flags = get_xenpmu_flags(); if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { - uint32_t msr; - int err; + u32 msr; + u64 val; if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff); else msr = MSR_IA32_PERFCTR0 + counter; - return native_read_msr_safe(msr, &err); + native_read_msr_safe(msr, &val); + return val; } ctxt = &xenpmu_data->pmu.c.intel; @@ -384,7 +363,7 @@ static unsigned long long xen_intel_read_pmc(int counter) return arch_cntr_pair[counter].counter; } -unsigned long long xen_read_pmc(int counter) +u64 xen_read_pmc(int counter) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return xen_amd_read_pmc(counter); diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h deleted file mode 100644 index 65c58894fc79..000000000000 --- a/arch/x86/xen/pmu.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __XEN_PMU_H -#define __XEN_PMU_H - -#include <xen/interface/xenpmu.h> - -extern bool is_xen_pmu; - -irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); -#ifdef CONFIG_XEN_HAVE_VPMU -void xen_pmu_init(int cpu); -void xen_pmu_finish(int cpu); -#else -static inline void xen_pmu_init(int cpu) {} -static inline void xen_pmu_finish(int cpu) {} -#endif -bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); -bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); -int pmu_apic_update(uint32_t reg); -unsigned long long xen_read_pmc(int counter); - -#endif /* __XEN_PMU_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index cfa99e8f054b..3823e52aef52 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -6,19 +6,21 @@ */ #include <linux/init.h> +#include <linux/iscsi_ibft.h> #include <linux/sched.h> +#include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/pm.h> #include <linux/memblock.h> #include <linux/cpuidle.h> #include <linux/cpufreq.h> #include <linux/memory_hotplug.h> +#include <linux/acpi.h> #include <asm/elf.h> #include <asm/vdso.h> #include <asm/e820/api.h> #include <asm/setup.h> -#include <asm/acpi.h> #include <asm/numa.h> #include <asm/idtentry.h> #include <asm/xen/hypervisor.h> @@ -32,19 +34,18 @@ #include <xen/features.h> #include <xen/hvc-console.h> #include "xen-ops.h" -#include "mmu.h" #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) -/* Amount of extra memory space we add to the e820 ranges */ -struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; - -/* Number of pages released from the initial allocation. */ -unsigned long xen_released_pages; +/* Memory map would allow PCI passthrough. */ +bool xen_pv_pci_possible; /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; +/* Number of initially usable memory pages. */ +static unsigned long ini_nr_pages __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -59,18 +60,6 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; -/* - * The maximum amount of extra memory compared to the base size. The - * main scaling factor is the size of struct page. At extreme ratios - * of base:extra, all the base memory can be filled with page - * structures for the extra memory, leaving no space for anything - * else. - * - * 10x seems like a reasonable balance between scaling flexibility and - * leaving a practically usable system. - */ -#define EXTRA_MEM_RATIO (10) - static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); static void __init xen_parse_512gb(void) @@ -85,41 +74,12 @@ static void __init xen_parse_512gb(void) arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); if (!arg) val = true; - else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val)) return; xen_512gb_limit = val; } -static void __init xen_add_extra_mem(unsigned long start_pfn, - unsigned long n_pfns) -{ - int i; - - /* - * No need to check for zero size, should happen rarely and will only - * write a new entry regarded to be unused due to zero size. - */ - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - /* Add new region. */ - if (xen_extra_mem[i].n_pfns == 0) { - xen_extra_mem[i].start_pfn = start_pfn; - xen_extra_mem[i].n_pfns = n_pfns; - break; - } - /* Append to existing region. */ - if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == - start_pfn) { - xen_extra_mem[i].n_pfns += n_pfns; - break; - } - } - if (i == XEN_EXTRA_MEM_MAX_REGIONS) - printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - - memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); -} - static void __init xen_del_extra_mem(unsigned long start_pfn, unsigned long n_pfns) { @@ -252,7 +212,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages) + unsigned long end_pfn) { unsigned long pfn, end; int ret; @@ -260,7 +220,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN_ON(start_pfn > end_pfn); /* Release pages first. */ - end = min(end_pfn, nr_pages); + end = min(end_pfn, ini_nr_pages); for (pfn = start_pfn; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -338,7 +298,7 @@ static void __init xen_do_set_identity_and_remap_chunk( WARN_ON(size == 0); - mfn_save = virt_to_mfn(buf); + mfn_save = virt_to_mfn((void *)buf); for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; ident_pfn_iter < ident_end_pfn; @@ -381,15 +341,14 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. */ static unsigned long __init xen_set_identity_and_remap_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long remap_pfn) + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; unsigned long n = end_pfn - start_pfn; if (remap_pfn == 0) - remap_pfn = nr_pages; + remap_pfn = ini_nr_pages; while (i < n) { unsigned long cur_pfn = start_pfn + i; @@ -398,19 +357,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long remap_range_size; /* Do not remap pages beyond the current allocation */ - if (cur_pfn >= nr_pages) { + if (cur_pfn >= ini_nr_pages) { /* Identity map remaining pages */ set_phys_range_identity(cur_pfn, cur_pfn + size); break; } - if (cur_pfn + size > nr_pages) - size = nr_pages - cur_pfn; + if (cur_pfn + size > ini_nr_pages) + size = ini_nr_pages - cur_pfn; remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages); + cur_pfn + left); break; } /* Adjust size to fit in current e820 RAM region */ @@ -437,18 +396,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } static unsigned long __init xen_count_remap_pages( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pages) { - if (start_pfn >= nr_pages) + if (start_pfn >= ini_nr_pages) return remap_pages; - return remap_pages + min(end_pfn, nr_pages) - start_pfn; + return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn; } -static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, +static unsigned long __init xen_foreach_remap_area( unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, - unsigned long nr_pages, unsigned long last_val)) + unsigned long last_val)) { phys_addr_t start = 0; unsigned long ret_val = 0; @@ -476,8 +435,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) - ret_val = func(start_pfn, end_pfn, nr_pages, - ret_val); + ret_val = func(start_pfn, end_pfn, ret_val); start = end; } } @@ -501,7 +459,7 @@ void __init xen_remap_memory(void) unsigned long pfn_s = ~0UL; unsigned long len = 0; - mfn_save = virt_to_mfn(buf); + mfn_save = virt_to_mfn((void *)buf); while (xen_remap_mfn != INVALID_P2M_ENTRY) { /* Map the remap information */ @@ -534,6 +492,8 @@ void __init xen_remap_memory(void) set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); + + xen_do_remap_nonram(); } static unsigned long __init xen_get_pages_limit(void) @@ -607,7 +567,7 @@ static void __init xen_ignore_unusable(void) } } -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820_entry *entry; unsigned mapcnt; @@ -665,6 +625,111 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) } /* + * Swap a non-RAM E820 map entry with RAM above ini_nr_pages. + * Note that the E820 map is modified accordingly, but the P2M map isn't yet. + * The adaption of the P2M must be deferred until page allocation is possible. + */ +static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t mem_end = PFN_PHYS(ini_nr_pages); + phys_addr_t swap_addr, swap_size, entry_end; + + swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr); + swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size); + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + entry_end = entry->addr + entry->size; + if (entry->type == E820_TYPE_RAM && entry->size >= swap_size && + entry_end - swap_size >= mem_end) { + /* Reduce RAM entry by needed space (whole pages). */ + entry->size -= swap_size; + + /* Add new entry at the end of E820 map. */ + entry = xen_e820_table.entries + + xen_e820_table.nr_entries; + xen_e820_table.nr_entries++; + + /* Fill new entry (keep size and page offset). */ + entry->type = swap_entry->type; + entry->addr = entry_end - swap_size + + swap_addr - swap_entry->addr; + entry->size = swap_entry->size; + + /* Convert old entry to RAM, align to pages. */ + swap_entry->type = E820_TYPE_RAM; + swap_entry->addr = swap_addr; + swap_entry->size = swap_size; + + /* Remember PFN<->MFN relation for P2M update. */ + xen_add_remap_nonram(swap_addr, entry_end - swap_size, + swap_size); + + /* Order E820 table and merge entries. */ + e820__update_table(&xen_e820_table); + + return; + } + + entry++; + } + + xen_raw_console_write("No suitable area found for required E820 entry remapping action\n"); + BUG(); +} + +/* + * Look for non-RAM memory types in a specific guest physical area and move + * those away if possible (ACPI NVS only for now). + */ +static void __init xen_e820_resolve_conflicts(phys_addr_t start, + phys_addr_t size) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t end; + + if (!size) + return; + + end = start + size; + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + if (entry->addr >= end) + return; + + if (entry->addr + entry->size > start && + entry->type == E820_TYPE_NVS) + xen_e820_swap_entry_with_ram(entry); + + entry++; + } +} + +/* + * Check for an area in physical memory to be usable for non-movable purposes. + * An area is considered to usable if the used E820 map lists it to be RAM or + * some other type which can be moved to higher PFNs while keeping the MFNs. + * In case the area is not usable, crash the system with an error message. + */ +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component) +{ + xen_e820_resolve_conflicts(start, size); + + if (!xen_is_e820_reserved(start, size)) + return; + + xen_raw_console_write("Xen hypervisor allocated "); + xen_raw_console_write(component); + xen_raw_console_write(" memory conflicts with E820 map\n"); + BUG(); +} + +/* * Like memcpy, but with physical addresses for dest and src. */ static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, @@ -723,20 +788,20 @@ static void __init xen_reserve_xen_mfnlist(void) **/ char * __init xen_memory_setup(void) { - unsigned long max_pfn, pfn_s, n_pfns; + unsigned long pfn_s, n_pfns; phys_addr_t mem_end, addr, size, chunk_size; u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; + unsigned long maxmem_pages; int i; int op; xen_parse_512gb(); - max_pfn = xen_get_pages_limit(); - max_pfn = min(max_pfn, xen_start_info->nr_pages); - mem_end = PFN_PHYS(max_pfn); + ini_nr_pages = min(xen_get_pages_limit(), xen_start_info->nr_pages); + mem_end = PFN_PHYS(ini_nr_pages); memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); @@ -763,27 +828,58 @@ char * __init xen_memory_setup(void) BUG_ON(memmap.nr_entries == 0); xen_e820_table.nr_entries = memmap.nr_entries; - /* - * Xen won't allow a 1:1 mapping to be created to UNUSABLE - * regions, so if we're using the machine memory map leave the - * region as RAM as it is in the pseudo-physical map. - * - * UNUSABLE regions in domUs are not handled and will need - * a patch in the future. - */ - if (xen_initial_domain()) + if (xen_initial_domain()) { + /* + * Xen won't allow a 1:1 mapping to be created to UNUSABLE + * regions, so if we're using the machine memory map leave the + * region as RAM as it is in the pseudo-physical map. + * + * UNUSABLE regions in domUs are not handled and will need + * a patch in the future. + */ xen_ignore_unusable(); +#ifdef CONFIG_ISCSI_IBFT_FIND + /* Reserve 0.5 MiB to 1 MiB region so iBFT can be found */ + xen_e820_table.entries[xen_e820_table.nr_entries].addr = IBFT_START; + xen_e820_table.entries[xen_e820_table.nr_entries].size = IBFT_END - IBFT_START; + xen_e820_table.entries[xen_e820_table.nr_entries].type = E820_TYPE_RESERVED; + xen_e820_table.nr_entries++; +#endif + } + /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); + /* + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. + */ + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* + * Check for a conflict of the xen_start_info memory with the target + * E820 map. + */ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); + + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ - max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages); + max_pages += xen_foreach_remap_area(xen_count_remap_pages); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; + if (max_pages > ini_nr_pages) + extra_pages += max_pages - ini_nr_pages; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -792,8 +888,8 @@ char * __init xen_memory_setup(void) * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. */ - extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages, max_pages - max_pfn); + maxmem_pages = EXTRA_MEM_RATIO * min(ini_nr_pages, PFN_DOWN(MAXMEM)); + extra_pages = min3(maxmem_pages, extra_pages, max_pages - ini_nr_pages); i = 0; addr = xen_e820_table.entries[0].addr; size = xen_e820_table.entries[0].size; @@ -803,6 +899,9 @@ char * __init xen_memory_setup(void) chunk_size = size; type = xen_e820_table.entries[i].type; + if (type == E820_TYPE_RESERVED) + xen_pv_pci_possible = true; + if (type == E820_TYPE_RAM) { if (addr < mem_end) { chunk_size = min(size, mem_end - addr); @@ -846,23 +945,6 @@ char * __init xen_memory_setup(void) e820__update_table(e820_table); - /* - * Check whether the kernel itself conflicts with the target E820 map. - * Failing now is better than running into weird problems later due - * to relocating (and even reusing) pages with kernel text or data. - */ - if (xen_is_e820_reserved(__pa_symbol(_text), - __pa_symbol(__bss_stop) - __pa_symbol(_text))) { - xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); - BUG(); - } - - /* - * Check for a conflict of the hypervisor supplied page tables with - * the target E820 map. - */ - xen_pt_check_e820(); - xen_reserve_xen_mfnlist(); /* Check for a conflict of the initrd with the target E820 map. */ @@ -890,7 +972,7 @@ char * __init xen_memory_setup(void) * Set identity map on non-RAM pages and prepare remapping the * underlying RAM. */ - xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk); + xen_foreach_remap_area(xen_set_identity_and_remap_chunk); pr_info("Released %ld page(s)\n", xen_released_pages); @@ -910,17 +992,9 @@ static int register_callback(unsigned type, const void *func) void xen_enable_sysenter(void) { - int ret; - unsigned sysenter_feature; - - sysenter_feature = X86_FEATURE_SYSENTER32; - - if (!boot_cpu_has(sysenter_feature)) - return; - - ret = register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat); - if(ret != 0) - setup_clear_cpu_cap(sysenter_feature); + if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) && + register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat)) + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); } void xen_enable_syscall(void) @@ -934,22 +1008,15 @@ void xen_enable_syscall(void) mechanism for syscalls. */ } - if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { - ret = register_callback(CALLBACKTYPE_syscall32, - xen_entry_SYSCALL_compat); - if (ret != 0) - setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); - } + if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) && + register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat)) + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); } static void __init xen_pvmmu_arch_setup(void) { - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - HYPERVISOR_vm_assist(VMASST_CMD_enable, - VMASST_TYPE_pae_extended_cr3); - if (register_callback(CALLBACKTYPE_event, xen_asm_exc_xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c3e1f9a7d43a..05f92c812ac8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -9,7 +9,6 @@ #include <xen/hvc-console.h> #include "xen-ops.h" -#include "smp.h" static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; @@ -32,30 +31,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) void xen_smp_intr_free(unsigned int cpu) { + kfree(per_cpu(xen_resched_irq, cpu).name); + per_cpu(xen_resched_irq, cpu).name = NULL; if (per_cpu(xen_resched_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); per_cpu(xen_resched_irq, cpu).irq = -1; - kfree(per_cpu(xen_resched_irq, cpu).name); - per_cpu(xen_resched_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfunc_irq, cpu).name); + per_cpu(xen_callfunc_irq, cpu).name = NULL; if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL); per_cpu(xen_callfunc_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfunc_irq, cpu).name); - per_cpu(xen_callfunc_irq, cpu).name = NULL; } + kfree(per_cpu(xen_debug_irq, cpu).name); + per_cpu(xen_debug_irq, cpu).name = NULL; if (per_cpu(xen_debug_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL); per_cpu(xen_debug_irq, cpu).irq = -1; - kfree(per_cpu(xen_debug_irq, cpu).name); - per_cpu(xen_debug_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); + per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq, NULL); per_cpu(xen_callfuncsingle_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); - per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; } } @@ -65,6 +64,9 @@ int xen_smp_intr_init(unsigned int cpu) char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + if (!resched_name) + goto fail_mem; + per_cpu(xen_resched_irq, cpu).name = resched_name; rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, xen_reschedule_interrupt, @@ -74,9 +76,11 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_resched_irq, cpu).irq = rc; - per_cpu(xen_resched_irq, cpu).name = resched_name; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + if (!callfunc_name) + goto fail_mem; + per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, xen_call_function_interrupt, @@ -86,10 +90,13 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfunc_irq, cpu).irq = rc; - per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; if (!xen_fifo_events) { debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + if (!debug_name) + goto fail_mem; + + per_cpu(xen_debug_irq, cpu).name = debug_name; rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, IRQF_PERCPU | IRQF_NOBALANCING, @@ -97,10 +104,13 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_debug_irq, cpu).irq = rc; - per_cpu(xen_debug_irq, cpu).name = debug_name; } callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + if (!callfunc_name) + goto fail_mem; + + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, cpu, xen_call_function_single_interrupt, @@ -110,10 +120,11 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; - per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; return 0; + fail_mem: + rc = -ENOMEM; fail: xen_smp_intr_free(cpu); return rc; @@ -123,8 +134,6 @@ void __init xen_smp_cpus_done(unsigned int max_cpus) { if (xen_hvm_domain()) native_smp_cpus_done(max_cpus); - else - calculate_max_logical_packages(); } void xen_smp_send_reschedule(int cpu) diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h deleted file mode 100644 index bd02f9d50107..000000000000 --- a/arch/x86/xen/smp.h +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_SMP_H - -#ifdef CONFIG_SMP -extern void xen_send_IPI_mask(const struct cpumask *mask, - int vector); -extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, - int vector); -extern void xen_send_IPI_allbutself(int vector); -extern void xen_send_IPI_all(int vector); -extern void xen_send_IPI_self(int vector); - -extern int xen_smp_intr_init(unsigned int cpu); -extern void xen_smp_intr_free(unsigned int cpu); -int xen_smp_intr_init_pv(unsigned int cpu); -void xen_smp_intr_free_pv(unsigned int cpu); - -void xen_smp_cpus_done(unsigned int max_cpus); - -void xen_smp_send_reschedule(int cpu); -void xen_smp_send_call_function_ipi(const struct cpumask *mask); -void xen_smp_send_call_function_single_ipi(int cpu); - -struct xen_common_irq { - int irq; - char *name; -}; -#else /* CONFIG_SMP */ - -static inline int xen_smp_intr_init(unsigned int cpu) -{ - return 0; -} -static inline void xen_smp_intr_free(unsigned int cpu) {} - -static inline int xen_smp_intr_init_pv(unsigned int cpu) -{ - return 0; -} -static inline void xen_smp_intr_free_pv(unsigned int cpu) {} -#endif /* CONFIG_SMP */ - -#endif diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index b70afdff419c..485c1d8804f7 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -5,8 +5,6 @@ #include <xen/events.h> #include "xen-ops.h" -#include "smp.h" - static void __init xen_hvm_smp_prepare_boot_cpu(void) { @@ -55,18 +53,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) } #ifdef CONFIG_HOTPLUG_CPU -static void xen_hvm_cpu_die(unsigned int cpu) +static void xen_hvm_cleanup_dead_cpu(unsigned int cpu) { - if (common_cpu_die(cpu) == 0) { - if (xen_have_vector_callback) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); - } + if (xen_have_vector_callback) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); } } #else -static void xen_hvm_cpu_die(unsigned int cpu) +static void xen_hvm_cleanup_dead_cpu(unsigned int cpu) { BUG(); } @@ -77,7 +73,7 @@ void __init xen_hvm_smp_init(void) smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_cpus_done = xen_smp_cpus_done; - smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.cleanup_dead_cpu = xen_hvm_cleanup_dead_cpu; if (!xen_have_vector_callback) { #ifdef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index ba7af2eca755..9bb8ff8bff30 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -29,6 +29,7 @@ #include <asm/idtentry.h> #include <asm/desc.h> #include <asm/cpu.h> +#include <asm/apic.h> #include <asm/io_apic.h> #include <xen/interface/xen.h> @@ -45,9 +46,6 @@ #include <xen/hvc-console.h> #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" -#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -55,14 +53,15 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); -void asm_cpu_bringup_and_idle(void); static void cpu_bringup(void) { int cpu; cr4_init(); + cpuhp_ap_sync_alive(); cpu_init(); + fpu__init_cpu(); touch_softlockup_watchdog(); /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ @@ -71,8 +70,7 @@ static void cpu_bringup(void) xen_enable_syscall(); } cpu = smp_processor_id(); - smp_store_cpu_info(cpu); - cpu_data(cpu).x86_max_cores = 1; + identify_secondary_cpu(cpu); set_cpu_sibling_map(cpu); speculative_store_bypass_ht_init(); @@ -83,7 +81,7 @@ static void cpu_bringup(void) set_cpu_online(cpu, true); - cpu_set_state_online(cpu); /* Implies full memory barrier. */ + smp_mb(); /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -97,18 +95,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void) void xen_smp_intr_free_pv(unsigned int cpu) { + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; if (per_cpu(xen_irq_work, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); per_cpu(xen_irq_work, cpu).irq = -1; - kfree(per_cpu(xen_irq_work, cpu).name); - per_cpu(xen_irq_work, cpu).name = NULL; } + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); per_cpu(xen_pmu_irq, cpu).irq = -1; - kfree(per_cpu(xen_pmu_irq, cpu).name); - per_cpu(xen_pmu_irq, cpu).name = NULL; } } @@ -118,6 +116,7 @@ int xen_smp_intr_init_pv(unsigned int cpu) char *callfunc_name, *pmu_name; callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + per_cpu(xen_irq_work, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, cpu, xen_irq_work_interrupt, @@ -127,10 +126,10 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_irq_work, cpu).irq = rc; - per_cpu(xen_irq_work, cpu).name = callfunc_name; if (is_xen_pmu) { pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + per_cpu(xen_pmu_irq, cpu).name = pmu_name; rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, xen_pmu_irq_handler, IRQF_PERCPU|IRQF_NOBALANCING, @@ -138,7 +137,6 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_pmu_irq, cpu).irq = rc; - per_cpu(xen_pmu_irq, cpu).name = pmu_name; } return 0; @@ -148,40 +146,18 @@ int xen_smp_intr_init_pv(unsigned int cpu) return rc; } -static void __init _get_smp_config(unsigned int early) +static void __init xen_pv_smp_config(void) { - int i, rc; - unsigned int subtract = 0; - - if (early) - return; - - num_processors = 0; - disabled_cpus = 0; - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } else { - set_cpu_possible(i, false); - set_cpu_present(i, false); - subtract++; - } - } -#ifdef CONFIG_HOTPLUG_CPU - /* This is akin to using 'nr_cpus' on the Linux command line. - * Which is OK as when we use 'dom0_max_vcpus=X' we can only - * have up to X, while nr_cpu_ids is greater than X. This - * normally is not a problem, except when CPU hotplugging - * is involved and then there might be more than X CPUs - * in the guest - which will not work as there is no - * hypercall to expand the max number of VCPUs an already - * running guest has. So cap it up to X. */ - if (subtract) - nr_cpu_ids = nr_cpu_ids - subtract; -#endif + u32 apicid = 0; + int i; + + topology_register_boot_apic(apicid); + for (i = 0; i < nr_cpu_ids; i++) + topology_register_apic(apicid++, CPU_ACPIID_INVALID, true); + + /* Pretend to be a proper enumerated system */ + smp_found_config = 1; } static void __init xen_pv_smp_prepare_boot_cpu(void) @@ -209,7 +185,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { char *m = (max_cpus == 0) ? "The nosmp parameter is incompatible with Xen; " \ "use Xen dom0_max_vcpus=1 parameter" : @@ -222,8 +198,6 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) smp_prepare_cpus_common(); - cpu_data(0).x86_max_cores = 1; - speculative_store_bypass_ht_init(); xen_pmu_init(0); @@ -254,15 +228,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) struct desc_struct *gdt; unsigned long gdt_mfn; - /* used to tell cpu_init() that it can proceed with initialization */ - cpumask_set_cpu(cpu, cpu_callout_mask); if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (ctxt == NULL) { cpumask_clear_cpu(cpu, xen_cpu_initialized_map); - cpumask_clear_cpu(cpu, cpu_callout_mask); return -ENOMEM; } @@ -316,7 +287,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) +static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle) { int rc; @@ -326,14 +297,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) xen_setup_runstate_info(cpu); - /* - * PV VCPUs are always successfully taken down (see 'while' loop - * in xen_cpu_die()), so -EBUSY is an error. - */ - rc = cpu_check_up_prepare(cpu); - if (rc) - return rc; - /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -343,15 +306,20 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) xen_pmu_init(cpu); - rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); - BUG_ON(rc); - - while (cpu_report_state(cpu) != CPU_ONLINE) - HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + /* + * Why is this a BUG? If the hypercall fails then everything can be + * rolled back, no? + */ + BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)); return 0; } +static void xen_pv_poll_sync_state(void) +{ + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); +} + #ifdef CONFIG_HOTPLUG_CPU static int xen_pv_cpu_disable(void) { @@ -367,35 +335,26 @@ static int xen_pv_cpu_disable(void) static void xen_pv_cpu_die(unsigned int cpu) { - while (HYPERVISOR_vcpu_op(VCPUOP_is_up, - xen_vcpu_nr(cpu), NULL)) { + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), NULL)) { __set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(HZ/10); } +} - if (common_cpu_die(cpu) == 0) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); - xen_pmu_finish(cpu); - } +static void xen_pv_cleanup_dead_cpu(unsigned int cpu) +{ + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } -static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ +static void __noreturn xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); - cpu_bringup(); - /* - * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) - * clears certain data that the cpu_idle loop (which called us - * and that we return from) expects. The only way to get that - * data back is to call: - */ - tick_nohz_idle_enter(); - tick_nohz_idle_stop_tick_protected(); - - cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE); + xen_cpu_bringup_again((unsigned long)task_pt_regs(current)); + BUG(); } #else /* !CONFIG_HOTPLUG_CPU */ @@ -409,7 +368,12 @@ static void xen_pv_cpu_die(unsigned int cpu) BUG(); } -static void xen_pv_play_dead(void) +static void xen_pv_cleanup_dead_cpu(unsigned int cpu) +{ + BUG(); +} + +static void __noreturn xen_pv_play_dead(void) { BUG(); } @@ -442,13 +406,29 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +void __init xen_smp_count_cpus(void) +{ + unsigned int cpus; + + for (cpus = 0; cpus < nr_cpu_ids; cpus++) { + if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpus, NULL) < 0) + break; + } + + pr_info("Xen PV: Detected %u vCPUS\n", cpus); + if (cpus < nr_cpu_ids) + set_nr_cpu_ids(cpus); +} + static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_pv_smp_prepare_cpus, .smp_cpus_done = xen_smp_cpus_done, - .cpu_up = xen_pv_cpu_up, + .kick_ap_alive = xen_pv_kick_ap, .cpu_die = xen_pv_cpu_die, + .cleanup_dead_cpu = xen_pv_cleanup_dead_cpu, + .poll_sync_state = xen_pv_poll_sync_state, .cpu_disable = xen_pv_cpu_disable, .play_dead = xen_pv_play_dead, @@ -464,6 +444,12 @@ void __init xen_smp_init(void) smp_ops = xen_smp_ops; /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = _get_smp_config; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + + /* XEN/PV Dom0 has halfways sane topology information via CPUID/MADT */ + if (xen_initial_domain()) + x86_init.mpparse.parse_smp_cfg = x86_init_noop; + else + x86_init.mpparse.parse_smp_cfg = xen_pv_smp_config; } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 043c73dfd2c9..8e4efe0fb6f9 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -18,7 +18,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(atomic_t, xen_qlock_wait_nest); -static bool xen_pvspin = true; static void xen_qlock_kick(int cpu) { @@ -68,13 +67,14 @@ void xen_init_lock_cpu(int cpu) int irq; char *name; - if (!xen_pvspin) + if (nopvspin) return; WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", cpu, per_cpu(lock_kicker_irq, cpu)); name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + per_cpu(irq_name, cpu) = name; irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, dummy_handler, @@ -85,7 +85,6 @@ void xen_init_lock_cpu(int cpu) if (irq >= 0) { disable_irq(irq); /* make sure it's never delivered */ per_cpu(lock_kicker_irq, cpu) = irq; - per_cpu(irq_name, cpu) = name; } printk("cpu %d spinlock event irq %d\n", cpu, irq); @@ -95,9 +94,11 @@ void xen_uninit_lock_cpu(int cpu) { int irq; - if (!xen_pvspin) + if (nopvspin) return; + kfree(per_cpu(irq_name, cpu)); + per_cpu(irq_name, cpu) = NULL; /* * When booting the kernel with 'mitigations=auto,nosmt', the secondary * CPUs are not activated, and lock_kicker_irq is not initialized. @@ -108,8 +109,6 @@ void xen_uninit_lock_cpu(int cpu) unbind_from_irqhandler(irq, NULL); per_cpu(lock_kicker_irq, cpu) = -1; - kfree(per_cpu(irq_name, cpu)); - per_cpu(irq_name, cpu) = NULL; } PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); @@ -125,10 +124,10 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); void __init xen_init_spinlocks(void) { /* Don't need to use pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1 || nopvspin) - xen_pvspin = false; + if (num_possible_cpus() == 1) + nopvspin = true; - if (!xen_pvspin) { + if (nopvspin) { printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); static_branch_disable(&virt_spin_lock_key); return; @@ -143,12 +142,3 @@ void __init xen_init_spinlocks(void) pv_ops.lock.kick = xen_qlock_kick; pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } - -static __init int xen_parse_nopvspin(char *arg) -{ - pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n"); - xen_pvspin = false; - return 0; -} -early_param("xen_nopvspin", xen_parse_nopvspin); - diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d83152c761b..ba2f17e64321 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -13,10 +13,9 @@ #include <asm/xen/hypercall.h> #include <asm/xen/page.h> #include <asm/fixmap.h> +#include <asm/msr.h> #include "xen-ops.h" -#include "mmu.h" -#include "pmu.h" static DEFINE_PER_CPU(u64, spec_ctrl); @@ -41,7 +40,7 @@ void xen_arch_post_suspend(int cancelled) static void xen_vcpu_notify_restore(void *data) { if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) - wrmsrl(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl)); + wrmsrq(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl)); /* Boot processor notified via generic timekeeping_resume() */ if (smp_processor_id() == 0) @@ -57,9 +56,9 @@ static void xen_vcpu_notify_suspend(void *data) tick_suspend_local(); if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { - rdmsrl(MSR_IA32_SPEC_CTRL, tmp); + rdmsrq(MSR_IA32_SPEC_CTRL, tmp); this_cpu_write(spec_ctrl, tmp); - wrmsrl(MSR_IA32_SPEC_CTRL, 0); + wrmsrq(MSR_IA32_SPEC_CTRL, 0); } } diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c index 9d548b0c772f..0c4f7554b7cc 100644 --- a/arch/x86/xen/suspend_hvm.c +++ b/arch/x86/xen/suspend_hvm.c @@ -5,6 +5,7 @@ #include <xen/hvm.h> #include <xen/features.h> #include <xen/interface/features.h> +#include <xen/events.h> #include "xen-ops.h" @@ -14,6 +15,13 @@ void xen_hvm_post_suspend(int suspend_cancelled) xen_hvm_init_shared_info(); xen_vcpu_restore(); } - xen_setup_callback_vector(); + if (xen_percpu_upcall) { + unsigned int cpu; + + for_each_online_cpu(cpu) + BUG_ON(xen_set_upcall_vector(cpu)); + } else { + xen_setup_callback_vector(); + } xen_unplug_emulated_devices(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 9ef0a5cca96e..96521b1874ac 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -20,6 +20,7 @@ #include <asm/pvclock.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <asm/xen/cpuid.h> #include <xen/events.h> #include <xen/features.h> @@ -29,7 +30,7 @@ #include "xen-ops.h" /* Minimum amount of time until next clock event fires */ -#define TIMER_SLOP 100000 +#define TIMER_SLOP 1 static u64 xen_sched_clock_offset __read_mostly; @@ -60,9 +61,16 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs) return xen_clocksource_read(); } -static u64 xen_sched_clock(void) +static noinstr u64 xen_sched_clock(void) { - return xen_clocksource_read() - xen_sched_clock_offset; + struct pvclock_vcpu_time_info *src; + u64 ret; + + src = &__this_cpu_read(xen_vcpu)->time; + ret = pvclock_clocksource_read_nowd(src); + ret -= xen_sched_clock_offset; + + return ret; } static void xen_read_wallclock(struct timespec64 *ts) @@ -474,15 +482,47 @@ static void xen_setup_vsyscall_time_info(void) xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; } +/* + * Check if it is possible to safely use the tsc as a clocksource. This is + * only true if the hypervisor notifies the guest that its tsc is invariant, + * the tsc is stable, and the tsc instruction will never be emulated. + */ +static int __init xen_tsc_safe_clocksource(void) +{ + u32 eax, ebx, ecx, edx; + + if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC))) + return 0; + + if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC))) + return 0; + + if (check_tsc_unstable()) + return 0; + + /* Leaf 4, sub-leaf 0 (0x40000x03) */ + cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx); + + return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE; +} + static void __init xen_time_init(void) { struct pvclock_vcpu_time_info *pvti; int cpu = smp_processor_id(); struct timespec64 tp; - /* As Dom0 is never moved, no penalty on using TSC there */ + /* + * As Dom0 is never moved, no penalty on using TSC there. + * + * If it is possible for the guest to determine that the tsc is a safe + * clocksource, then set xen_clocksource rating below that of the tsc + * so that the system prefers tsc instead. + */ if (xen_initial_domain()) xen_clocksource.rating = 275; + else if (xen_tsc_safe_clocksource()) + xen_clocksource.rating = 299; clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 14ea32e734d5..f7547807b0bd 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -2,17 +2,15 @@ #include <linux/screen_info.h> #include <linux/init.h> -#include <asm/bootparam.h> #include <asm/setup.h> #include <xen/interface/xen.h> #include "xen-ops.h" -void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) +void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size, + struct screen_info *screen_info) { - struct screen_info *screen_info = &boot_params.screen_info; - /* This is drawn from a dump from vgacon:startup in * standard Linux. */ screen_info->orig_video_mode = 3; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 6b4fdf6b9542..461bb1526502 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -20,15 +20,39 @@ #include <linux/init.h> #include <linux/linkage.h> +#include <linux/objtool.h> #include <../entry/calling.h> .pushsection .noinstr.text, "ax" /* + * PV hypercall interface to the hypervisor. + * + * Called via inline asm(), so better preserve %rcx and %r11. + * + * Input: + * %eax: hypercall number + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %rax + */ +SYM_FUNC_START(xen_hypercall_pv) + ANNOTATE_NOENDBR + push %rcx + push %r11 + UNWIND_HINT_SAVE + syscall + UNWIND_HINT_RESTORE + pop %r11 + pop %rcx + RET +SYM_FUNC_END(xen_hypercall_pv) + +/* * Disabling events is simply a matter of making the event mask * non-zero. */ SYM_FUNC_START(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + ENDBR + movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) RET SYM_FUNC_END(xen_irq_disable_direct) @@ -67,9 +91,10 @@ SYM_FUNC_END(check_events) * then enter the hypervisor to get them handled. */ SYM_FUNC_START(xen_irq_enable_direct) + ENDBR FRAME_BEGIN /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) /* * Preempt here doesn't matter because that will deal with any @@ -78,7 +103,7 @@ SYM_FUNC_START(xen_irq_enable_direct) */ /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_pending) jz 1f call check_events @@ -97,13 +122,15 @@ SYM_FUNC_END(xen_irq_enable_direct) * x86 use opposite senses (mask vs enable). */ SYM_FUNC_START(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + ENDBR + testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) setz %ah addb %ah, %ah RET SYM_FUNC_END(xen_save_fl_direct) SYM_FUNC_START(xen_read_cr2) + ENDBR FRAME_BEGIN _ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX _ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX @@ -112,8 +139,9 @@ SYM_FUNC_START(xen_read_cr2) SYM_FUNC_END(xen_read_cr2); SYM_FUNC_START(xen_read_cr2_direct) + ENDBR FRAME_BEGIN - _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX + _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX FRAME_END RET SYM_FUNC_END(xen_read_cr2_direct); @@ -148,7 +176,7 @@ xen_pv_trap asm_exc_page_fault xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap asm_exc_alignment_check -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET xen_pv_trap asm_exc_control_protection #endif #ifdef CONFIG_X86_MCE @@ -156,7 +184,7 @@ xen_pv_trap asm_xenpv_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION -xen_pv_trap entry_INT80_compat +xen_pv_trap asm_int80_emulation #endif xen_pv_trap asm_exc_xen_unknown_trap xen_pv_trap asm_exc_xen_hypervisor_callback @@ -165,7 +193,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback SYM_CODE_START(xen_early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ENDBR pop %rcx pop %r11 @@ -176,7 +204,6 @@ SYM_CODE_START(xen_early_idt_handler_array) SYM_CODE_END(xen_early_idt_handler_array) __FINIT -hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 /* * Xen64 iret frame: * @@ -186,17 +213,26 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 * cs * rip <-- standard iret frame * - * flags + * flags <-- xen_iret must push from here on * - * rcx } - * r11 }<-- pushed by hypercall page - * rsp->rax } + * rcx + * r11 + * rsp->rax */ +.macro xen_hypercall_iret + pushq $0 /* Flags */ + push %rcx + push %r11 + push %rax + mov $__HYPERVISOR_iret, %eax + syscall /* Do the IRET. */ + ud2 /* The SYSCALL should never return. */ +.endm + SYM_CODE_START(xen_iret) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR - pushq $0 - jmp hypercall_iret + xen_hypercall_iret SYM_CODE_END(xen_iret) /* @@ -262,10 +298,10 @@ SYM_CODE_START(xen_entry_SYSCALL_compat) /* * Neither Xen nor the kernel really knows what the old SS and - * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * CS were. The kernel expects __USER_DS and __USER32_CS, so * report those values even though Xen will guess its own values. */ - movq $__USER32_DS, 4*8(%rsp) + movq $__USER_DS, 4*8(%rsp) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSCALL_compat_after_hwframe @@ -284,10 +320,10 @@ SYM_CODE_START(xen_entry_SYSENTER_compat) /* * Neither Xen nor the kernel really knows what the old SS and - * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * CS were. The kernel expects __USER_DS and __USER32_CS, so * report those values even though Xen will guess its own values. */ - movq $__USER32_DS, 4*8(%rsp) + movq $__USER_DS, 4*8(%rsp) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSENTER_compat_after_hwframe @@ -301,8 +337,7 @@ SYM_CODE_START(xen_entry_SYSENTER_compat) ENDBR lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $0 - jmp hypercall_iret + xen_hypercall_iret SYM_CODE_END(xen_entry_SYSENTER_compat) SYM_CODE_END(xen_entry_SYSCALL_compat) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index ffaa62167f6e..5dad6c51cdc3 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -6,9 +6,11 @@ #include <linux/elfnote.h> #include <linux/init.h> +#include <linux/instrumentation.h> #include <asm/boot.h> #include <asm/asm.h> +#include <asm/frame.h> #include <asm/msr.h> #include <asm/page_types.h> #include <asm/percpu.h> @@ -20,47 +22,23 @@ #include <xen/interface/xen-mca.h> #include <asm/xen/interface.h> -.pushsection .noinstr.text, "ax" - .balign PAGE_SIZE -SYM_CODE_START(hypercall_page) - .rept (PAGE_SIZE / 32) - UNWIND_HINT_FUNC - ANNOTATE_NOENDBR - ANNOTATE_UNRET_SAFE - ret - /* - * Xen will write the hypercall page, and sort out ENDBR. - */ - .skip 31, 0xcc - .endr - -#define HYPERCALL(n) \ - .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \ - .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32 -#include <asm/xen-hypercalls.h> -#undef HYPERCALL -SYM_CODE_END(hypercall_page) -.popsection - #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR cld - mov initial_stack(%rip), %rsp + leaq __top_init_kernel_stack(%rip), %rsp - /* Set up %gs. - * - * The base of %gs always points to fixed_percpu_data. If the - * stack protector canary is enabled, it is located at %gs:40. + /* + * Set up GSBASE. * Note that, on SMP, the boot cpu uses init data section until * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax - cdq + xorl %eax, %eax + xorl %edx, %edx wrmsr mov %rsi, %rdi @@ -71,42 +49,132 @@ SYM_CODE_END(startup_xen) #ifdef CONFIG_XEN_PV_SMP .pushsection .text SYM_CODE_START(asm_cpu_bringup_and_idle) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ENDBR call cpu_bringup_and_idle SYM_CODE_END(asm_cpu_bringup_and_idle) + +SYM_CODE_START(xen_cpu_bringup_again) + UNWIND_HINT_FUNC + mov %rdi, %rsp + UNWIND_HINT_REGS + call cpu_bringup_and_idle +SYM_CODE_END(xen_cpu_bringup_again) .popsection #endif #endif + .pushsection .noinstr.text, "ax" +/* + * Xen hypercall interface to the hypervisor. + * + * Input: + * %eax: hypercall number + * 32-bit: + * %ebx, %ecx, %edx, %esi, %edi: args 1..5 for the hypercall + * 64-bit: + * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall + * Output: %[er]ax + */ +SYM_FUNC_START(xen_hypercall_hvm) + ENDBR + FRAME_BEGIN + /* Save all relevant registers (caller save and arguments). */ +#ifdef CONFIG_X86_32 + push %eax + push %ebx + push %ecx + push %edx + push %esi + push %edi +#else + push %rax + push %rcx + push %rdx + push %rdi + push %rsi + push %r11 + push %r10 + push %r9 + push %r8 +#endif + /* Set the vendor specific function. */ + call __xen_hypercall_setfunc + /* Set ZF = 1 if AMD, Restore saved registers. */ +#ifdef CONFIG_X86_32 + lea xen_hypercall_amd, %ebx + cmp %eax, %ebx + pop %edi + pop %esi + pop %edx + pop %ecx + pop %ebx + pop %eax +#else + lea xen_hypercall_amd(%rip), %rcx + cmp %rax, %rcx + pop %r8 + pop %r9 + pop %r10 + pop %r11 + pop %rsi + pop %rdi + pop %rdx + pop %rcx + pop %rax +#endif + FRAME_END + /* Use correct hypercall function. */ + jz xen_hypercall_amd + jmp xen_hypercall_intel +SYM_FUNC_END(xen_hypercall_hvm) + +SYM_FUNC_START(xen_hypercall_amd) + ANNOTATE_NOENDBR + vmmcall + RET +SYM_FUNC_END(xen_hypercall_amd) + +SYM_FUNC_START(xen_hypercall_intel) + ANNOTATE_NOENDBR + vmcall + RET +SYM_FUNC_END(xen_hypercall_intel) + .popsection + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") -#ifdef CONFIG_X86_32 - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) -#else +#ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) /* Map the p2m table to a 512GB-aligned user address. */ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) -#endif -#ifdef CONFIG_XEN_PV - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) -#endif - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, - .ascii "!writable_page_tables|pae_pgdir_above_4gb") - ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, - .long (1 << XENFEAT_writable_page_tables) | \ - (1 << XENFEAT_dom0) | \ - (1 << XENFEAT_linux_rsdp_unrestricted)) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .globl xen_elfnote_entry; + xen_elfnote_entry: _ASM_PTR xen_elfnote_entry_value - .) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") - ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) - ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) +# define FEATURES_PV (1 << XENFEAT_writable_page_tables) +#else +# define FEATURES_PV 0 +#endif +#ifdef CONFIG_XEN_PVH +# define FEATURES_PVH (1 << XENFEAT_linux_rsdp_unrestricted) +#else +# define FEATURES_PVH 0 +#endif +#ifdef CONFIG_XEN_DOM0 +# define FEATURES_DOM0 (1 << XENFEAT_dom0) +#else +# define FEATURES_DOM0 0 +#endif + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, + .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0) + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) #endif /*CONFIG_XEN */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a8bb972193d..090349baec09 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -5,8 +5,15 @@ #include <linux/init.h> #include <linux/clocksource.h> #include <linux/irqreturn.h> +#include <linux/linkage.h> + +#include <xen/interface/xenpmu.h> #include <xen/xen-ops.h> +#include <asm/page.h> + +#include <trace/events/xen.h> + /* These are code, but not functions. Defined in entry.S */ extern const char xen_failsafe_callback[]; @@ -21,16 +28,13 @@ extern void *xen_initial_gdt; struct trap_info; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); -DECLARE_PER_CPU(unsigned long, xen_current_cr3); extern struct start_info *xen_start_info; extern struct shared_info xen_dummy_shared_info; extern struct shared_info *HYPERVISOR_shared_info; -extern bool xen_fifo_events; - void xen_setup_mfn_list_list(void); void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); @@ -43,8 +47,12 @@ void xen_mm_unpin_all(void); #ifdef CONFIG_X86_64 void __init xen_relocate_p2m(void); #endif +void __init xen_do_remap_nonram(void); +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, + unsigned long size); -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); @@ -72,8 +80,6 @@ void xen_restore_time_memory_area(void); void xen_init_time_ops(void); void xen_hvm_init_time_ops(void); -irqreturn_t xen_debug_interrupt(int irq, void *dev_id); - bool xen_vcpu_stolen(int vcpu); void xen_vcpu_setup(int cpu); @@ -108,11 +114,12 @@ static inline void xen_uninit_lock_cpu(int cpu) struct dom0_vga_console_info; -#ifdef CONFIG_XEN_PV_DOM0 -void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +#ifdef CONFIG_XEN_DOM0 +void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size, + struct screen_info *); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, - size_t size) + size_t size, struct screen_info *si) { } #endif @@ -147,9 +154,12 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), void xen_pin_vcpu(int cpu); void xen_emergency_restart(void); +void xen_force_evtchn_callback(void); + #ifdef CONFIG_XEN_PV void xen_pv_pre_suspend(void); void xen_pv_post_suspend(int suspend_cancelled); +void xen_start_kernel(struct start_info *si); #else static inline void xen_pv_pre_suspend(void) {} static inline void xen_pv_post_suspend(int suspend_cancelled) {} @@ -161,4 +171,164 @@ void xen_hvm_post_suspend(int suspend_cancelled); static inline void xen_hvm_post_suspend(int suspend_cancelled) {} #endif +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + +void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns); + +struct dentry * __init xen_init_debugfs(void); + +enum pt_level { + PT_PGD, + PT_P4D, + PT_PUD, + PT_PMD, + PT_PTE +}; + +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); +unsigned long xen_read_cr2_direct(void); +void xen_init_mmu_ops(void); +void xen_hvm_init_mmu_ops(void); + +/* Multicalls */ +struct multicall_space +{ + struct multicall_entry *mc; + void *args; +}; + +/* Allocate room for a multicall and its args */ +struct multicall_space __xen_mc_entry(size_t args); + +DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); + +/* Call to start a batch of multiple __xen_mc_entry()s. Must be + paired with xen_mc_issue() */ +static inline void xen_mc_batch(void) +{ + unsigned long flags; + + /* need to disable interrupts until this entry is complete */ + local_irq_save(flags); + trace_xen_mc_batch(xen_get_lazy_mode()); + __this_cpu_write(xen_mc_irq_flags, flags); +} + +static inline struct multicall_space xen_mc_entry(size_t args) +{ + xen_mc_batch(); + return __xen_mc_entry(args); +} + +/* Flush all pending multicalls */ +void xen_mc_flush(void); + +/* Issue a multicall if we're not in a lazy mode */ +static inline void xen_mc_issue(unsigned mode) +{ + trace_xen_mc_issue(mode); + + if ((xen_get_lazy_mode() & mode) == 0) + xen_mc_flush(); + + /* restore flags saved in xen_mc_batch */ + local_irq_restore(this_cpu_read(xen_mc_irq_flags)); +} + +/* Set up a callback to be called when the current batch is flushed */ +void xen_mc_callback(void (*fn)(void *), void *data); + +/* + * Try to extend the arguments of the previous multicall command. The + * previous command's op must match. If it does, then it attempts to + * extend the argument space allocated to the multicall entry by + * arg_size bytes. + * + * The returned multicall_space will return with mc pointing to the + * command on success, or NULL on failure, and args pointing to the + * newly allocated space. + */ +struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); + +extern bool is_xen_pmu; + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +#ifdef CONFIG_XEN_HAVE_VPMU +void xen_pmu_init(int cpu); +void xen_pmu_finish(int cpu); +#else +static inline void xen_pmu_init(int cpu) {} +static inline void xen_pmu_finish(int cpu) {} +#endif +bool pmu_msr_chk_emulated(u32 msr, u64 *val, bool is_read); +int pmu_apic_update(uint32_t reg); +u64 xen_read_pmc(int counter); + +#ifdef CONFIG_SMP + +void asm_cpu_bringup_and_idle(void); +asmlinkage void cpu_bringup_and_idle(void); + +extern void xen_send_IPI_mask(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector); +extern void xen_send_IPI_allbutself(int vector); +extern void xen_send_IPI_all(int vector); +extern void xen_send_IPI_self(int vector); + +extern int xen_smp_intr_init(unsigned int cpu); +extern void xen_smp_intr_free(unsigned int cpu); +int xen_smp_intr_init_pv(unsigned int cpu); +void xen_smp_intr_free_pv(unsigned int cpu); + +void xen_smp_count_cpus(void); +void xen_smp_cpus_done(unsigned int max_cpus); + +void xen_smp_send_reschedule(int cpu); +void xen_smp_send_call_function_ipi(const struct cpumask *mask); +void xen_smp_send_call_function_single_ipi(int cpu); + +void __noreturn xen_cpu_bringup_again(unsigned long stack); + +struct xen_common_irq { + int irq; + char *name; +}; +#else /* CONFIG_SMP */ + +static inline int xen_smp_intr_init(unsigned int cpu) +{ + return 0; +} +static inline void xen_smp_intr_free(unsigned int cpu) {} + +static inline int xen_smp_intr_init_pv(unsigned int cpu) +{ + return 0; +} +static inline void xen_smp_intr_free_pv(unsigned int cpu) {} +static inline void xen_smp_count_cpus(void) { } +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_XEN_PV +void xen_hypercall_pv(void); +#endif +void xen_hypercall_hvm(void); +void xen_hypercall_amd(void); +void xen_hypercall_intel(void); +void xen_hypercall_setfunc(void); +void *__xen_hypercall_setfunc(void); + #endif /* XEN_OPS_H */ |