diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig | 11 | ||||
-rw-r--r-- | arch/x86/xen/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/xen/apic.c | 116 | ||||
-rw-r--r-- | arch/x86/xen/efi.c | 4 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 42 | ||||
-rw-r--r-- | arch/x86/xen/enlighten_hvm.c | 47 | ||||
-rw-r--r-- | arch/x86/xen/enlighten_pv.c | 235 | ||||
-rw-r--r-- | arch/x86/xen/enlighten_pvh.c | 82 | ||||
-rw-r--r-- | arch/x86/xen/irq.c | 4 | ||||
-rw-r--r-- | arch/x86/xen/mmu_pv.c | 121 | ||||
-rw-r--r-- | arch/x86/xen/multicalls.h | 4 | ||||
-rw-r--r-- | arch/x86/xen/p2m.c | 5 | ||||
-rw-r--r-- | arch/x86/xen/pmu.c | 71 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 112 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 38 | ||||
-rw-r--r-- | arch/x86/xen/smp.h | 8 | ||||
-rw-r--r-- | arch/x86/xen/smp_hvm.c | 16 | ||||
-rw-r--r-- | arch/x86/xen/smp_pv.c | 159 | ||||
-rw-r--r-- | arch/x86/xen/spinlock.c | 6 | ||||
-rw-r--r-- | arch/x86/xen/suspend_hvm.c | 10 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 46 | ||||
-rw-r--r-- | arch/x86/xen/vga.c | 6 | ||||
-rw-r--r-- | arch/x86/xen/xen-asm.S | 26 | ||||
-rw-r--r-- | arch/x86/xen/xen-head.S | 50 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 28 |
25 files changed, 766 insertions, 483 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 85246dd9faa1..77e788e928cd 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -9,6 +9,7 @@ config XEN select PARAVIRT_CLOCK select X86_HV_CALLBACK_VECTOR depends on X86_64 || (X86_32 && X86_PAE) + depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8) depends on X86_LOCAL_APIC && X86_TSC help This is the Linux Xen port. Enabling this will allow the @@ -80,7 +81,6 @@ config XEN_PVH bool "Xen PVH guest support" depends on XEN && XEN_PVHVM && ACPI select PVH - def_bool n help Support for running as a Xen PVH guest. @@ -92,3 +92,12 @@ config XEN_DOM0 select X86_X2APIC if XEN_PVH && X86_64 help Support running as a Xen Dom0 guest. + +config XEN_PV_MSR_SAFE + bool "Always use safe MSR accesses in PV guests" + default y + depends on XEN_PV + help + Use safe (not faulting) MSR access functions even if the MSR access + should not fault anyway. + The default can be changed by using the "xen_msr_safe" boot parameter. diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 3c5b52fbe4a7..a9ec8c9f5c5d 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -45,6 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o -obj-$(CONFIG_XEN_PV_DOM0) += vga.o +obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_XEN_EFI) += efi.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 62d34b6611c5..8b045dd25196 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -33,13 +33,7 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) return 0xfd; } -static u32 xen_set_apic_id(unsigned int x) -{ - WARN_ON(1); - return x; -} - -static unsigned int xen_get_apic_id(unsigned long x) +static u32 xen_get_apic_id(u32 x) { return ((x)>>24) & 0xFFu; } @@ -49,20 +43,20 @@ static u32 xen_apic_read(u32 reg) struct xen_platform_op op = { .cmd = XENPF_get_cpuinfo, .interface_version = XENPF_INTERFACE_VERSION, - .u.pcpu_info.xen_cpuid = 0, }; - int ret; - - /* Shouldn't need this as APIC is turned off for PV, and we only - * get called on the bootup processor. But just in case. */ - if (!xen_initial_domain() || smp_processor_id()) - return 0; + int ret, cpu; if (reg == APIC_LVR) return 0x14; if (reg != APIC_ID) return 0; + cpu = smp_processor_id(); + if (!xen_initial_domain()) + return cpu ? cpuid_to_apicid[cpu] << 24 : 0; + + op.u.pcpu_info.xen_cpuid = cpu; + ret = HYPERVISOR_platform_op(&op); if (ret) op.u.pcpu_info.apic_id = BAD_APICID; @@ -81,6 +75,11 @@ static void xen_apic_write(u32 reg, u32 val) WARN(1,"register: %x, value: %x\n", reg, val); } +static void xen_apic_eoi(void) +{ + WARN_ON_ONCE(1); +} + static u64 xen_apic_icr_read(void) { return 0; @@ -92,11 +91,6 @@ static void xen_apic_icr_write(u32 low, u32 id) WARN_ON(1); } -static u32 xen_safe_apic_wait_icr_idle(void) -{ - return 0; -} - static int xen_apic_probe_pv(void) { if (xen_pv_domain()) @@ -110,99 +104,47 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id) return xen_pv_domain(); } -static int xen_id_always_valid(u32 apicid) -{ - return 1; -} - -static int xen_id_always_registered(void) -{ - return 1; -} - -static int xen_phys_pkg_id(int initial_apic_id, int index_msb) -{ - return initial_apic_id >> index_msb; -} - -static void xen_noop(void) -{ -} - -static void xen_silent_inquire(int apicid) -{ -} - -static int xen_cpu_present_to_apicid(int cpu) +static u32 xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) - return cpu_data(cpu).apicid; + return cpu_data(cpu).topo.apicid; else return BAD_APICID; } -static struct apic xen_pv_apic = { - .name = "Xen PV", - .probe = xen_apic_probe_pv, +static struct apic xen_pv_apic __ro_after_init = { + .name = "Xen PV", + .probe = xen_apic_probe_pv, .acpi_madt_oem_check = xen_madt_oem_check, - .apic_id_valid = xen_id_always_valid, - .apic_id_registered = xen_id_always_registered, /* .delivery_mode and .dest_mode_logical not used by XENPV */ .disable_esr = 0, - .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */ - .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */ - .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */ - .setup_apic_routing = NULL, .cpu_present_to_apicid = xen_cpu_present_to_apicid, - .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */ - .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */ - .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */ - .get_apic_id = xen_get_apic_id, - .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */ + .max_apic_id = UINT_MAX, + .get_apic_id = xen_get_apic_id, .calc_dest_apicid = apic_flat_calc_apicid, #ifdef CONFIG_SMP - .send_IPI_mask = xen_send_IPI_mask, - .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, - .send_IPI_allbutself = xen_send_IPI_allbutself, - .send_IPI_all = xen_send_IPI_all, - .send_IPI_self = xen_send_IPI_self, + .send_IPI_mask = xen_send_IPI_mask, + .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself, + .send_IPI_allbutself = xen_send_IPI_allbutself, + .send_IPI_all = xen_send_IPI_all, + .send_IPI_self = xen_send_IPI_self, #endif - /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */ - .inquire_remote_apic = xen_silent_inquire, - .read = xen_apic_read, .write = xen_apic_write, - .eoi_write = xen_apic_write, + .eoi = xen_apic_eoi, - .icr_read = xen_apic_icr_read, - .icr_write = xen_apic_icr_write, - .wait_icr_idle = xen_noop, - .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, }; +apic_driver(xen_pv_apic); -static void __init xen_apic_check(void) -{ - if (apic == &xen_pv_apic) - return; - - pr_info("Switched APIC routing from %s to %s.\n", apic->name, - xen_pv_apic.name); - apic = &xen_pv_apic; -} void __init xen_init_apic(void) { x86_apic_ops.io_apic_read = xen_io_apic_read; - /* On PV guests the APIC CPUID bit is disabled so none of the - * routines end up executing. */ - if (!xen_initial_domain()) - apic = &xen_pv_apic; - - x86_platform.apic_post_init = xen_apic_check; } -apic_driver(xen_pv_apic); diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 7d7ffb9c826a..7250d0e0e1a9 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -16,6 +16,8 @@ #include <asm/setup.h> #include <asm/xen/hypercall.h> +#include "xen-ops.h" + static efi_char16_t vendor[100] __initdata; static efi_system_table_t efi_systab_xen __initdata = { @@ -136,7 +138,7 @@ void __init xen_efi_init(struct boot_params *boot_params) if (efi_systab_xen == NULL) return; - strncpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen", + strscpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen", sizeof(boot_params->efi_info.efi_loader_signature)); boot_params->efi_info.efi_systab = (__u32)__pa(efi_systab_xen); boot_params->efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 30c6e986a6cd..a01ca255b0c6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -6,6 +6,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/kexec.h> +#include <linux/memblock.h> #include <linux/slab.h> #include <linux/panic_notifier.h> @@ -32,10 +33,13 @@ EXPORT_SYMBOL_GPL(hypercall_page); * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info * and xen_vcpu_setup for details. By default it points to share_info->vcpu_info * but during boot it is switched to point to xen_vcpu_info. - * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events. + * The pointer is used in xen_evtchn_do_upcall to acknowledge pending events. + * Make sure that xen_vcpu_info doesn't cross a page boundary by making it + * cache-line aligned (the struct is guaranteed to have a size of 64 bytes, + * which matches the cache line size of 64-bit x86 processors). */ DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); -DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); /* Linux <-> Xen vCPU id mapping */ DEFINE_PER_CPU(uint32_t, xen_vcpu_id); @@ -51,7 +55,7 @@ EXPORT_SYMBOL_GPL(xen_start_info); struct shared_info xen_dummy_shared_info; -__read_mostly int xen_have_vector_callback; +__read_mostly bool xen_have_vector_callback = true; EXPORT_SYMBOL_GPL(xen_have_vector_callback); /* @@ -160,6 +164,7 @@ void xen_vcpu_setup(int cpu) int err; struct vcpu_info *vcpup; + BUILD_BUG_ON(sizeof(*vcpup) > SMP_CACHE_BYTES); BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); /* @@ -346,3 +351,34 @@ void xen_arch_unregister_cpu(int num) } EXPORT_SYMBOL(xen_arch_unregister_cpu); #endif + +/* Amount of extra memory space we add to the e820 ranges */ +struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; + +void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns) +{ + unsigned int i; + + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { + /* Add new region. */ + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; + break; + } + /* Append to existing region. */ + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; + break; + } + } + if (i == XEN_EXTRA_MEM_MAX_REGIONS) + printk(KERN_WARNING "Warning: not enough extra memory regions\n"); + + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); +} diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 8b71b1dd7639..c001a2296582 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -4,9 +4,12 @@ #include <linux/cpu.h> #include <linux/kexec.h> #include <linux/memblock.h> +#include <linux/virtio_anchor.h> #include <xen/features.h> #include <xen/events.h> +#include <xen/hvm.h> +#include <xen/interface/hvm/hvm_op.h> #include <xen/interface/memory.h> #include <asm/apic.h> @@ -30,6 +33,9 @@ static unsigned long shared_info_pfn; +__ro_after_init bool xen_percpu_upcall; +EXPORT_SYMBOL_GPL(xen_percpu_upcall); + void xen_hvm_init_shared_info(void) { struct xen_add_to_physmap xatp; @@ -125,9 +131,12 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback) { struct pt_regs *old_regs = set_irq_regs(regs); + if (xen_percpu_upcall) + apic_eoi(); + inc_irq_stat(irq_hv_callback_count); - xen_hvm_evtchn_do_upcall(); + xen_evtchn_do_upcall(); set_irq_regs(old_regs); } @@ -139,7 +148,9 @@ static void xen_hvm_shutdown(void) if (kexec_in_progress) xen_reboot(SHUTDOWN_soft_reset); } +#endif +#ifdef CONFIG_CRASH_DUMP static void xen_hvm_crash_shutdown(struct pt_regs *regs) { native_machine_crash_shutdown(regs); @@ -152,15 +163,14 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) int rc = 0; /* - * This can happen if CPU was offlined earlier and - * offlining timed out in common_cpu_die(). + * If a CPU was offlined earlier and offlining timed out then the + * lock mechanism is still initialized. Uninit it unconditionally + * as it's safe to call even if already uninited. Interrupts and + * timer have already been handled in xen_cpu_dead_hvm(). */ - if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - } + xen_uninit_lock_cpu(cpu); - if (cpu_acpi_id(cpu) != U32_MAX) + if (cpu_acpi_id(cpu) != CPU_ACPIID_INVALID) per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); else per_cpu(xen_vcpu_id, cpu) = cpu; @@ -168,6 +178,15 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) if (!xen_have_vector_callback) return 0; + if (xen_percpu_upcall) { + rc = xen_set_upcall_vector(cpu); + if (rc) { + WARN(1, "HVMOP_set_evtchn_upcall_vector" + " for CPU %d failed: %d\n", cpu, rc); + return rc; + } + } + if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); @@ -188,14 +207,13 @@ static int xen_cpu_dead_hvm(unsigned int cpu) return 0; } -static bool no_vector_callback __initdata; - static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) return; - xen_set_restricted_virtio_memory_access(); + if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) + virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc); init_hvm_pv_info(); @@ -211,9 +229,6 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); - if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector)) - xen_have_vector_callback = 1; - xen_hvm_smp_init(); WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); xen_unplug_emulated_devices(); @@ -223,6 +238,8 @@ static void __init xen_hvm_guest_init(void) #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; +#endif +#ifdef CONFIG_CRASH_DUMP machine_ops.crash_shutdown = xen_hvm_crash_shutdown; #endif } @@ -239,7 +256,7 @@ early_param("xen_nopv", xen_parse_nopv); static __init int xen_parse_no_vector_callback(char *arg) { - no_vector_callback = true; + xen_have_vector_callback = false; return 0; } early_param("xen_no_vector_callback", xen_parse_no_vector_callback); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 70fb2ea85e90..ace2eb054053 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -23,6 +23,7 @@ #include <linux/start_kernel.h> #include <linux/sched.h> #include <linux/kprobes.h> +#include <linux/kstrtox.h> #include <linux/memblock.h> #include <linux/export.h> #include <linux/mm.h> @@ -31,6 +32,8 @@ #include <linux/gfp.h> #include <linux/edd.h> #include <linux/reboot.h> +#include <linux/virtio_anchor.h> +#include <linux/stackprotector.h> #include <xen/xen.h> #include <xen/events.h> @@ -63,9 +66,9 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include <asm/reboot.h> -#include <asm/stackprotector.h> #include <asm/hypervisor.h> #include <asm/mach_traps.h> +#include <asm/mtrr.h> #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/cpu.h> @@ -76,7 +79,7 @@ #ifdef CONFIG_ACPI #include <linux/acpi.h> #include <asm/acpi.h> -#include <acpi/pdc_intel.h> +#include <acpi/proc_cap_intel.h> #include <acpi/processor.h> #include <xen/interface/platform.h> #endif @@ -98,6 +101,17 @@ struct tls_descs { struct desc_struct desc[3]; }; +DEFINE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode) = XEN_LAZY_NONE; +DEFINE_PER_CPU(unsigned int, xen_lazy_nesting); + +enum xen_lazy_mode xen_get_lazy_mode(void) +{ + if (in_interrupt()) + return XEN_LAZY_NONE; + + return this_cpu_read(xen_lazy_mode); +} + /* * Updating the 3 TLS descriptors in the GDT on every task switch is * surprisingly expensive so we avoid updating them if they haven't @@ -107,9 +121,69 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE); + +static int __init parse_xen_msr_safe(char *str) +{ + if (str) + return kstrtobool(str, &xen_msr_safe); + return -EINVAL; +} +early_param("xen_msr_safe", parse_xen_msr_safe); + +/* Get MTRR settings from Xen and put them into mtrr_state. */ +static void __init xen_set_mtrr_data(void) +{ +#ifdef CONFIG_MTRR + struct xen_platform_op op = { + .cmd = XENPF_read_memtype, + .interface_version = XENPF_INTERFACE_VERSION, + }; + unsigned int reg; + unsigned long mask; + uint32_t eax, width; + static struct mtrr_var_range var[MTRR_MAX_VAR_RANGES] __initdata; + + /* Get physical address width (only 64-bit cpus supported). */ + width = 36; + eax = cpuid_eax(0x80000000); + if ((eax >> 16) == 0x8000 && eax >= 0x80000008) { + eax = cpuid_eax(0x80000008); + width = eax & 0xff; + } + + for (reg = 0; reg < MTRR_MAX_VAR_RANGES; reg++) { + op.u.read_memtype.reg = reg; + if (HYPERVISOR_platform_op(&op)) + break; + + /* + * Only called in dom0, which has all RAM PFNs mapped at + * RAM MFNs, and all PCI space etc. is identity mapped. + * This means we can treat MFN == PFN regarding MTRR settings. + */ + var[reg].base_lo = op.u.read_memtype.type; + var[reg].base_lo |= op.u.read_memtype.mfn << PAGE_SHIFT; + var[reg].base_hi = op.u.read_memtype.mfn >> (32 - PAGE_SHIFT); + mask = ~((op.u.read_memtype.nr_mfns << PAGE_SHIFT) - 1); + mask &= (1UL << width) - 1; + if (mask) + mask |= MTRR_PHYSMASK_V; + var[reg].mask_lo = mask; + var[reg].mask_hi = mask >> 32; + } + + /* Only overwrite MTRR state if any MTRR could be got from Xen. */ + if (reg) + mtrr_overwrite_state(var, reg, MTRR_TYPE_UNCACHABLE); +#endif +} + static void __init xen_pv_init_platform(void) { - xen_set_restricted_virtio_memory_access(); + /* PV guests can't operate virtio devices without grants. */ + if (IS_ENABLED(CONFIG_XEN_VIRTIO)) + virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc); populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); @@ -121,6 +195,14 @@ static void __init xen_pv_init_platform(void) /* pvclock is in shared info area */ xen_init_time_ops(); + + if (xen_initial_domain()) + xen_set_mtrr_data(); + else + mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK); + + /* Adjust nr_cpu_ids before "enumeration" happens */ + xen_smp_count_cpus(); } static void __init xen_pv_guest_late_init(void) @@ -220,17 +302,17 @@ static bool __init xen_check_mwait(void) native_cpuid(&ax, &bx, &cx, &dx); - /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + /* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so, * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. */ buf[0] = ACPI_PDC_REVISION_ID; buf[1] = 1; - buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP); set_xen_guest_handle(op.u.set_pminfo.pdc, buf); if ((HYPERVISOR_platform_op(&op) == 0) && - (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + (buf[2] & (ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH))) { cpuid_leaf5_ecx_val = cx; cpuid_leaf5_edx_val = dx; } @@ -262,6 +344,7 @@ static void __init xen_init_capabilities(void) setup_clear_cpu_cap(X86_FEATURE_ACC); setup_clear_cpu_cap(X86_FEATURE_X2APIC); setup_clear_cpu_cap(X86_FEATURE_SME); + setup_clear_cpu_cap(X86_FEATURE_LKGS); /* * Xen PV would need some work to support PCID: CR3 handling as well @@ -293,10 +376,25 @@ static noinstr unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } +static void xen_start_context_switch(struct task_struct *prev) +{ + BUG_ON(preemptible()); + + if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); + } + enter_lazy(XEN_LAZY_CPU); +} + static void xen_end_context_switch(struct task_struct *next) { + BUG_ON(preemptible()); + xen_mc_flush(); - paravirt_end_context_switch(next); + leave_lazy(XEN_LAZY_CPU); + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } static unsigned long xen_store_tr(void) @@ -403,7 +501,7 @@ static void xen_set_ldt(const void *addr, unsigned entries) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_load_gdt(const struct desc_ptr *dtr) @@ -454,7 +552,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG_ON(size > PAGE_SIZE); BUG_ON(va & ~PAGE_MASK); - pfn = virt_to_pfn(va); + pfn = virt_to_pfn((void *)va); mfn = pfn_to_mfn(pfn); pte = pfn_pte(pfn, PAGE_KERNEL_RO); @@ -499,7 +597,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * exception between the new %fs descriptor being loaded and * %fs being effectively cleared at __switch_to(). */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (xen_get_lazy_mode() == XEN_LAZY_CPU) loadsegment(fs, 0); xen_mc_batch(); @@ -508,7 +606,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) load_TLS_descriptor(t, cpu, 1); load_TLS_descriptor(t, cpu, 2); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_load_gs_index(unsigned int idx) @@ -609,7 +707,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_int3, false ), TRAP_ENTRY(exc_overflow, false ), #ifdef CONFIG_IA32_EMULATION - { entry_INT80_compat, xen_entry_INT80_compat, false }, + TRAP_ENTRY(int80_emulation, false ), #endif TRAP_ENTRY(exc_page_fault, false ), TRAP_ENTRY(exc_divide_error, false ), @@ -625,7 +723,7 @@ static struct trap_array_entry trap_array[] = { TRAP_ENTRY(exc_coprocessor_error, false ), TRAP_ENTRY(exc_alignment_check, false ), TRAP_ENTRY(exc_simd_coprocessor_error, false ), -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET TRAP_ENTRY(exc_control_protection, false ), #endif }; @@ -762,6 +860,7 @@ static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; + static const struct trap_info zero = { }; unsigned out; trace_xen_cpu_load_idt(desc); @@ -771,7 +870,7 @@ static void xen_load_idt(const struct desc_ptr *desc) memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); out = xen_convert_trap_info(desc, traps, false); - memset(&traps[out], 0, sizeof(traps[0])); + traps[out] = zero; xen_mc_flush(); if (HYPERVISOR_set_trap_table(traps)) @@ -839,7 +938,7 @@ static void xen_load_sp0(unsigned long sp0) mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } @@ -903,7 +1002,7 @@ static void xen_write_cr0(unsigned long cr0) MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); } static void xen_write_cr4(unsigned long cr4) @@ -913,14 +1012,18 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } -static u64 xen_read_msr_safe(unsigned int msr, int *err) +static u64 xen_do_read_msr(unsigned int msr, int *err) { - u64 val; + u64 val = 0; /* Avoid uninitialized value for safe variant. */ if (pmu_msr_read(msr, &val, err)) return val; - val = native_read_msr_safe(msr, err); + if (err) + val = native_read_msr_safe(msr, err); + else + val = native_read_msr(msr); + switch (msr) { case MSR_IA32_APICBASE: val &= ~X2APIC_ENABLE; @@ -929,23 +1032,39 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) return val; } -static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +static void set_seg(unsigned int which, unsigned int low, unsigned int high, + int *err) { - int ret; - unsigned int which; - u64 base; + u64 base = ((u64)high << 32) | low; - ret = 0; + if (HYPERVISOR_set_segment_base(which, base) == 0) + return; + + if (err) + *err = -EIO; + else + WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base); +} +/* + * Support write_msr_safe() and write_msr() semantics. + * With err == NULL write_msr() semantics are selected. + * Supplying an err pointer requires err to be pre-initialized with 0. + */ +static void xen_do_write_msr(unsigned int msr, unsigned int low, + unsigned int high, int *err) +{ switch (msr) { - case MSR_FS_BASE: which = SEGBASE_FS; goto set; - case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; - case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; - - set: - base = ((u64)high << 32) | low; - if (HYPERVISOR_set_segment_base(which, base) != 0) - ret = -EIO; + case MSR_FS_BASE: + set_seg(SEGBASE_FS, low, high, err); + break; + + case MSR_KERNEL_GS_BASE: + set_seg(SEGBASE_GS_USER, low, high, err); + break; + + case MSR_GS_BASE: + set_seg(SEGBASE_GS_KERNEL, low, high, err); break; case MSR_STAR: @@ -961,31 +1080,42 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) break; default: - if (!pmu_msr_write(msr, low, high, &ret)) - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, err)) { + if (err) + *err = native_write_msr_safe(msr, low, high); + else + native_write_msr(msr, low, high); + } } +} + +static u64 xen_read_msr_safe(unsigned int msr, int *err) +{ + return xen_do_read_msr(msr, err); +} + +static int xen_write_msr_safe(unsigned int msr, unsigned int low, + unsigned int high) +{ + int err = 0; + + xen_do_write_msr(msr, low, high, &err); - return ret; + return err; } static u64 xen_read_msr(unsigned int msr) { - /* - * This will silently swallow a #GP from RDMSR. It may be worth - * changing that. - */ int err; - return xen_read_msr_safe(msr, &err); + return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL); } static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) { - /* - * This will silently swallow a #GP from WRMSR. It may be worth - * changing that. - */ - xen_write_msr_safe(msr, low, high); + int err; + + xen_do_write_msr(msr, low, high, xen_msr_safe ? &err : NULL); } /* This is called once we have the cpu_possible_mask */ @@ -1022,7 +1152,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { .write_cr4 = xen_write_cr4, - .wbinvd = native_wbinvd, + .wbinvd = pv_native_wbinvd, .read_msr = xen_read_msr, .write_msr = xen_write_msr, @@ -1055,7 +1185,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = { #endif .io_delay = xen_io_delay, - .start_context_switch = paravirt_start_context_switch, + .start_context_switch = xen_start_context_switch, .end_context_switch = xen_end_context_switch, }, }; @@ -1164,7 +1294,7 @@ static void __init xen_setup_gdt(int cpu) pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot; pv_ops.cpu.load_gdt = xen_load_gdt_boot; - switch_to_new_gdt(cpu); + switch_gdt_and_percpu_base(cpu); pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry; pv_ops.cpu.load_gdt = xen_load_gdt; @@ -1220,10 +1350,12 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_vcpu_info_reset(0); x86_platform.get_nmi_reason = xen_get_nmi_reason; + x86_platform.realmode_reserve = x86_init_noop; + x86_platform.realmode_init = x86_init_noop; x86_init.resources.memory_setup = xen_memory_setup; x86_init.irqs.intr_mode_select = x86_init_noop; - x86_init.irqs.intr_mode_init = x86_init_noop; + x86_init.irqs.intr_mode_init = x86_64_probe_apic; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; x86_init.hyper.init_platform = xen_pv_init_platform; @@ -1263,12 +1395,10 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) xen_init_capabilities(); -#ifdef CONFIG_X86_LOCAL_APIC /* * set up the basic apic ops. */ xen_init_apic(); -#endif machine_ops = xen_machine_ops; @@ -1341,7 +1471,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si) x86_platform.set_legacy_features = xen_dom0_set_legacy_features; - xen_init_vga(info, xen_start_info->console.dom0.info_size); + xen_init_vga(info, xen_start_info->console.dom0.info_size, + &boot_params.screen_info); xen_start_info->console.domU.mfn = 0; xen_start_info->console.domU.evtchn = 0; diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c index bcae606bbc5c..27a2a02ef8fb 100644 --- a/arch/x86/xen/enlighten_pvh.c +++ b/arch/x86/xen/enlighten_pvh.c @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/acpi.h> #include <linux/export.h> +#include <linux/mm.h> #include <xen/hvc-console.h> +#include <asm/bootparam.h> #include <asm/io_apic.h> #include <asm/hypervisor.h> #include <asm/e820/api.h> @@ -43,6 +45,19 @@ void __init xen_pvh_init(struct boot_params *boot_params) x86_init.oem.banner = xen_banner; xen_efi_init(boot_params); + + if (xen_initial_domain()) { + struct xen_platform_op op = { + .cmd = XENPF_get_dom0_console, + }; + int ret = HYPERVISOR_platform_op(&op); + + if (ret > 0) + xen_init_vga(&op.u.dom0_console, + min(ret * sizeof(char), + sizeof(op.u.dom0_console)), + &boot_params->screen_info); + } } void __init mem_map_via_hcall(struct boot_params *boot_params_p) @@ -59,3 +74,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p) } boot_params_p->e820_entries = memmap.nr_entries; } + +/* + * Reserve e820 UNUSABLE regions to inflate the memory balloon. + * + * On PVH dom0 the host memory map is used, RAM regions available to dom0 are + * located as the same place as in the native memory map, but since dom0 gets + * less memory than the total amount of host RAM the ranges that can't be + * populated are converted from RAM -> UNUSABLE. Use such regions (up to the + * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at + * boot. Doing so prevents the guest (even if just temporary) from using holes + * in the memory map in order to map grants or foreign addresses, and + * hopefully limits the risk of a clash with a device MMIO region. Ideally the + * hypervisor should notify us which memory ranges are suitable for creating + * foreign mappings, but that's not yet implemented. + */ +void __init xen_reserve_extra_memory(struct boot_params *bootp) +{ + unsigned int i, ram_pages = 0, extra_pages; + + for (i = 0; i < bootp->e820_entries; i++) { + struct boot_e820_entry *e = &bootp->e820_table[i]; + + if (e->type != E820_TYPE_RAM) + continue; + ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr); + } + + /* Max amount of extra memory. */ + extra_pages = EXTRA_MEM_RATIO * ram_pages; + + /* + * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping + * purposes. + */ + for (i = 0; i < bootp->e820_entries && extra_pages; i++) { + struct boot_e820_entry *e = &bootp->e820_table[i]; + unsigned long pages; + + if (e->type != E820_TYPE_UNUSABLE) + continue; + + pages = min(extra_pages, + PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr)); + + if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) { + struct boot_e820_entry *next; + + if (bootp->e820_entries == + ARRAY_SIZE(bootp->e820_table)) + /* No space left to split - skip region. */ + continue; + + /* Split entry. */ + next = e + 1; + memmove(next, e, + (bootp->e820_entries - i) * sizeof(*e)); + bootp->e820_entries++; + next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages); + e->size = next->addr - e->addr; + next->size -= e->size; + } + e->type = E820_TYPE_RAM; + extra_pages -= pages; + + xen_add_extra_mem(PFN_UP(e->addr), pages); + } +} diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 06c3c2fb4b06..39982f955cfe 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -24,7 +24,7 @@ noinstr void xen_force_evtchn_callback(void) (void)HYPERVISOR_xen_version(0, NULL); } -static void xen_safe_halt(void) +static noinstr void xen_safe_halt(void) { /* Blocking includes an implicit local_irq_enable(). */ if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) @@ -45,7 +45,7 @@ static const typeof(pv_ops) xen_irq_ops __initconst = { /* Initial interrupt flag handling only called while interrupts off. */ .save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0), .irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop), - .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG), + .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func), .safe_halt = xen_safe_halt, .halt = xen_halt, diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index ee29fb558f2e..54e0d311dcc9 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -34,7 +34,7 @@ * would need to validate the whole pagetable before going on. * Naturally, this is quite slow. The solution is to "pin" a * pagetable, which enforces all the constraints on the pagetable even - * when it is not actively in use. This menas that Xen can be assured + * when it is not actively in use. This means that Xen can be assured * that it is still valid when you do load it into %cr3, and doesn't * need to revalidate it. * @@ -86,6 +86,22 @@ #include "mmu.h" #include "debugfs.h" +/* + * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order + * to avoid warnings with "-Wmissing-prototypes". + */ +pteval_t xen_pte_val(pte_t pte); +pgdval_t xen_pgd_val(pgd_t pgd); +pmdval_t xen_pmd_val(pmd_t pmd); +pudval_t xen_pud_val(pud_t pud); +p4dval_t xen_p4d_val(p4d_t p4d); +pte_t xen_make_pte(pteval_t pte); +pgd_t xen_make_pgd(pgdval_t pgd); +pmd_t xen_make_pmd(pmdval_t pmd); +pud_t xen_make_pud(pudval_t pud); +p4d_t xen_make_p4d(p4dval_t p4d); +pte_t xen_make_pte_init(pteval_t pte); + #ifdef CONFIG_X86_VSYSCALL_EMULATION /* l3 pud for userspace vsyscall mapping */ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; @@ -150,7 +166,7 @@ void make_lowmem_page_readwrite(void *vaddr) if (pte == NULL) return; /* vaddr missing */ - ptev = pte_mkwrite(*pte); + ptev = pte_mkwrite_novma(*pte); if (HYPERVISOR_update_va_mapping(address, ptev, 0)) BUG(); @@ -220,7 +236,7 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -254,7 +270,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) { struct mmu_update u; - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) + if (xen_get_lazy_mode() != XEN_LAZY_MMU) return false; xen_mc_batch(); @@ -263,7 +279,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) u.val = pte_val_ma(pteval); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); return true; } @@ -309,7 +325,7 @@ void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } /* Assume pteval_t is equivalent to all the other *val_t types. */ @@ -403,7 +419,7 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -483,7 +499,7 @@ static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) __xen_set_p4d_hyper(ptr, val); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -515,7 +531,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val) if (user_ptr) __xen_set_p4d_hyper((p4d_t *)user_ptr, val); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } #if CONFIG_PGTABLE_LEVELS >= 5 @@ -651,7 +667,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) spinlock_t *ptl = NULL; #if USE_SPLIT_PTE_PTLOCKS - ptl = ptlock_ptr(page); + ptl = ptlock_ptr(page_ptdesc(page)); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif @@ -885,14 +901,7 @@ void xen_mm_unpin_all(void) spin_unlock(&pgd_lock); } -static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) -{ - spin_lock(&next->page_table_lock); - xen_pgd_pin(next); - spin_unlock(&next->page_table_lock); -} - -static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static void xen_enter_mmap(struct mm_struct *mm) { spin_lock(&mm->page_table_lock); xen_pgd_pin(mm); @@ -904,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info) struct mm_struct *mm = info; if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) - leave_mm(smp_processor_id()); + leave_mm(); /* * If this cpu still has a stale cr3 reference, then make sure @@ -1050,7 +1059,7 @@ static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) pte_t *pte_tbl; int i; - if (pmd_large(*pmd)) { + if (pmd_leaf(*pmd)) { pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, PMD_SIZE); return; @@ -1073,7 +1082,7 @@ static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) pmd_t *pmd_tbl; int i; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, PUD_SIZE); return; @@ -1095,7 +1104,7 @@ static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) pud_t *pud_tbl; int i; - if (p4d_large(*p4d)) { + if (p4d_leaf(*p4d)) { pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, P4D_SIZE); return; @@ -1236,7 +1245,7 @@ static noinline void xen_flush_tlb(void) op->cmd = MMUEXT_TLB_FLUSH_LOCAL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -1256,7 +1265,7 @@ static void xen_flush_tlb_one_user(unsigned long addr) op->arg1.linear_addr = addr & PAGE_MASK; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -1293,7 +1302,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus, MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } static unsigned long xen_read_cr3(void) @@ -1352,7 +1361,7 @@ static void xen_write_cr3(unsigned long cr3) else __xen_write_cr3(false, 0); - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ + xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */ } /* @@ -1387,7 +1396,7 @@ static void __init xen_write_cr3_init(unsigned long cr3) __xen_write_cr3(true, cr3); - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ + xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */ } static int xen_pgd_alloc(struct mm_struct *mm) @@ -1548,7 +1557,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); } } @@ -1578,7 +1587,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level) __set_pfn_prot(pfn, PAGE_KERNEL); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); ClearPagePinned(page); } @@ -1795,7 +1804,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) */ xen_mc_batch(); __xen_write_cr3(true, __pa(init_top_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); + xen_mc_issue(XEN_LAZY_CPU); /* We can't that easily rip out L3 and L2, as the Xen pagetables are * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for @@ -1854,7 +1863,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) if (!pud_present(pud)) return 0; pa = pud_val(pud) & PTE_PFN_MASK; - if (pud_large(pud)) + if (pud_leaf(pud)) return pa + (vaddr & ~PUD_MASK); pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * @@ -1862,7 +1871,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) if (!pmd_present(pmd)) return 0; pa = pmd_val(pmd) & PTE_PFN_MASK; - if (pmd_large(pmd)) + if (pmd_leaf(pmd)) return pa + (vaddr & ~PMD_MASK); pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * @@ -2074,6 +2083,23 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } +static void xen_enter_lazy_mmu(void) +{ + enter_lazy(XEN_LAZY_MMU); +} + +static void xen_flush_lazy_mmu(void) +{ + preempt_disable(); + + if (xen_get_lazy_mode() == XEN_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + arch_enter_lazy_mmu_mode(); + } + + preempt_enable(); +} + static void __init xen_post_allocator_init(void) { pv_ops.mmu.set_pte = xen_set_pte; @@ -2098,7 +2124,7 @@ static void xen_leave_lazy_mmu(void) { preempt_disable(); xen_mc_flush(); - paravirt_leave_lazy_mmu(); + leave_lazy(XEN_LAZY_MMU); preempt_enable(); } @@ -2153,14 +2179,13 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { .make_p4d = PV_CALLEE_SAVE(xen_make_p4d), #endif - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, + .enter_mmap = xen_enter_mmap, .exit_mmap = xen_exit_mmap, .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, + .enter = xen_enter_lazy_mmu, .leave = xen_leave_lazy_mmu, - .flush = paravirt_flush_lazy_mmu, + .flush = xen_flush_lazy_mmu, }, .set_fixmap = xen_set_fixmap, @@ -2194,13 +2219,13 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, mcs = __xen_mc_entry(0); if (in_frames) - in_frames[i] = virt_to_mfn(vaddr); + in_frames[i] = virt_to_mfn((void *)vaddr); MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); - __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); + __set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY); if (out_frames) - out_frames[i] = virt_to_pfn(vaddr); + out_frames[i] = virt_to_pfn((void *)vaddr); } xen_mc_issue(0); } @@ -2242,7 +2267,7 @@ static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, MULTI_update_va_mapping(mcs.mc, vaddr, mfn_pte(mfn, PAGE_KERNEL), flags); - set_phys_to_machine(virt_to_pfn(vaddr), mfn); + set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn); } xen_mc_issue(0); @@ -2302,12 +2327,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, int success; unsigned long vstart = (unsigned long)phys_to_virt(pstart); - /* - * Currently an auto-translated guest will not perform I/O, nor will - * it require PAE page directories below 4GB. Therefore any calls to - * this function are redundant and can be ignored. - */ - if (unlikely(order > MAX_CONTIG_ORDER)) return -ENOMEM; @@ -2319,7 +2338,7 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, xen_zap_pfn_range(vstart, order, in_frames, NULL); /* 2. Get a new contiguous memory extent. */ - out_frame = virt_to_pfn(vstart); + out_frame = virt_to_pfn((void *)vstart); success = xen_exchange_memory(1UL << order, 0, in_frames, 1, order, &out_frame, address_bits); @@ -2352,7 +2371,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) spin_lock_irqsave(&xen_reservation_lock, flags); /* 1. Find start MFN of contiguous extent. */ - in_frame = virt_to_mfn(vstart); + in_frame = virt_to_mfn((void *)vstart); /* 2. Zap current PTEs. */ xen_zap_pfn_range(vstart, order, NULL, out_frames); @@ -2383,7 +2402,7 @@ static noinline void xen_flush_tlb_all(void) op->cmd = MMUEXT_TLB_FLUSH_ALL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - xen_mc_issue(PARAVIRT_LAZY_MMU); + xen_mc_issue(XEN_LAZY_MMU); preempt_enable(); } @@ -2501,7 +2520,7 @@ out: } EXPORT_SYMBOL_GPL(xen_remap_pfn); -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_VMCORE_INFO phys_addr_t paddr_vmcoreinfo_note(void) { if (xen_pv_domain()) diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index 1c51b2c87f30..c3867b585e0d 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -26,7 +26,7 @@ static inline void xen_mc_batch(void) /* need to disable interrupts until this entry is complete */ local_irq_save(flags); - trace_xen_mc_batch(paravirt_get_lazy_mode()); + trace_xen_mc_batch(xen_get_lazy_mode()); __this_cpu_write(xen_mc_irq_flags, flags); } @@ -44,7 +44,7 @@ static inline void xen_mc_issue(unsigned mode) { trace_xen_mc_issue(mode); - if ((paravirt_get_lazy_mode() & mode) == 0) + if ((xen_get_lazy_mode() & mode) == 0) xen_mc_flush(); /* restore flags saved in xen_mc_batch */ diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 58db86f7b384..9bdc3b656b2c 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -134,11 +134,6 @@ static inline unsigned p2m_mid_index(unsigned long pfn) return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } -static inline unsigned p2m_index(unsigned long pfn) -{ - return pfn % P2M_PER_PAGE; -} - static void p2m_top_mfn_init(unsigned long *top) { unsigned i; diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index 21ecbe754cb2..246d67dab510 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -131,6 +131,10 @@ static inline uint32_t get_fam15h_addr(u32 addr) static inline bool is_amd_pmu_msr(unsigned int msr) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) + return false; + if ((msr >= MSR_F15H_PERF_CTL && msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || (msr >= MSR_K7_EVNTSEL0 && @@ -140,10 +144,15 @@ static inline bool is_amd_pmu_msr(unsigned int msr) return false; } -static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index) { u32 msr_index_pmc; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR && + boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN) + return false; + switch (msr_index) { case MSR_CORE_PERF_FIXED_CTR_CTRL: case MSR_IA32_DS_AREA: @@ -290,48 +299,52 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) return false; } +static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read, + bool *emul) +{ + int type, index = 0; + + if (is_amd_pmu_msr(msr)) + *emul = xen_amd_pmu_emulate(msr, val, is_read); + else if (is_intel_pmu_msr(msr, &type, &index)) + *emul = xen_intel_pmu_emulate(msr, val, type, index, is_read); + else + return false; + + return true; +} + bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - if (is_amd_pmu_msr(msr)) { - if (!xen_amd_pmu_emulate(msr, val, 1)) - *val = native_read_msr_safe(msr, err); - return true; - } - } else { - int type, index; + bool emulated; - if (is_intel_pmu_msr(msr, &type, &index)) { - if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) - *val = native_read_msr_safe(msr, err); - return true; - } + if (!pmu_msr_chk_emulated(msr, val, true, &emulated)) + return false; + + if (!emulated) { + *val = err ? native_read_msr_safe(msr, err) + : native_read_msr(msr); } - return false; + return true; } bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) { uint64_t val = ((uint64_t)high << 32) | low; + bool emulated; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { - if (is_amd_pmu_msr(msr)) { - if (!xen_amd_pmu_emulate(msr, &val, 0)) - *err = native_write_msr_safe(msr, low, high); - return true; - } - } else { - int type, index; + if (!pmu_msr_chk_emulated(msr, &val, false, &emulated)) + return false; - if (is_intel_pmu_msr(msr, &type, &index)) { - if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) - *err = native_write_msr_safe(msr, low, high); - return true; - } + if (!emulated) { + if (err) + *err = native_write_msr_safe(msr, low, high); + else + native_write_msr(msr, low, high); } - return false; + return true; } static unsigned long long xen_amd_read_pmc(int counter) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index cfa99e8f054b..380591028cb8 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -6,7 +6,9 @@ */ #include <linux/init.h> +#include <linux/iscsi_ibft.h> #include <linux/sched.h> +#include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/pm.h> #include <linux/memblock.h> @@ -36,12 +38,12 @@ #define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) -/* Amount of extra memory space we add to the e820 ranges */ -struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; - /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* Memory map would allow PCI passthrough. */ +bool xen_pv_pci_possible; + /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; @@ -59,18 +61,6 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; -/* - * The maximum amount of extra memory compared to the base size. The - * main scaling factor is the size of struct page. At extreme ratios - * of base:extra, all the base memory can be filled with page - * structures for the extra memory, leaving no space for anything - * else. - * - * 10x seems like a reasonable balance between scaling flexibility and - * leaving a practically usable system. - */ -#define EXTRA_MEM_RATIO (10) - static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); static void __init xen_parse_512gb(void) @@ -85,41 +75,12 @@ static void __init xen_parse_512gb(void) arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); if (!arg) val = true; - else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val)) return; xen_512gb_limit = val; } -static void __init xen_add_extra_mem(unsigned long start_pfn, - unsigned long n_pfns) -{ - int i; - - /* - * No need to check for zero size, should happen rarely and will only - * write a new entry regarded to be unused due to zero size. - */ - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - /* Add new region. */ - if (xen_extra_mem[i].n_pfns == 0) { - xen_extra_mem[i].start_pfn = start_pfn; - xen_extra_mem[i].n_pfns = n_pfns; - break; - } - /* Append to existing region. */ - if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == - start_pfn) { - xen_extra_mem[i].n_pfns += n_pfns; - break; - } - } - if (i == XEN_EXTRA_MEM_MAX_REGIONS) - printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - - memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); -} - static void __init xen_del_extra_mem(unsigned long start_pfn, unsigned long n_pfns) { @@ -338,7 +299,7 @@ static void __init xen_do_set_identity_and_remap_chunk( WARN_ON(size == 0); - mfn_save = virt_to_mfn(buf); + mfn_save = virt_to_mfn((void *)buf); for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn; ident_pfn_iter < ident_end_pfn; @@ -501,7 +462,7 @@ void __init xen_remap_memory(void) unsigned long pfn_s = ~0UL; unsigned long len = 0; - mfn_save = virt_to_mfn(buf); + mfn_save = virt_to_mfn((void *)buf); while (xen_remap_mfn != INVALID_P2M_ENTRY) { /* Map the remap information */ @@ -763,17 +724,26 @@ char * __init xen_memory_setup(void) BUG_ON(memmap.nr_entries == 0); xen_e820_table.nr_entries = memmap.nr_entries; - /* - * Xen won't allow a 1:1 mapping to be created to UNUSABLE - * regions, so if we're using the machine memory map leave the - * region as RAM as it is in the pseudo-physical map. - * - * UNUSABLE regions in domUs are not handled and will need - * a patch in the future. - */ - if (xen_initial_domain()) + if (xen_initial_domain()) { + /* + * Xen won't allow a 1:1 mapping to be created to UNUSABLE + * regions, so if we're using the machine memory map leave the + * region as RAM as it is in the pseudo-physical map. + * + * UNUSABLE regions in domUs are not handled and will need + * a patch in the future. + */ xen_ignore_unusable(); +#ifdef CONFIG_ISCSI_IBFT_FIND + /* Reserve 0.5 MiB to 1 MiB region so iBFT can be found */ + xen_e820_table.entries[xen_e820_table.nr_entries].addr = IBFT_START; + xen_e820_table.entries[xen_e820_table.nr_entries].size = IBFT_END - IBFT_START; + xen_e820_table.entries[xen_e820_table.nr_entries].type = E820_TYPE_RESERVED; + xen_e820_table.nr_entries++; +#endif + } + /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); @@ -803,6 +773,9 @@ char * __init xen_memory_setup(void) chunk_size = size; type = xen_e820_table.entries[i].type; + if (type == E820_TYPE_RESERVED) + xen_pv_pci_possible = true; + if (type == E820_TYPE_RAM) { if (addr < mem_end) { chunk_size = min(size, mem_end - addr); @@ -910,17 +883,9 @@ static int register_callback(unsigned type, const void *func) void xen_enable_sysenter(void) { - int ret; - unsigned sysenter_feature; - - sysenter_feature = X86_FEATURE_SYSENTER32; - - if (!boot_cpu_has(sysenter_feature)) - return; - - ret = register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat); - if(ret != 0) - setup_clear_cpu_cap(sysenter_feature); + if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) && + register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat)) + setup_clear_cpu_cap(X86_FEATURE_SYSENTER32); } void xen_enable_syscall(void) @@ -934,22 +899,15 @@ void xen_enable_syscall(void) mechanism for syscalls. */ } - if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { - ret = register_callback(CALLBACKTYPE_syscall32, - xen_entry_SYSCALL_compat); - if (ret != 0) - setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); - } + if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) && + register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat)) + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); } static void __init xen_pvmmu_arch_setup(void) { - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - HYPERVISOR_vm_assist(VMASST_CMD_enable, - VMASST_TYPE_pae_extended_cr3); - if (register_callback(CALLBACKTYPE_event, xen_asm_exc_xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c3e1f9a7d43a..935771726f9c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -32,30 +32,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) void xen_smp_intr_free(unsigned int cpu) { + kfree(per_cpu(xen_resched_irq, cpu).name); + per_cpu(xen_resched_irq, cpu).name = NULL; if (per_cpu(xen_resched_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL); per_cpu(xen_resched_irq, cpu).irq = -1; - kfree(per_cpu(xen_resched_irq, cpu).name); - per_cpu(xen_resched_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfunc_irq, cpu).name); + per_cpu(xen_callfunc_irq, cpu).name = NULL; if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL); per_cpu(xen_callfunc_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfunc_irq, cpu).name); - per_cpu(xen_callfunc_irq, cpu).name = NULL; } + kfree(per_cpu(xen_debug_irq, cpu).name); + per_cpu(xen_debug_irq, cpu).name = NULL; if (per_cpu(xen_debug_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL); per_cpu(xen_debug_irq, cpu).irq = -1; - kfree(per_cpu(xen_debug_irq, cpu).name); - per_cpu(xen_debug_irq, cpu).name = NULL; } + kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); + per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq, NULL); per_cpu(xen_callfuncsingle_irq, cpu).irq = -1; - kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); - per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; } } @@ -65,6 +65,9 @@ int xen_smp_intr_init(unsigned int cpu) char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); + if (!resched_name) + goto fail_mem; + per_cpu(xen_resched_irq, cpu).name = resched_name; rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, xen_reschedule_interrupt, @@ -74,9 +77,11 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_resched_irq, cpu).irq = rc; - per_cpu(xen_resched_irq, cpu).name = resched_name; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); + if (!callfunc_name) + goto fail_mem; + per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, cpu, xen_call_function_interrupt, @@ -86,10 +91,13 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfunc_irq, cpu).irq = rc; - per_cpu(xen_callfunc_irq, cpu).name = callfunc_name; if (!xen_fifo_events) { debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + if (!debug_name) + goto fail_mem; + + per_cpu(xen_debug_irq, cpu).name = debug_name; rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, IRQF_PERCPU | IRQF_NOBALANCING, @@ -97,10 +105,13 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_debug_irq, cpu).irq = rc; - per_cpu(xen_debug_irq, cpu).name = debug_name; } callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + if (!callfunc_name) + goto fail_mem; + + per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, cpu, xen_call_function_single_interrupt, @@ -110,10 +121,11 @@ int xen_smp_intr_init(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; - per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; return 0; + fail_mem: + rc = -ENOMEM; fail: xen_smp_intr_free(cpu); return rc; @@ -123,8 +135,6 @@ void __init xen_smp_cpus_done(unsigned int max_cpus) { if (xen_hvm_domain()) native_smp_cpus_done(max_cpus); - else - calculate_max_logical_packages(); } void xen_smp_send_reschedule(int cpu) diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index bd02f9d50107..b8efdbc693f7 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -2,6 +2,10 @@ #ifndef _XEN_SMP_H #ifdef CONFIG_SMP + +void asm_cpu_bringup_and_idle(void); +asmlinkage void cpu_bringup_and_idle(void); + extern void xen_send_IPI_mask(const struct cpumask *mask, int vector); extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask, @@ -15,12 +19,15 @@ extern void xen_smp_intr_free(unsigned int cpu); int xen_smp_intr_init_pv(unsigned int cpu); void xen_smp_intr_free_pv(unsigned int cpu); +void xen_smp_count_cpus(void); void xen_smp_cpus_done(unsigned int max_cpus); void xen_smp_send_reschedule(int cpu); void xen_smp_send_call_function_ipi(const struct cpumask *mask); void xen_smp_send_call_function_single_ipi(int cpu); +void __noreturn xen_cpu_bringup_again(unsigned long stack); + struct xen_common_irq { int irq; char *name; @@ -38,6 +45,7 @@ static inline int xen_smp_intr_init_pv(unsigned int cpu) return 0; } static inline void xen_smp_intr_free_pv(unsigned int cpu) {} +static inline void xen_smp_count_cpus(void) { } #endif /* CONFIG_SMP */ #endif diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index b70afdff419c..ac95d1981cc0 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -55,18 +55,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) } #ifdef CONFIG_HOTPLUG_CPU -static void xen_hvm_cpu_die(unsigned int cpu) +static void xen_hvm_cleanup_dead_cpu(unsigned int cpu) { - if (common_cpu_die(cpu) == 0) { - if (xen_have_vector_callback) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); - } + if (xen_have_vector_callback) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); } } #else -static void xen_hvm_cpu_die(unsigned int cpu) +static void xen_hvm_cleanup_dead_cpu(unsigned int cpu) { BUG(); } @@ -77,7 +75,7 @@ void __init xen_hvm_smp_init(void) smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; smp_ops.smp_cpus_done = xen_smp_cpus_done; - smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.cleanup_dead_cpu = xen_hvm_cleanup_dead_cpu; if (!xen_have_vector_callback) { #ifdef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index ba7af2eca755..27d1a5b7f571 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -29,6 +29,7 @@ #include <asm/idtentry.h> #include <asm/desc.h> #include <asm/cpu.h> +#include <asm/apic.h> #include <asm/io_apic.h> #include <xen/interface/xen.h> @@ -55,14 +56,15 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); -void asm_cpu_bringup_and_idle(void); static void cpu_bringup(void) { int cpu; cr4_init(); + cpuhp_ap_sync_alive(); cpu_init(); + fpu__init_cpu(); touch_softlockup_watchdog(); /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ @@ -72,7 +74,6 @@ static void cpu_bringup(void) } cpu = smp_processor_id(); smp_store_cpu_info(cpu); - cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); speculative_store_bypass_ht_init(); @@ -83,7 +84,7 @@ static void cpu_bringup(void) set_cpu_online(cpu, true); - cpu_set_state_online(cpu); /* Implies full memory barrier. */ + smp_mb(); /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -97,18 +98,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void) void xen_smp_intr_free_pv(unsigned int cpu) { + kfree(per_cpu(xen_irq_work, cpu).name); + per_cpu(xen_irq_work, cpu).name = NULL; if (per_cpu(xen_irq_work, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); per_cpu(xen_irq_work, cpu).irq = -1; - kfree(per_cpu(xen_irq_work, cpu).name); - per_cpu(xen_irq_work, cpu).name = NULL; } + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); per_cpu(xen_pmu_irq, cpu).irq = -1; - kfree(per_cpu(xen_pmu_irq, cpu).name); - per_cpu(xen_pmu_irq, cpu).name = NULL; } } @@ -118,6 +119,7 @@ int xen_smp_intr_init_pv(unsigned int cpu) char *callfunc_name, *pmu_name; callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); + per_cpu(xen_irq_work, cpu).name = callfunc_name; rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, cpu, xen_irq_work_interrupt, @@ -127,10 +129,10 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_irq_work, cpu).irq = rc; - per_cpu(xen_irq_work, cpu).name = callfunc_name; if (is_xen_pmu) { pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + per_cpu(xen_pmu_irq, cpu).name = pmu_name; rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, xen_pmu_irq_handler, IRQF_PERCPU|IRQF_NOBALANCING, @@ -138,7 +140,6 @@ int xen_smp_intr_init_pv(unsigned int cpu) if (rc < 0) goto fail; per_cpu(xen_pmu_irq, cpu).irq = rc; - per_cpu(xen_pmu_irq, cpu).name = pmu_name; } return 0; @@ -148,40 +149,18 @@ int xen_smp_intr_init_pv(unsigned int cpu) return rc; } -static void __init _get_smp_config(unsigned int early) +static void __init xen_pv_smp_config(void) { - int i, rc; - unsigned int subtract = 0; - - if (early) - return; - - num_processors = 0; - disabled_cpus = 0; - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } else { - set_cpu_possible(i, false); - set_cpu_present(i, false); - subtract++; - } - } -#ifdef CONFIG_HOTPLUG_CPU - /* This is akin to using 'nr_cpus' on the Linux command line. - * Which is OK as when we use 'dom0_max_vcpus=X' we can only - * have up to X, while nr_cpu_ids is greater than X. This - * normally is not a problem, except when CPU hotplugging - * is involved and then there might be more than X CPUs - * in the guest - which will not work as there is no - * hypercall to expand the max number of VCPUs an already - * running guest has. So cap it up to X. */ - if (subtract) - nr_cpu_ids = nr_cpu_ids - subtract; -#endif + u32 apicid = 0; + int i; + + topology_register_boot_apic(apicid++); + for (i = 1; i < nr_cpu_ids; i++) + topology_register_apic(apicid++, CPU_ACPIID_INVALID, true); + + /* Pretend to be a proper enumerated system */ + smp_found_config = 1; } static void __init xen_pv_smp_prepare_boot_cpu(void) @@ -209,7 +188,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - if (skip_ioapic_setup) { + if (ioapic_is_disabled) { char *m = (max_cpus == 0) ? "The nosmp parameter is incompatible with Xen; " \ "use Xen dom0_max_vcpus=1 parameter" : @@ -222,8 +201,6 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) smp_prepare_cpus_common(); - cpu_data(0).x86_max_cores = 1; - speculative_store_bypass_ht_init(); xen_pmu_init(0); @@ -254,15 +231,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) struct desc_struct *gdt; unsigned long gdt_mfn; - /* used to tell cpu_init() that it can proceed with initialization */ - cpumask_set_cpu(cpu, cpu_callout_mask); if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); if (ctxt == NULL) { cpumask_clear_cpu(cpu, xen_cpu_initialized_map); - cpumask_clear_cpu(cpu, cpu_callout_mask); return -ENOMEM; } @@ -316,7 +290,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) +static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle) { int rc; @@ -326,14 +300,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) xen_setup_runstate_info(cpu); - /* - * PV VCPUs are always successfully taken down (see 'while' loop - * in xen_cpu_die()), so -EBUSY is an error. - */ - rc = cpu_check_up_prepare(cpu); - if (rc) - return rc; - /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -343,15 +309,20 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) xen_pmu_init(cpu); - rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); - BUG_ON(rc); - - while (cpu_report_state(cpu) != CPU_ONLINE) - HYPERVISOR_sched_op(SCHEDOP_yield, NULL); + /* + * Why is this a BUG? If the hypercall fails then everything can be + * rolled back, no? + */ + BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)); return 0; } +static void xen_pv_poll_sync_state(void) +{ + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); +} + #ifdef CONFIG_HOTPLUG_CPU static int xen_pv_cpu_disable(void) { @@ -367,35 +338,26 @@ static int xen_pv_cpu_disable(void) static void xen_pv_cpu_die(unsigned int cpu) { - while (HYPERVISOR_vcpu_op(VCPUOP_is_up, - xen_vcpu_nr(cpu), NULL)) { + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), NULL)) { __set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(HZ/10); } +} - if (common_cpu_die(cpu) == 0) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); - xen_pmu_finish(cpu); - } +static void xen_pv_cleanup_dead_cpu(unsigned int cpu) +{ + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } -static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ +static void __noreturn xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); - cpu_bringup(); - /* - * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) - * clears certain data that the cpu_idle loop (which called us - * and that we return from) expects. The only way to get that - * data back is to call: - */ - tick_nohz_idle_enter(); - tick_nohz_idle_stop_tick_protected(); - - cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE); + xen_cpu_bringup_again((unsigned long)task_pt_regs(current)); + BUG(); } #else /* !CONFIG_HOTPLUG_CPU */ @@ -409,7 +371,12 @@ static void xen_pv_cpu_die(unsigned int cpu) BUG(); } -static void xen_pv_play_dead(void) +static void xen_pv_cleanup_dead_cpu(unsigned int cpu) +{ + BUG(); +} + +static void __noreturn xen_pv_play_dead(void) { BUG(); } @@ -442,13 +409,29 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +void __init xen_smp_count_cpus(void) +{ + unsigned int cpus; + + for (cpus = 0; cpus < nr_cpu_ids; cpus++) { + if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpus, NULL) < 0) + break; + } + + pr_info("Xen PV: Detected %u vCPUS\n", cpus); + if (cpus < nr_cpu_ids) + set_nr_cpu_ids(cpus); +} + static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_pv_smp_prepare_cpus, .smp_cpus_done = xen_smp_cpus_done, - .cpu_up = xen_pv_cpu_up, + .kick_ap_alive = xen_pv_kick_ap, .cpu_die = xen_pv_cpu_die, + .cleanup_dead_cpu = xen_pv_cleanup_dead_cpu, + .poll_sync_state = xen_pv_poll_sync_state, .cpu_disable = xen_pv_cpu_disable, .play_dead = xen_pv_play_dead, @@ -464,6 +447,12 @@ void __init xen_smp_init(void) smp_ops = xen_smp_ops; /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = _get_smp_config; + x86_init.mpparse.find_mptable = x86_init_noop; + x86_init.mpparse.early_parse_smp_cfg = x86_init_noop; + + /* XEN/PV Dom0 has halfways sane topology information via CPUID/MADT */ + if (xen_initial_domain()) + x86_init.mpparse.parse_smp_cfg = x86_init_noop; + else + x86_init.mpparse.parse_smp_cfg = xen_pv_smp_config; } diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 043c73dfd2c9..5c6fc16e4b92 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -75,6 +75,7 @@ void xen_init_lock_cpu(int cpu) cpu, per_cpu(lock_kicker_irq, cpu)); name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + per_cpu(irq_name, cpu) = name; irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, cpu, dummy_handler, @@ -85,7 +86,6 @@ void xen_init_lock_cpu(int cpu) if (irq >= 0) { disable_irq(irq); /* make sure it's never delivered */ per_cpu(lock_kicker_irq, cpu) = irq; - per_cpu(irq_name, cpu) = name; } printk("cpu %d spinlock event irq %d\n", cpu, irq); @@ -98,6 +98,8 @@ void xen_uninit_lock_cpu(int cpu) if (!xen_pvspin) return; + kfree(per_cpu(irq_name, cpu)); + per_cpu(irq_name, cpu) = NULL; /* * When booting the kernel with 'mitigations=auto,nosmt', the secondary * CPUs are not activated, and lock_kicker_irq is not initialized. @@ -108,8 +110,6 @@ void xen_uninit_lock_cpu(int cpu) unbind_from_irqhandler(irq, NULL); per_cpu(lock_kicker_irq, cpu) = -1; - kfree(per_cpu(irq_name, cpu)); - per_cpu(irq_name, cpu) = NULL; } PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c index 9d548b0c772f..0c4f7554b7cc 100644 --- a/arch/x86/xen/suspend_hvm.c +++ b/arch/x86/xen/suspend_hvm.c @@ -5,6 +5,7 @@ #include <xen/hvm.h> #include <xen/features.h> #include <xen/interface/features.h> +#include <xen/events.h> #include "xen-ops.h" @@ -14,6 +15,13 @@ void xen_hvm_post_suspend(int suspend_cancelled) xen_hvm_init_shared_info(); xen_vcpu_restore(); } - xen_setup_callback_vector(); + if (xen_percpu_upcall) { + unsigned int cpu; + + for_each_online_cpu(cpu) + BUG_ON(xen_set_upcall_vector(cpu)); + } else { + xen_setup_callback_vector(); + } xen_unplug_emulated_devices(); } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 9ef0a5cca96e..52fa5609b7f6 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -20,6 +20,7 @@ #include <asm/pvclock.h> #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <asm/xen/cpuid.h> #include <xen/events.h> #include <xen/features.h> @@ -60,9 +61,16 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs) return xen_clocksource_read(); } -static u64 xen_sched_clock(void) +static noinstr u64 xen_sched_clock(void) { - return xen_clocksource_read() - xen_sched_clock_offset; + struct pvclock_vcpu_time_info *src; + u64 ret; + + src = &__this_cpu_read(xen_vcpu)->time; + ret = pvclock_clocksource_read_nowd(src); + ret -= xen_sched_clock_offset; + + return ret; } static void xen_read_wallclock(struct timespec64 *ts) @@ -474,15 +482,47 @@ static void xen_setup_vsyscall_time_info(void) xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK; } +/* + * Check if it is possible to safely use the tsc as a clocksource. This is + * only true if the hypervisor notifies the guest that its tsc is invariant, + * the tsc is stable, and the tsc instruction will never be emulated. + */ +static int __init xen_tsc_safe_clocksource(void) +{ + u32 eax, ebx, ecx, edx; + + if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC))) + return 0; + + if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC))) + return 0; + + if (check_tsc_unstable()) + return 0; + + /* Leaf 4, sub-leaf 0 (0x40000x03) */ + cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx); + + return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE; +} + static void __init xen_time_init(void) { struct pvclock_vcpu_time_info *pvti; int cpu = smp_processor_id(); struct timespec64 tp; - /* As Dom0 is never moved, no penalty on using TSC there */ + /* + * As Dom0 is never moved, no penalty on using TSC there. + * + * If it is possible for the guest to determine that the tsc is a safe + * clocksource, then set xen_clocksource rating below that of the tsc + * so that the system prefers tsc instead. + */ if (xen_initial_domain()) xen_clocksource.rating = 275; + else if (xen_tsc_safe_clocksource()) + xen_clocksource.rating = 299; clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC); diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c index 14ea32e734d5..f7547807b0bd 100644 --- a/arch/x86/xen/vga.c +++ b/arch/x86/xen/vga.c @@ -2,17 +2,15 @@ #include <linux/screen_info.h> #include <linux/init.h> -#include <asm/bootparam.h> #include <asm/setup.h> #include <xen/interface/xen.h> #include "xen-ops.h" -void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size) +void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size, + struct screen_info *screen_info) { - struct screen_info *screen_info = &boot_params.screen_info; - /* This is drawn from a dump from vgacon:startup in * standard Linux. */ screen_info->orig_video_mode = 3; diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 6b4fdf6b9542..83189cf5cdce 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -28,7 +28,7 @@ * non-zero. */ SYM_FUNC_START(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) RET SYM_FUNC_END(xen_irq_disable_direct) @@ -69,7 +69,7 @@ SYM_FUNC_END(check_events) SYM_FUNC_START(xen_irq_enable_direct) FRAME_BEGIN /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) /* * Preempt here doesn't matter because that will deal with any @@ -78,7 +78,7 @@ SYM_FUNC_START(xen_irq_enable_direct) */ /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending + testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_pending) jz 1f call check_events @@ -97,7 +97,7 @@ SYM_FUNC_END(xen_irq_enable_direct) * x86 use opposite senses (mask vs enable). */ SYM_FUNC_START(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask + testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask) setz %ah addb %ah, %ah RET @@ -113,7 +113,7 @@ SYM_FUNC_END(xen_read_cr2); SYM_FUNC_START(xen_read_cr2_direct) FRAME_BEGIN - _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX + _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX FRAME_END RET SYM_FUNC_END(xen_read_cr2_direct); @@ -148,7 +148,7 @@ xen_pv_trap asm_exc_page_fault xen_pv_trap asm_exc_spurious_interrupt_bug xen_pv_trap asm_exc_coprocessor_error xen_pv_trap asm_exc_alignment_check -#ifdef CONFIG_X86_KERNEL_IBT +#ifdef CONFIG_X86_CET xen_pv_trap asm_exc_control_protection #endif #ifdef CONFIG_X86_MCE @@ -156,7 +156,7 @@ xen_pv_trap asm_xenpv_exc_machine_check #endif /* CONFIG_X86_MCE */ xen_pv_trap asm_exc_simd_coprocessor_error #ifdef CONFIG_IA32_EMULATION -xen_pv_trap entry_INT80_compat +xen_pv_trap asm_int80_emulation #endif xen_pv_trap asm_exc_xen_unknown_trap xen_pv_trap asm_exc_xen_hypervisor_callback @@ -165,7 +165,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback SYM_CODE_START(xen_early_idt_handler_array) i = 0 .rept NUM_EXCEPTION_VECTORS - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ENDBR pop %rcx pop %r11 @@ -193,7 +193,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 * rsp->rax } */ SYM_CODE_START(xen_iret) - UNWIND_HINT_EMPTY + UNWIND_HINT_UNDEFINED ANNOTATE_NOENDBR pushq $0 jmp hypercall_iret @@ -262,10 +262,10 @@ SYM_CODE_START(xen_entry_SYSCALL_compat) /* * Neither Xen nor the kernel really knows what the old SS and - * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * CS were. The kernel expects __USER_DS and __USER32_CS, so * report those values even though Xen will guess its own values. */ - movq $__USER32_DS, 4*8(%rsp) + movq $__USER_DS, 4*8(%rsp) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSCALL_compat_after_hwframe @@ -284,10 +284,10 @@ SYM_CODE_START(xen_entry_SYSENTER_compat) /* * Neither Xen nor the kernel really knows what the old SS and - * CS were. The kernel expects __USER32_DS and __USER32_CS, so + * CS were. The kernel expects __USER_DS and __USER32_CS, so * report those values even though Xen will guess its own values. */ - movq $__USER32_DS, 4*8(%rsp) + movq $__USER_DS, 4*8(%rsp) movq $__USER32_CS, 1*8(%rsp) jmp entry_SYSENTER_compat_after_hwframe diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index ffaa62167f6e..04101b984f24 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -45,11 +45,11 @@ SYM_CODE_END(hypercall_page) #ifdef CONFIG_XEN_PV __INIT SYM_CODE_START(startup_xen) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ANNOTATE_NOENDBR cld - mov initial_stack(%rip), %rsp + leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp /* Set up %gs. * @@ -71,11 +71,18 @@ SYM_CODE_END(startup_xen) #ifdef CONFIG_XEN_PV_SMP .pushsection .text SYM_CODE_START(asm_cpu_bringup_and_idle) - UNWIND_HINT_EMPTY + UNWIND_HINT_END_OF_STACK ENDBR call cpu_bringup_and_idle SYM_CODE_END(asm_cpu_bringup_and_idle) + +SYM_CODE_START(xen_cpu_bringup_again) + UNWIND_HINT_FUNC + mov %rdi, %rsp + UNWIND_HINT_REGS + call cpu_bringup_and_idle +SYM_CODE_END(xen_cpu_bringup_again) .popsection #endif #endif @@ -83,30 +90,35 @@ SYM_CODE_END(asm_cpu_bringup_and_idle) ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") -#ifdef CONFIG_X86_32 - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) -#else +#ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) /* Map the p2m table to a 512GB-aligned user address. */ ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) -#endif -#ifdef CONFIG_XEN_PV ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) -#endif - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, - .ascii "!writable_page_tables|pae_pgdir_above_4gb") - ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, - .long (1 << XENFEAT_writable_page_tables) | \ - (1 << XENFEAT_dom0) | \ - (1 << XENFEAT_linux_rsdp_unrestricted)) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") - ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) - ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) +# define FEATURES_PV (1 << XENFEAT_writable_page_tables) +#else +# define FEATURES_PV 0 +#endif +#ifdef CONFIG_XEN_PVH +# define FEATURES_PVH (1 << XENFEAT_linux_rsdp_unrestricted) +#else +# define FEATURES_PVH 0 +#endif +#ifdef CONFIG_XEN_DOM0 +# define FEATURES_DOM0 (1 << XENFEAT_dom0) +#else +# define FEATURES_DOM0 0 +#endif + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, + .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0) + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") + ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) #endif /*CONFIG_XEN */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a8bb972193d..79cf93f2c92f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -21,7 +21,7 @@ extern void *xen_initial_gdt; struct trap_info; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); @@ -72,8 +72,6 @@ void xen_restore_time_memory_area(void); void xen_init_time_ops(void); void xen_hvm_init_time_ops(void); -irqreturn_t xen_debug_interrupt(int irq, void *dev_id); - bool xen_vcpu_stolen(int vcpu); void xen_vcpu_setup(int cpu); @@ -108,11 +106,12 @@ static inline void xen_uninit_lock_cpu(int cpu) struct dom0_vga_console_info; -#ifdef CONFIG_XEN_PV_DOM0 -void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size); +#ifdef CONFIG_XEN_DOM0 +void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size, + struct screen_info *); #else static inline void __init xen_init_vga(const struct dom0_vga_console_info *info, - size_t size) + size_t size, struct screen_info *si) { } #endif @@ -147,9 +146,12 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), void xen_pin_vcpu(int cpu); void xen_emergency_restart(void); +void xen_force_evtchn_callback(void); + #ifdef CONFIG_XEN_PV void xen_pv_pre_suspend(void); void xen_pv_post_suspend(int suspend_cancelled); +void xen_start_kernel(struct start_info *si); #else static inline void xen_pv_pre_suspend(void) {} static inline void xen_pv_post_suspend(int suspend_cancelled) {} @@ -161,4 +163,18 @@ void xen_hvm_post_suspend(int suspend_cancelled); static inline void xen_hvm_post_suspend(int suspend_cancelled) {} #endif +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + +void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns); + #endif /* XEN_OPS_H */ |