aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/x86/xen
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig11
-rw-r--r--arch/x86/xen/Makefile2
-rw-r--r--arch/x86/xen/apic.c116
-rw-r--r--arch/x86/xen/efi.c4
-rw-r--r--arch/x86/xen/enlighten.c42
-rw-r--r--arch/x86/xen/enlighten_hvm.c47
-rw-r--r--arch/x86/xen/enlighten_pv.c235
-rw-r--r--arch/x86/xen/enlighten_pvh.c82
-rw-r--r--arch/x86/xen/irq.c4
-rw-r--r--arch/x86/xen/mmu_pv.c121
-rw-r--r--arch/x86/xen/multicalls.h4
-rw-r--r--arch/x86/xen/p2m.c5
-rw-r--r--arch/x86/xen/pmu.c71
-rw-r--r--arch/x86/xen/setup.c112
-rw-r--r--arch/x86/xen/smp.c38
-rw-r--r--arch/x86/xen/smp.h8
-rw-r--r--arch/x86/xen/smp_hvm.c16
-rw-r--r--arch/x86/xen/smp_pv.c159
-rw-r--r--arch/x86/xen/spinlock.c6
-rw-r--r--arch/x86/xen/suspend_hvm.c10
-rw-r--r--arch/x86/xen/time.c46
-rw-r--r--arch/x86/xen/vga.c6
-rw-r--r--arch/x86/xen/xen-asm.S26
-rw-r--r--arch/x86/xen/xen-head.S50
-rw-r--r--arch/x86/xen/xen-ops.h28
25 files changed, 766 insertions, 483 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 85246dd9faa1..77e788e928cd 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -9,6 +9,7 @@ config XEN
select PARAVIRT_CLOCK
select X86_HV_CALLBACK_VECTOR
depends on X86_64 || (X86_32 && X86_PAE)
+ depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MCORE2 || MATOM || MK8)
depends on X86_LOCAL_APIC && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
@@ -80,7 +81,6 @@ config XEN_PVH
bool "Xen PVH guest support"
depends on XEN && XEN_PVHVM && ACPI
select PVH
- def_bool n
help
Support for running as a Xen PVH guest.
@@ -92,3 +92,12 @@ config XEN_DOM0
select X86_X2APIC if XEN_PVH && X86_64
help
Support running as a Xen Dom0 guest.
+
+config XEN_PV_MSR_SAFE
+ bool "Always use safe MSR accesses in PV guests"
+ default y
+ depends on XEN_PV
+ help
+ Use safe (not faulting) MSR access functions even if the MSR access
+ should not fault anyway.
+ The default can be changed by using the "xen_msr_safe" boot parameter.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3c5b52fbe4a7..a9ec8c9f5c5d 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -45,6 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-obj-$(CONFIG_XEN_PV_DOM0) += vga.o
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 62d34b6611c5..8b045dd25196 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -33,13 +33,7 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
return 0xfd;
}
-static u32 xen_set_apic_id(unsigned int x)
-{
- WARN_ON(1);
- return x;
-}
-
-static unsigned int xen_get_apic_id(unsigned long x)
+static u32 xen_get_apic_id(u32 x)
{
return ((x)>>24) & 0xFFu;
}
@@ -49,20 +43,20 @@ static u32 xen_apic_read(u32 reg)
struct xen_platform_op op = {
.cmd = XENPF_get_cpuinfo,
.interface_version = XENPF_INTERFACE_VERSION,
- .u.pcpu_info.xen_cpuid = 0,
};
- int ret;
-
- /* Shouldn't need this as APIC is turned off for PV, and we only
- * get called on the bootup processor. But just in case. */
- if (!xen_initial_domain() || smp_processor_id())
- return 0;
+ int ret, cpu;
if (reg == APIC_LVR)
return 0x14;
if (reg != APIC_ID)
return 0;
+ cpu = smp_processor_id();
+ if (!xen_initial_domain())
+ return cpu ? cpuid_to_apicid[cpu] << 24 : 0;
+
+ op.u.pcpu_info.xen_cpuid = cpu;
+
ret = HYPERVISOR_platform_op(&op);
if (ret)
op.u.pcpu_info.apic_id = BAD_APICID;
@@ -81,6 +75,11 @@ static void xen_apic_write(u32 reg, u32 val)
WARN(1,"register: %x, value: %x\n", reg, val);
}
+static void xen_apic_eoi(void)
+{
+ WARN_ON_ONCE(1);
+}
+
static u64 xen_apic_icr_read(void)
{
return 0;
@@ -92,11 +91,6 @@ static void xen_apic_icr_write(u32 low, u32 id)
WARN_ON(1);
}
-static u32 xen_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
static int xen_apic_probe_pv(void)
{
if (xen_pv_domain())
@@ -110,99 +104,47 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
return xen_pv_domain();
}
-static int xen_id_always_valid(u32 apicid)
-{
- return 1;
-}
-
-static int xen_id_always_registered(void)
-{
- return 1;
-}
-
-static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
-{
- return initial_apic_id >> index_msb;
-}
-
-static void xen_noop(void)
-{
-}
-
-static void xen_silent_inquire(int apicid)
-{
-}
-
-static int xen_cpu_present_to_apicid(int cpu)
+static u32 xen_cpu_present_to_apicid(int cpu)
{
if (cpu_present(cpu))
- return cpu_data(cpu).apicid;
+ return cpu_data(cpu).topo.apicid;
else
return BAD_APICID;
}
-static struct apic xen_pv_apic = {
- .name = "Xen PV",
- .probe = xen_apic_probe_pv,
+static struct apic xen_pv_apic __ro_after_init = {
+ .name = "Xen PV",
+ .probe = xen_apic_probe_pv,
.acpi_madt_oem_check = xen_madt_oem_check,
- .apic_id_valid = xen_id_always_valid,
- .apic_id_registered = xen_id_always_registered,
/* .delivery_mode and .dest_mode_logical not used by XENPV */
.disable_esr = 0,
- .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
- .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */
- .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = xen_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */
- .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */
- .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */
- .get_apic_id = xen_get_apic_id,
- .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
+ .max_apic_id = UINT_MAX,
+ .get_apic_id = xen_get_apic_id,
.calc_dest_apicid = apic_flat_calc_apicid,
#ifdef CONFIG_SMP
- .send_IPI_mask = xen_send_IPI_mask,
- .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
- .send_IPI_allbutself = xen_send_IPI_allbutself,
- .send_IPI_all = xen_send_IPI_all,
- .send_IPI_self = xen_send_IPI_self,
+ .send_IPI_mask = xen_send_IPI_mask,
+ .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = xen_send_IPI_allbutself,
+ .send_IPI_all = xen_send_IPI_all,
+ .send_IPI_self = xen_send_IPI_self,
#endif
- /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */
- .inquire_remote_apic = xen_silent_inquire,
-
.read = xen_apic_read,
.write = xen_apic_write,
- .eoi_write = xen_apic_write,
+ .eoi = xen_apic_eoi,
- .icr_read = xen_apic_icr_read,
- .icr_write = xen_apic_icr_write,
- .wait_icr_idle = xen_noop,
- .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
+ .icr_read = xen_apic_icr_read,
+ .icr_write = xen_apic_icr_write,
};
+apic_driver(xen_pv_apic);
-static void __init xen_apic_check(void)
-{
- if (apic == &xen_pv_apic)
- return;
-
- pr_info("Switched APIC routing from %s to %s.\n", apic->name,
- xen_pv_apic.name);
- apic = &xen_pv_apic;
-}
void __init xen_init_apic(void)
{
x86_apic_ops.io_apic_read = xen_io_apic_read;
- /* On PV guests the APIC CPUID bit is disabled so none of the
- * routines end up executing. */
- if (!xen_initial_domain())
- apic = &xen_pv_apic;
-
- x86_platform.apic_post_init = xen_apic_check;
}
-apic_driver(xen_pv_apic);
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 7d7ffb9c826a..7250d0e0e1a9 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -16,6 +16,8 @@
#include <asm/setup.h>
#include <asm/xen/hypercall.h>
+#include "xen-ops.h"
+
static efi_char16_t vendor[100] __initdata;
static efi_system_table_t efi_systab_xen __initdata = {
@@ -136,7 +138,7 @@ void __init xen_efi_init(struct boot_params *boot_params)
if (efi_systab_xen == NULL)
return;
- strncpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen",
+ strscpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen",
sizeof(boot_params->efi_info.efi_loader_signature));
boot_params->efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
boot_params->efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 30c6e986a6cd..a01ca255b0c6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -6,6 +6,7 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/kexec.h>
+#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/panic_notifier.h>
@@ -32,10 +33,13 @@ EXPORT_SYMBOL_GPL(hypercall_page);
* &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
* and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
* but during boot it is switched to point to xen_vcpu_info.
- * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events.
+ * The pointer is used in xen_evtchn_do_upcall to acknowledge pending events.
+ * Make sure that xen_vcpu_info doesn't cross a page boundary by making it
+ * cache-line aligned (the struct is guaranteed to have a size of 64 bytes,
+ * which matches the cache line size of 64-bit x86 processors).
*/
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
/* Linux <-> Xen vCPU id mapping */
DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
@@ -51,7 +55,7 @@ EXPORT_SYMBOL_GPL(xen_start_info);
struct shared_info xen_dummy_shared_info;
-__read_mostly int xen_have_vector_callback;
+__read_mostly bool xen_have_vector_callback = true;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);
/*
@@ -160,6 +164,7 @@ void xen_vcpu_setup(int cpu)
int err;
struct vcpu_info *vcpup;
+ BUILD_BUG_ON(sizeof(*vcpup) > SMP_CACHE_BYTES);
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
/*
@@ -346,3 +351,34 @@ void xen_arch_unregister_cpu(int num)
}
EXPORT_SYMBOL(xen_arch_unregister_cpu);
#endif
+
+/* Amount of extra memory space we add to the e820 ranges */
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
+{
+ unsigned int i;
+
+ /*
+ * No need to check for zero size, should happen rarely and will only
+ * write a new entry regarded to be unused due to zero size.
+ */
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ /* Add new region. */
+ if (xen_extra_mem[i].n_pfns == 0) {
+ xen_extra_mem[i].start_pfn = start_pfn;
+ xen_extra_mem[i].n_pfns = n_pfns;
+ break;
+ }
+ /* Append to existing region. */
+ if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+ start_pfn) {
+ xen_extra_mem[i].n_pfns += n_pfns;
+ break;
+ }
+ }
+ if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+ printk(KERN_WARNING "Warning: not enough extra memory regions\n");
+
+ memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+}
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 8b71b1dd7639..c001a2296582 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -4,9 +4,12 @@
#include <linux/cpu.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
+#include <linux/virtio_anchor.h>
#include <xen/features.h>
#include <xen/events.h>
+#include <xen/hvm.h>
+#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/memory.h>
#include <asm/apic.h>
@@ -30,6 +33,9 @@
static unsigned long shared_info_pfn;
+__ro_after_init bool xen_percpu_upcall;
+EXPORT_SYMBOL_GPL(xen_percpu_upcall);
+
void xen_hvm_init_shared_info(void)
{
struct xen_add_to_physmap xatp;
@@ -125,9 +131,12 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
+ if (xen_percpu_upcall)
+ apic_eoi();
+
inc_irq_stat(irq_hv_callback_count);
- xen_hvm_evtchn_do_upcall();
+ xen_evtchn_do_upcall();
set_irq_regs(old_regs);
}
@@ -139,7 +148,9 @@ static void xen_hvm_shutdown(void)
if (kexec_in_progress)
xen_reboot(SHUTDOWN_soft_reset);
}
+#endif
+#ifdef CONFIG_CRASH_DUMP
static void xen_hvm_crash_shutdown(struct pt_regs *regs)
{
native_machine_crash_shutdown(regs);
@@ -152,15 +163,14 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
int rc = 0;
/*
- * This can happen if CPU was offlined earlier and
- * offlining timed out in common_cpu_die().
+ * If a CPU was offlined earlier and offlining timed out then the
+ * lock mechanism is still initialized. Uninit it unconditionally
+ * as it's safe to call even if already uninited. Interrupts and
+ * timer have already been handled in xen_cpu_dead_hvm().
*/
- if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- }
+ xen_uninit_lock_cpu(cpu);
- if (cpu_acpi_id(cpu) != U32_MAX)
+ if (cpu_acpi_id(cpu) != CPU_ACPIID_INVALID)
per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
else
per_cpu(xen_vcpu_id, cpu) = cpu;
@@ -168,6 +178,15 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
if (!xen_have_vector_callback)
return 0;
+ if (xen_percpu_upcall) {
+ rc = xen_set_upcall_vector(cpu);
+ if (rc) {
+ WARN(1, "HVMOP_set_evtchn_upcall_vector"
+ " for CPU %d failed: %d\n", cpu, rc);
+ return rc;
+ }
+ }
+
if (xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_timer(cpu);
@@ -188,14 +207,13 @@ static int xen_cpu_dead_hvm(unsigned int cpu)
return 0;
}
-static bool no_vector_callback __initdata;
-
static void __init xen_hvm_guest_init(void)
{
if (xen_pv_domain())
return;
- xen_set_restricted_virtio_memory_access();
+ if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT))
+ virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
init_hvm_pv_info();
@@ -211,9 +229,6 @@ static void __init xen_hvm_guest_init(void)
xen_panic_handler_init();
- if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector))
- xen_have_vector_callback = 1;
-
xen_hvm_smp_init();
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm));
xen_unplug_emulated_devices();
@@ -223,6 +238,8 @@ static void __init xen_hvm_guest_init(void)
#ifdef CONFIG_KEXEC_CORE
machine_ops.shutdown = xen_hvm_shutdown;
+#endif
+#ifdef CONFIG_CRASH_DUMP
machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
#endif
}
@@ -239,7 +256,7 @@ early_param("xen_nopv", xen_parse_nopv);
static __init int xen_parse_no_vector_callback(char *arg)
{
- no_vector_callback = true;
+ xen_have_vector_callback = false;
return 0;
}
early_param("xen_no_vector_callback", xen_parse_no_vector_callback);
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 70fb2ea85e90..ace2eb054053 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -23,6 +23,7 @@
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
+#include <linux/kstrtox.h>
#include <linux/memblock.h>
#include <linux/export.h>
#include <linux/mm.h>
@@ -31,6 +32,8 @@
#include <linux/gfp.h>
#include <linux/edd.h>
#include <linux/reboot.h>
+#include <linux/virtio_anchor.h>
+#include <linux/stackprotector.h>
#include <xen/xen.h>
#include <xen/events.h>
@@ -63,9 +66,9 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
-#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mach_traps.h>
+#include <asm/mtrr.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
@@ -76,7 +79,7 @@
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
-#include <acpi/pdc_intel.h>
+#include <acpi/proc_cap_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif
@@ -98,6 +101,17 @@ struct tls_descs {
struct desc_struct desc[3];
};
+DEFINE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode) = XEN_LAZY_NONE;
+DEFINE_PER_CPU(unsigned int, xen_lazy_nesting);
+
+enum xen_lazy_mode xen_get_lazy_mode(void)
+{
+ if (in_interrupt())
+ return XEN_LAZY_NONE;
+
+ return this_cpu_read(xen_lazy_mode);
+}
+
/*
* Updating the 3 TLS descriptors in the GDT on every task switch is
* surprisingly expensive so we avoid updating them if they haven't
@@ -107,9 +121,69 @@ struct tls_descs {
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE);
+
+static int __init parse_xen_msr_safe(char *str)
+{
+ if (str)
+ return kstrtobool(str, &xen_msr_safe);
+ return -EINVAL;
+}
+early_param("xen_msr_safe", parse_xen_msr_safe);
+
+/* Get MTRR settings from Xen and put them into mtrr_state. */
+static void __init xen_set_mtrr_data(void)
+{
+#ifdef CONFIG_MTRR
+ struct xen_platform_op op = {
+ .cmd = XENPF_read_memtype,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ };
+ unsigned int reg;
+ unsigned long mask;
+ uint32_t eax, width;
+ static struct mtrr_var_range var[MTRR_MAX_VAR_RANGES] __initdata;
+
+ /* Get physical address width (only 64-bit cpus supported). */
+ width = 36;
+ eax = cpuid_eax(0x80000000);
+ if ((eax >> 16) == 0x8000 && eax >= 0x80000008) {
+ eax = cpuid_eax(0x80000008);
+ width = eax & 0xff;
+ }
+
+ for (reg = 0; reg < MTRR_MAX_VAR_RANGES; reg++) {
+ op.u.read_memtype.reg = reg;
+ if (HYPERVISOR_platform_op(&op))
+ break;
+
+ /*
+ * Only called in dom0, which has all RAM PFNs mapped at
+ * RAM MFNs, and all PCI space etc. is identity mapped.
+ * This means we can treat MFN == PFN regarding MTRR settings.
+ */
+ var[reg].base_lo = op.u.read_memtype.type;
+ var[reg].base_lo |= op.u.read_memtype.mfn << PAGE_SHIFT;
+ var[reg].base_hi = op.u.read_memtype.mfn >> (32 - PAGE_SHIFT);
+ mask = ~((op.u.read_memtype.nr_mfns << PAGE_SHIFT) - 1);
+ mask &= (1UL << width) - 1;
+ if (mask)
+ mask |= MTRR_PHYSMASK_V;
+ var[reg].mask_lo = mask;
+ var[reg].mask_hi = mask >> 32;
+ }
+
+ /* Only overwrite MTRR state if any MTRR could be got from Xen. */
+ if (reg)
+ mtrr_overwrite_state(var, reg, MTRR_TYPE_UNCACHABLE);
+#endif
+}
+
static void __init xen_pv_init_platform(void)
{
- xen_set_restricted_virtio_memory_access();
+ /* PV guests can't operate virtio devices without grants. */
+ if (IS_ENABLED(CONFIG_XEN_VIRTIO))
+ virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
@@ -121,6 +195,14 @@ static void __init xen_pv_init_platform(void)
/* pvclock is in shared info area */
xen_init_time_ops();
+
+ if (xen_initial_domain())
+ xen_set_mtrr_data();
+ else
+ mtrr_overwrite_state(NULL, 0, MTRR_TYPE_WRBACK);
+
+ /* Adjust nr_cpu_ids before "enumeration" happens */
+ xen_smp_count_cpus();
}
static void __init xen_pv_guest_late_init(void)
@@ -220,17 +302,17 @@ static bool __init xen_check_mwait(void)
native_cpuid(&ax, &bx, &cx, &dx);
- /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+ /* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so,
* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
*/
buf[0] = ACPI_PDC_REVISION_ID;
buf[1] = 1;
- buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+ buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP);
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
if ((HYPERVISOR_platform_op(&op) == 0) &&
- (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+ (buf[2] & (ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
}
@@ -262,6 +344,7 @@ static void __init xen_init_capabilities(void)
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
setup_clear_cpu_cap(X86_FEATURE_SME);
+ setup_clear_cpu_cap(X86_FEATURE_LKGS);
/*
* Xen PV would need some work to support PCID: CR3 handling as well
@@ -293,10 +376,25 @@ static noinstr unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
+static void xen_start_context_switch(struct task_struct *prev)
+{
+ BUG_ON(preemptible());
+
+ if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
+ enter_lazy(XEN_LAZY_CPU);
+}
+
static void xen_end_context_switch(struct task_struct *next)
{
+ BUG_ON(preemptible());
+
xen_mc_flush();
- paravirt_end_context_switch(next);
+ leave_lazy(XEN_LAZY_CPU);
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
}
static unsigned long xen_store_tr(void)
@@ -403,7 +501,7 @@ static void xen_set_ldt(const void *addr, unsigned entries)
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_load_gdt(const struct desc_ptr *dtr)
@@ -454,7 +552,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
- pfn = virt_to_pfn(va);
+ pfn = virt_to_pfn((void *)va);
mfn = pfn_to_mfn(pfn);
pte = pfn_pte(pfn, PAGE_KERNEL_RO);
@@ -499,7 +597,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
* exception between the new %fs descriptor being loaded and
* %fs being effectively cleared at __switch_to().
*/
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ if (xen_get_lazy_mode() == XEN_LAZY_CPU)
loadsegment(fs, 0);
xen_mc_batch();
@@ -508,7 +606,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
load_TLS_descriptor(t, cpu, 1);
load_TLS_descriptor(t, cpu, 2);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_load_gs_index(unsigned int idx)
@@ -609,7 +707,7 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_int3, false ),
TRAP_ENTRY(exc_overflow, false ),
#ifdef CONFIG_IA32_EMULATION
- { entry_INT80_compat, xen_entry_INT80_compat, false },
+ TRAP_ENTRY(int80_emulation, false ),
#endif
TRAP_ENTRY(exc_page_fault, false ),
TRAP_ENTRY(exc_divide_error, false ),
@@ -625,7 +723,7 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_coprocessor_error, false ),
TRAP_ENTRY(exc_alignment_check, false ),
TRAP_ENTRY(exc_simd_coprocessor_error, false ),
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
TRAP_ENTRY(exc_control_protection, false ),
#endif
};
@@ -762,6 +860,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
{
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
+ static const struct trap_info zero = { };
unsigned out;
trace_xen_cpu_load_idt(desc);
@@ -771,7 +870,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
out = xen_convert_trap_info(desc, traps, false);
- memset(&traps[out], 0, sizeof(traps[0]));
+ traps[out] = zero;
xen_mc_flush();
if (HYPERVISOR_set_trap_table(traps))
@@ -839,7 +938,7 @@ static void xen_load_sp0(unsigned long sp0)
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
@@ -903,7 +1002,7 @@ static void xen_write_cr0(unsigned long cr0)
MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_write_cr4(unsigned long cr4)
@@ -913,14 +1012,18 @@ static void xen_write_cr4(unsigned long cr4)
native_write_cr4(cr4);
}
-static u64 xen_read_msr_safe(unsigned int msr, int *err)
+static u64 xen_do_read_msr(unsigned int msr, int *err)
{
- u64 val;
+ u64 val = 0; /* Avoid uninitialized value for safe variant. */
if (pmu_msr_read(msr, &val, err))
return val;
- val = native_read_msr_safe(msr, err);
+ if (err)
+ val = native_read_msr_safe(msr, err);
+ else
+ val = native_read_msr(msr);
+
switch (msr) {
case MSR_IA32_APICBASE:
val &= ~X2APIC_ENABLE;
@@ -929,23 +1032,39 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err)
return val;
}
-static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+static void set_seg(unsigned int which, unsigned int low, unsigned int high,
+ int *err)
{
- int ret;
- unsigned int which;
- u64 base;
+ u64 base = ((u64)high << 32) | low;
- ret = 0;
+ if (HYPERVISOR_set_segment_base(which, base) == 0)
+ return;
+
+ if (err)
+ *err = -EIO;
+ else
+ WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base);
+}
+/*
+ * Support write_msr_safe() and write_msr() semantics.
+ * With err == NULL write_msr() semantics are selected.
+ * Supplying an err pointer requires err to be pre-initialized with 0.
+ */
+static void xen_do_write_msr(unsigned int msr, unsigned int low,
+ unsigned int high, int *err)
+{
switch (msr) {
- case MSR_FS_BASE: which = SEGBASE_FS; goto set;
- case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
- case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
-
- set:
- base = ((u64)high << 32) | low;
- if (HYPERVISOR_set_segment_base(which, base) != 0)
- ret = -EIO;
+ case MSR_FS_BASE:
+ set_seg(SEGBASE_FS, low, high, err);
+ break;
+
+ case MSR_KERNEL_GS_BASE:
+ set_seg(SEGBASE_GS_USER, low, high, err);
+ break;
+
+ case MSR_GS_BASE:
+ set_seg(SEGBASE_GS_KERNEL, low, high, err);
break;
case MSR_STAR:
@@ -961,31 +1080,42 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
break;
default:
- if (!pmu_msr_write(msr, low, high, &ret))
- ret = native_write_msr_safe(msr, low, high);
+ if (!pmu_msr_write(msr, low, high, err)) {
+ if (err)
+ *err = native_write_msr_safe(msr, low, high);
+ else
+ native_write_msr(msr, low, high);
+ }
}
+}
+
+static u64 xen_read_msr_safe(unsigned int msr, int *err)
+{
+ return xen_do_read_msr(msr, err);
+}
+
+static int xen_write_msr_safe(unsigned int msr, unsigned int low,
+ unsigned int high)
+{
+ int err = 0;
+
+ xen_do_write_msr(msr, low, high, &err);
- return ret;
+ return err;
}
static u64 xen_read_msr(unsigned int msr)
{
- /*
- * This will silently swallow a #GP from RDMSR. It may be worth
- * changing that.
- */
int err;
- return xen_read_msr_safe(msr, &err);
+ return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
}
static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
{
- /*
- * This will silently swallow a #GP from WRMSR. It may be worth
- * changing that.
- */
- xen_write_msr_safe(msr, low, high);
+ int err;
+
+ xen_do_write_msr(msr, low, high, xen_msr_safe ? &err : NULL);
}
/* This is called once we have the cpu_possible_mask */
@@ -1022,7 +1152,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
.write_cr4 = xen_write_cr4,
- .wbinvd = native_wbinvd,
+ .wbinvd = pv_native_wbinvd,
.read_msr = xen_read_msr,
.write_msr = xen_write_msr,
@@ -1055,7 +1185,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
#endif
.io_delay = xen_io_delay,
- .start_context_switch = paravirt_start_context_switch,
+ .start_context_switch = xen_start_context_switch,
.end_context_switch = xen_end_context_switch,
},
};
@@ -1164,7 +1294,7 @@ static void __init xen_setup_gdt(int cpu)
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
pv_ops.cpu.load_gdt = xen_load_gdt_boot;
- switch_to_new_gdt(cpu);
+ switch_gdt_and_percpu_base(cpu);
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
pv_ops.cpu.load_gdt = xen_load_gdt;
@@ -1220,10 +1350,12 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
xen_vcpu_info_reset(0);
x86_platform.get_nmi_reason = xen_get_nmi_reason;
+ x86_platform.realmode_reserve = x86_init_noop;
+ x86_platform.realmode_init = x86_init_noop;
x86_init.resources.memory_setup = xen_memory_setup;
x86_init.irqs.intr_mode_select = x86_init_noop;
- x86_init.irqs.intr_mode_init = x86_init_noop;
+ x86_init.irqs.intr_mode_init = x86_64_probe_apic;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
x86_init.hyper.init_platform = xen_pv_init_platform;
@@ -1263,12 +1395,10 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
xen_init_capabilities();
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* set up the basic apic ops.
*/
xen_init_apic();
-#endif
machine_ops = xen_machine_ops;
@@ -1341,7 +1471,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
x86_platform.set_legacy_features =
xen_dom0_set_legacy_features;
- xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_init_vga(info, xen_start_info->console.dom0.info_size,
+ &boot_params.screen_info);
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index bcae606bbc5c..27a2a02ef8fb 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -1,9 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/acpi.h>
#include <linux/export.h>
+#include <linux/mm.h>
#include <xen/hvc-console.h>
+#include <asm/bootparam.h>
#include <asm/io_apic.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
@@ -43,6 +45,19 @@ void __init xen_pvh_init(struct boot_params *boot_params)
x86_init.oem.banner = xen_banner;
xen_efi_init(boot_params);
+
+ if (xen_initial_domain()) {
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_dom0_console,
+ };
+ int ret = HYPERVISOR_platform_op(&op);
+
+ if (ret > 0)
+ xen_init_vga(&op.u.dom0_console,
+ min(ret * sizeof(char),
+ sizeof(op.u.dom0_console)),
+ &boot_params->screen_info);
+ }
}
void __init mem_map_via_hcall(struct boot_params *boot_params_p)
@@ -59,3 +74,70 @@ void __init mem_map_via_hcall(struct boot_params *boot_params_p)
}
boot_params_p->e820_entries = memmap.nr_entries;
}
+
+/*
+ * Reserve e820 UNUSABLE regions to inflate the memory balloon.
+ *
+ * On PVH dom0 the host memory map is used, RAM regions available to dom0 are
+ * located as the same place as in the native memory map, but since dom0 gets
+ * less memory than the total amount of host RAM the ranges that can't be
+ * populated are converted from RAM -> UNUSABLE. Use such regions (up to the
+ * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at
+ * boot. Doing so prevents the guest (even if just temporary) from using holes
+ * in the memory map in order to map grants or foreign addresses, and
+ * hopefully limits the risk of a clash with a device MMIO region. Ideally the
+ * hypervisor should notify us which memory ranges are suitable for creating
+ * foreign mappings, but that's not yet implemented.
+ */
+void __init xen_reserve_extra_memory(struct boot_params *bootp)
+{
+ unsigned int i, ram_pages = 0, extra_pages;
+
+ for (i = 0; i < bootp->e820_entries; i++) {
+ struct boot_e820_entry *e = &bootp->e820_table[i];
+
+ if (e->type != E820_TYPE_RAM)
+ continue;
+ ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr);
+ }
+
+ /* Max amount of extra memory. */
+ extra_pages = EXTRA_MEM_RATIO * ram_pages;
+
+ /*
+ * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping
+ * purposes.
+ */
+ for (i = 0; i < bootp->e820_entries && extra_pages; i++) {
+ struct boot_e820_entry *e = &bootp->e820_table[i];
+ unsigned long pages;
+
+ if (e->type != E820_TYPE_UNUSABLE)
+ continue;
+
+ pages = min(extra_pages,
+ PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr));
+
+ if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) {
+ struct boot_e820_entry *next;
+
+ if (bootp->e820_entries ==
+ ARRAY_SIZE(bootp->e820_table))
+ /* No space left to split - skip region. */
+ continue;
+
+ /* Split entry. */
+ next = e + 1;
+ memmove(next, e,
+ (bootp->e820_entries - i) * sizeof(*e));
+ bootp->e820_entries++;
+ next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages);
+ e->size = next->addr - e->addr;
+ next->size -= e->size;
+ }
+ e->type = E820_TYPE_RAM;
+ extra_pages -= pages;
+
+ xen_add_extra_mem(PFN_UP(e->addr), pages);
+ }
+}
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 06c3c2fb4b06..39982f955cfe 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -24,7 +24,7 @@ noinstr void xen_force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
-static void xen_safe_halt(void)
+static noinstr void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
@@ -45,7 +45,7 @@ static const typeof(pv_ops) xen_irq_ops __initconst = {
/* Initial interrupt flag handling only called while interrupts off. */
.save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0),
.irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop),
- .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG),
+ .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func),
.safe_halt = xen_safe_halt,
.halt = xen_halt,
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index ee29fb558f2e..54e0d311dcc9 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -34,7 +34,7 @@
* would need to validate the whole pagetable before going on.
* Naturally, this is quite slow. The solution is to "pin" a
* pagetable, which enforces all the constraints on the pagetable even
- * when it is not actively in use. This menas that Xen can be assured
+ * when it is not actively in use. This means that Xen can be assured
* that it is still valid when you do load it into %cr3, and doesn't
* need to revalidate it.
*
@@ -86,6 +86,22 @@
#include "mmu.h"
#include "debugfs.h"
+/*
+ * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
+ * to avoid warnings with "-Wmissing-prototypes".
+ */
+pteval_t xen_pte_val(pte_t pte);
+pgdval_t xen_pgd_val(pgd_t pgd);
+pmdval_t xen_pmd_val(pmd_t pmd);
+pudval_t xen_pud_val(pud_t pud);
+p4dval_t xen_p4d_val(p4d_t p4d);
+pte_t xen_make_pte(pteval_t pte);
+pgd_t xen_make_pgd(pgdval_t pgd);
+pmd_t xen_make_pmd(pmdval_t pmd);
+pud_t xen_make_pud(pudval_t pud);
+p4d_t xen_make_p4d(p4dval_t p4d);
+pte_t xen_make_pte_init(pteval_t pte);
+
#ifdef CONFIG_X86_VSYSCALL_EMULATION
/* l3 pud for userspace vsyscall mapping */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
@@ -150,7 +166,7 @@ void make_lowmem_page_readwrite(void *vaddr)
if (pte == NULL)
return; /* vaddr missing */
- ptev = pte_mkwrite(*pte);
+ ptev = pte_mkwrite_novma(*pte);
if (HYPERVISOR_update_va_mapping(address, ptev, 0))
BUG();
@@ -220,7 +236,7 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
u.val = pmd_val_ma(val);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -254,7 +270,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
struct mmu_update u;
- if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+ if (xen_get_lazy_mode() != XEN_LAZY_MMU)
return false;
xen_mc_batch();
@@ -263,7 +279,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
u.val = pte_val_ma(pteval);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
return true;
}
@@ -309,7 +325,7 @@ void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
u.val = pte_val_ma(pte);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
/* Assume pteval_t is equivalent to all the other *val_t types. */
@@ -403,7 +419,7 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
u.val = pud_val_ma(val);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -483,7 +499,7 @@ static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
__xen_set_p4d_hyper(ptr, val);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -515,7 +531,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
if (user_ptr)
__xen_set_p4d_hyper((p4d_t *)user_ptr, val);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
#if CONFIG_PGTABLE_LEVELS >= 5
@@ -651,7 +667,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
spinlock_t *ptl = NULL;
#if USE_SPLIT_PTE_PTLOCKS
- ptl = ptlock_ptr(page);
+ ptl = ptlock_ptr(page_ptdesc(page));
spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
@@ -885,14 +901,7 @@ void xen_mm_unpin_all(void)
spin_unlock(&pgd_lock);
}
-static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
- spin_lock(&next->page_table_lock);
- xen_pgd_pin(next);
- spin_unlock(&next->page_table_lock);
-}
-
-static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_enter_mmap(struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm);
@@ -904,7 +913,7 @@ static void drop_mm_ref_this_cpu(void *info)
struct mm_struct *mm = info;
if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
- leave_mm(smp_processor_id());
+ leave_mm();
/*
* If this cpu still has a stale cr3 reference, then make sure
@@ -1050,7 +1059,7 @@ static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
pte_t *pte_tbl;
int i;
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PMD_SIZE);
return;
@@ -1073,7 +1082,7 @@ static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
pmd_t *pmd_tbl;
int i;
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PUD_SIZE);
return;
@@ -1095,7 +1104,7 @@ static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
pud_t *pud_tbl;
int i;
- if (p4d_large(*p4d)) {
+ if (p4d_leaf(*p4d)) {
pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, P4D_SIZE);
return;
@@ -1236,7 +1245,7 @@ static noinline void xen_flush_tlb(void)
op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -1256,7 +1265,7 @@ static void xen_flush_tlb_one_user(unsigned long addr)
op->arg1.linear_addr = addr & PAGE_MASK;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -1293,7 +1302,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus,
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
static unsigned long xen_read_cr3(void)
@@ -1352,7 +1361,7 @@ static void xen_write_cr3(unsigned long cr3)
else
__xen_write_cr3(false, 0);
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+ xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */
}
/*
@@ -1387,7 +1396,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
__xen_write_cr3(true, cr3);
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+ xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */
}
static int xen_pgd_alloc(struct mm_struct *mm)
@@ -1548,7 +1557,7 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned)
__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
}
@@ -1578,7 +1587,7 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
__set_pfn_prot(pfn, PAGE_KERNEL);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
ClearPagePinned(page);
}
@@ -1795,7 +1804,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
*/
xen_mc_batch();
__xen_write_cr3(true, __pa(init_top_pgt));
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
* set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
@@ -1854,7 +1863,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
if (!pud_present(pud))
return 0;
pa = pud_val(pud) & PTE_PFN_MASK;
- if (pud_large(pud))
+ if (pud_leaf(pud))
return pa + (vaddr & ~PUD_MASK);
pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
@@ -1862,7 +1871,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
if (!pmd_present(pmd))
return 0;
pa = pmd_val(pmd) & PTE_PFN_MASK;
- if (pmd_large(pmd))
+ if (pmd_leaf(pmd))
return pa + (vaddr & ~PMD_MASK);
pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
@@ -2074,6 +2083,23 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
+static void xen_enter_lazy_mmu(void)
+{
+ enter_lazy(XEN_LAZY_MMU);
+}
+
+static void xen_flush_lazy_mmu(void)
+{
+ preempt_disable();
+
+ if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+
+ preempt_enable();
+}
+
static void __init xen_post_allocator_init(void)
{
pv_ops.mmu.set_pte = xen_set_pte;
@@ -2098,7 +2124,7 @@ static void xen_leave_lazy_mmu(void)
{
preempt_disable();
xen_mc_flush();
- paravirt_leave_lazy_mmu();
+ leave_lazy(XEN_LAZY_MMU);
preempt_enable();
}
@@ -2153,14 +2179,13 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = {
.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
#endif
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
+ .enter_mmap = xen_enter_mmap,
.exit_mmap = xen_exit_mmap,
.lazy_mode = {
- .enter = paravirt_enter_lazy_mmu,
+ .enter = xen_enter_lazy_mmu,
.leave = xen_leave_lazy_mmu,
- .flush = paravirt_flush_lazy_mmu,
+ .flush = xen_flush_lazy_mmu,
},
.set_fixmap = xen_set_fixmap,
@@ -2194,13 +2219,13 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
mcs = __xen_mc_entry(0);
if (in_frames)
- in_frames[i] = virt_to_mfn(vaddr);
+ in_frames[i] = virt_to_mfn((void *)vaddr);
MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
- __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+ __set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY);
if (out_frames)
- out_frames[i] = virt_to_pfn(vaddr);
+ out_frames[i] = virt_to_pfn((void *)vaddr);
}
xen_mc_issue(0);
}
@@ -2242,7 +2267,7 @@ static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
MULTI_update_va_mapping(mcs.mc, vaddr,
mfn_pte(mfn, PAGE_KERNEL), flags);
- set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn);
}
xen_mc_issue(0);
@@ -2302,12 +2327,6 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
int success;
unsigned long vstart = (unsigned long)phys_to_virt(pstart);
- /*
- * Currently an auto-translated guest will not perform I/O, nor will
- * it require PAE page directories below 4GB. Therefore any calls to
- * this function are redundant and can be ignored.
- */
-
if (unlikely(order > MAX_CONTIG_ORDER))
return -ENOMEM;
@@ -2319,7 +2338,7 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
xen_zap_pfn_range(vstart, order, in_frames, NULL);
/* 2. Get a new contiguous memory extent. */
- out_frame = virt_to_pfn(vstart);
+ out_frame = virt_to_pfn((void *)vstart);
success = xen_exchange_memory(1UL << order, 0, in_frames,
1, order, &out_frame,
address_bits);
@@ -2352,7 +2371,7 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
spin_lock_irqsave(&xen_reservation_lock, flags);
/* 1. Find start MFN of contiguous extent. */
- in_frame = virt_to_mfn(vstart);
+ in_frame = virt_to_mfn((void *)vstart);
/* 2. Zap current PTEs. */
xen_zap_pfn_range(vstart, order, NULL, out_frames);
@@ -2383,7 +2402,7 @@ static noinline void xen_flush_tlb_all(void)
op->cmd = MMUEXT_TLB_FLUSH_ALL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -2501,7 +2520,7 @@ out:
}
EXPORT_SYMBOL_GPL(xen_remap_pfn);
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_VMCORE_INFO
phys_addr_t paddr_vmcoreinfo_note(void)
{
if (xen_pv_domain())
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 1c51b2c87f30..c3867b585e0d 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -26,7 +26,7 @@ static inline void xen_mc_batch(void)
/* need to disable interrupts until this entry is complete */
local_irq_save(flags);
- trace_xen_mc_batch(paravirt_get_lazy_mode());
+ trace_xen_mc_batch(xen_get_lazy_mode());
__this_cpu_write(xen_mc_irq_flags, flags);
}
@@ -44,7 +44,7 @@ static inline void xen_mc_issue(unsigned mode)
{
trace_xen_mc_issue(mode);
- if ((paravirt_get_lazy_mode() & mode) == 0)
+ if ((xen_get_lazy_mode() & mode) == 0)
xen_mc_flush();
/* restore flags saved in xen_mc_batch */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 58db86f7b384..9bdc3b656b2c 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -134,11 +134,6 @@ static inline unsigned p2m_mid_index(unsigned long pfn)
return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
-static inline unsigned p2m_index(unsigned long pfn)
-{
- return pfn % P2M_PER_PAGE;
-}
-
static void p2m_top_mfn_init(unsigned long *top)
{
unsigned i;
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 21ecbe754cb2..246d67dab510 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -131,6 +131,10 @@ static inline uint32_t get_fam15h_addr(u32 addr)
static inline bool is_amd_pmu_msr(unsigned int msr)
{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return false;
+
if ((msr >= MSR_F15H_PERF_CTL &&
msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
(msr >= MSR_K7_EVNTSEL0 &&
@@ -140,10 +144,15 @@ static inline bool is_amd_pmu_msr(unsigned int msr)
return false;
}
-static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index)
{
u32 msr_index_pmc;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+ return false;
+
switch (msr_index) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
case MSR_IA32_DS_AREA:
@@ -290,48 +299,52 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
return false;
}
+static bool pmu_msr_chk_emulated(unsigned int msr, uint64_t *val, bool is_read,
+ bool *emul)
+{
+ int type, index = 0;
+
+ if (is_amd_pmu_msr(msr))
+ *emul = xen_amd_pmu_emulate(msr, val, is_read);
+ else if (is_intel_pmu_msr(msr, &type, &index))
+ *emul = xen_intel_pmu_emulate(msr, val, type, index, is_read);
+ else
+ return false;
+
+ return true;
+}
+
bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
{
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
- if (is_amd_pmu_msr(msr)) {
- if (!xen_amd_pmu_emulate(msr, val, 1))
- *val = native_read_msr_safe(msr, err);
- return true;
- }
- } else {
- int type, index;
+ bool emulated;
- if (is_intel_pmu_msr(msr, &type, &index)) {
- if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
- *val = native_read_msr_safe(msr, err);
- return true;
- }
+ if (!pmu_msr_chk_emulated(msr, val, true, &emulated))
+ return false;
+
+ if (!emulated) {
+ *val = err ? native_read_msr_safe(msr, err)
+ : native_read_msr(msr);
}
- return false;
+ return true;
}
bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
{
uint64_t val = ((uint64_t)high << 32) | low;
+ bool emulated;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
- if (is_amd_pmu_msr(msr)) {
- if (!xen_amd_pmu_emulate(msr, &val, 0))
- *err = native_write_msr_safe(msr, low, high);
- return true;
- }
- } else {
- int type, index;
+ if (!pmu_msr_chk_emulated(msr, &val, false, &emulated))
+ return false;
- if (is_intel_pmu_msr(msr, &type, &index)) {
- if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
- *err = native_write_msr_safe(msr, low, high);
- return true;
- }
+ if (!emulated) {
+ if (err)
+ *err = native_write_msr_safe(msr, low, high);
+ else
+ native_write_msr(msr, low, high);
}
- return false;
+ return true;
}
static unsigned long long xen_amd_read_pmc(int counter)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index cfa99e8f054b..380591028cb8 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -6,7 +6,9 @@
*/
#include <linux/init.h>
+#include <linux/iscsi_ibft.h>
#include <linux/sched.h>
+#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
@@ -36,12 +38,12 @@
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
-/* Amount of extra memory space we add to the e820 ranges */
-struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
-
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
+/* Memory map would allow PCI passthrough. */
+bool xen_pv_pci_possible;
+
/* E820 map used during setting up memory. */
static struct e820_table xen_e820_table __initdata;
@@ -59,18 +61,6 @@ static struct {
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
-/*
- * The maximum amount of extra memory compared to the base size. The
- * main scaling factor is the size of struct page. At extreme ratios
- * of base:extra, all the base memory can be filled with page
- * structures for the extra memory, leaving no space for anything
- * else.
- *
- * 10x seems like a reasonable balance between scaling flexibility and
- * leaving a practically usable system.
- */
-#define EXTRA_MEM_RATIO (10)
-
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
@@ -85,41 +75,12 @@ static void __init xen_parse_512gb(void)
arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
if (!arg)
val = true;
- else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+ else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val))
return;
xen_512gb_limit = val;
}
-static void __init xen_add_extra_mem(unsigned long start_pfn,
- unsigned long n_pfns)
-{
- int i;
-
- /*
- * No need to check for zero size, should happen rarely and will only
- * write a new entry regarded to be unused due to zero size.
- */
- for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- /* Add new region. */
- if (xen_extra_mem[i].n_pfns == 0) {
- xen_extra_mem[i].start_pfn = start_pfn;
- xen_extra_mem[i].n_pfns = n_pfns;
- break;
- }
- /* Append to existing region. */
- if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
- start_pfn) {
- xen_extra_mem[i].n_pfns += n_pfns;
- break;
- }
- }
- if (i == XEN_EXTRA_MEM_MAX_REGIONS)
- printk(KERN_WARNING "Warning: not enough extra memory regions\n");
-
- memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
-}
-
static void __init xen_del_extra_mem(unsigned long start_pfn,
unsigned long n_pfns)
{
@@ -338,7 +299,7 @@ static void __init xen_do_set_identity_and_remap_chunk(
WARN_ON(size == 0);
- mfn_save = virt_to_mfn(buf);
+ mfn_save = virt_to_mfn((void *)buf);
for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
ident_pfn_iter < ident_end_pfn;
@@ -501,7 +462,7 @@ void __init xen_remap_memory(void)
unsigned long pfn_s = ~0UL;
unsigned long len = 0;
- mfn_save = virt_to_mfn(buf);
+ mfn_save = virt_to_mfn((void *)buf);
while (xen_remap_mfn != INVALID_P2M_ENTRY) {
/* Map the remap information */
@@ -763,17 +724,26 @@ char * __init xen_memory_setup(void)
BUG_ON(memmap.nr_entries == 0);
xen_e820_table.nr_entries = memmap.nr_entries;
- /*
- * Xen won't allow a 1:1 mapping to be created to UNUSABLE
- * regions, so if we're using the machine memory map leave the
- * region as RAM as it is in the pseudo-physical map.
- *
- * UNUSABLE regions in domUs are not handled and will need
- * a patch in the future.
- */
- if (xen_initial_domain())
+ if (xen_initial_domain()) {
+ /*
+ * Xen won't allow a 1:1 mapping to be created to UNUSABLE
+ * regions, so if we're using the machine memory map leave the
+ * region as RAM as it is in the pseudo-physical map.
+ *
+ * UNUSABLE regions in domUs are not handled and will need
+ * a patch in the future.
+ */
xen_ignore_unusable();
+#ifdef CONFIG_ISCSI_IBFT_FIND
+ /* Reserve 0.5 MiB to 1 MiB region so iBFT can be found */
+ xen_e820_table.entries[xen_e820_table.nr_entries].addr = IBFT_START;
+ xen_e820_table.entries[xen_e820_table.nr_entries].size = IBFT_END - IBFT_START;
+ xen_e820_table.entries[xen_e820_table.nr_entries].type = E820_TYPE_RESERVED;
+ xen_e820_table.nr_entries++;
+#endif
+ }
+
/* Make sure the Xen-supplied memory map is well-ordered. */
e820__update_table(&xen_e820_table);
@@ -803,6 +773,9 @@ char * __init xen_memory_setup(void)
chunk_size = size;
type = xen_e820_table.entries[i].type;
+ if (type == E820_TYPE_RESERVED)
+ xen_pv_pci_possible = true;
+
if (type == E820_TYPE_RAM) {
if (addr < mem_end) {
chunk_size = min(size, mem_end - addr);
@@ -910,17 +883,9 @@ static int register_callback(unsigned type, const void *func)
void xen_enable_sysenter(void)
{
- int ret;
- unsigned sysenter_feature;
-
- sysenter_feature = X86_FEATURE_SYSENTER32;
-
- if (!boot_cpu_has(sysenter_feature))
- return;
-
- ret = register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat);
- if(ret != 0)
- setup_clear_cpu_cap(sysenter_feature);
+ if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) &&
+ register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat))
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
}
void xen_enable_syscall(void)
@@ -934,22 +899,15 @@ void xen_enable_syscall(void)
mechanism for syscalls. */
}
- if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
- ret = register_callback(CALLBACKTYPE_syscall32,
- xen_entry_SYSCALL_compat);
- if (ret != 0)
- setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
- }
+ if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) &&
+ register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat))
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
static void __init xen_pvmmu_arch_setup(void)
{
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
- HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_pae_extended_cr3);
-
if (register_callback(CALLBACKTYPE_event,
xen_asm_exc_xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c3e1f9a7d43a..935771726f9c 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -32,30 +32,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
void xen_smp_intr_free(unsigned int cpu)
{
+ kfree(per_cpu(xen_resched_irq, cpu).name);
+ per_cpu(xen_resched_irq, cpu).name = NULL;
if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
per_cpu(xen_resched_irq, cpu).irq = -1;
- kfree(per_cpu(xen_resched_irq, cpu).name);
- per_cpu(xen_resched_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_callfunc_irq, cpu).name);
+ per_cpu(xen_callfunc_irq, cpu).name = NULL;
if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL);
per_cpu(xen_callfunc_irq, cpu).irq = -1;
- kfree(per_cpu(xen_callfunc_irq, cpu).name);
- per_cpu(xen_callfunc_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_debug_irq, cpu).name);
+ per_cpu(xen_debug_irq, cpu).name = NULL;
if (per_cpu(xen_debug_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL);
per_cpu(xen_debug_irq, cpu).irq = -1;
- kfree(per_cpu(xen_debug_irq, cpu).name);
- per_cpu(xen_debug_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq,
NULL);
per_cpu(xen_callfuncsingle_irq, cpu).irq = -1;
- kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
- per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
}
}
@@ -65,6 +65,9 @@ int xen_smp_intr_init(unsigned int cpu)
char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ if (!resched_name)
+ goto fail_mem;
+ per_cpu(xen_resched_irq, cpu).name = resched_name;
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
@@ -74,9 +77,11 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_resched_irq, cpu).irq = rc;
- per_cpu(xen_resched_irq, cpu).name = resched_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ if (!callfunc_name)
+ goto fail_mem;
+ per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
@@ -86,10 +91,13 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_callfunc_irq, cpu).irq = rc;
- per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
if (!xen_fifo_events) {
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ if (!debug_name)
+ goto fail_mem;
+
+ per_cpu(xen_debug_irq, cpu).name = debug_name;
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu,
xen_debug_interrupt,
IRQF_PERCPU | IRQF_NOBALANCING,
@@ -97,10 +105,13 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_debug_irq, cpu).irq = rc;
- per_cpu(xen_debug_irq, cpu).name = debug_name;
}
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+ if (!callfunc_name)
+ goto fail_mem;
+
+ per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
@@ -110,10 +121,11 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
- per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
return 0;
+ fail_mem:
+ rc = -ENOMEM;
fail:
xen_smp_intr_free(cpu);
return rc;
@@ -123,8 +135,6 @@ void __init xen_smp_cpus_done(unsigned int max_cpus)
{
if (xen_hvm_domain())
native_smp_cpus_done(max_cpus);
- else
- calculate_max_logical_packages();
}
void xen_smp_send_reschedule(int cpu)
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index bd02f9d50107..b8efdbc693f7 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -2,6 +2,10 @@
#ifndef _XEN_SMP_H
#ifdef CONFIG_SMP
+
+void asm_cpu_bringup_and_idle(void);
+asmlinkage void cpu_bringup_and_idle(void);
+
extern void xen_send_IPI_mask(const struct cpumask *mask,
int vector);
extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
@@ -15,12 +19,15 @@ extern void xen_smp_intr_free(unsigned int cpu);
int xen_smp_intr_init_pv(unsigned int cpu);
void xen_smp_intr_free_pv(unsigned int cpu);
+void xen_smp_count_cpus(void);
void xen_smp_cpus_done(unsigned int max_cpus);
void xen_smp_send_reschedule(int cpu);
void xen_smp_send_call_function_ipi(const struct cpumask *mask);
void xen_smp_send_call_function_single_ipi(int cpu);
+void __noreturn xen_cpu_bringup_again(unsigned long stack);
+
struct xen_common_irq {
int irq;
char *name;
@@ -38,6 +45,7 @@ static inline int xen_smp_intr_init_pv(unsigned int cpu)
return 0;
}
static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
+static inline void xen_smp_count_cpus(void) { }
#endif /* CONFIG_SMP */
#endif
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index b70afdff419c..ac95d1981cc0 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -55,18 +55,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
}
#ifdef CONFIG_HOTPLUG_CPU
-static void xen_hvm_cpu_die(unsigned int cpu)
+static void xen_hvm_cleanup_dead_cpu(unsigned int cpu)
{
- if (common_cpu_die(cpu) == 0) {
- if (xen_have_vector_callback) {
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- xen_teardown_timer(cpu);
- }
+ if (xen_have_vector_callback) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
}
}
#else
-static void xen_hvm_cpu_die(unsigned int cpu)
+static void xen_hvm_cleanup_dead_cpu(unsigned int cpu)
{
BUG();
}
@@ -77,7 +75,7 @@ void __init xen_hvm_smp_init(void)
smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
smp_ops.smp_cpus_done = xen_smp_cpus_done;
- smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.cleanup_dead_cpu = xen_hvm_cleanup_dead_cpu;
if (!xen_have_vector_callback) {
#ifdef CONFIG_PARAVIRT_SPINLOCKS
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index ba7af2eca755..27d1a5b7f571 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -29,6 +29,7 @@
#include <asm/idtentry.h>
#include <asm/desc.h>
#include <asm/cpu.h>
+#include <asm/apic.h>
#include <asm/io_apic.h>
#include <xen/interface/xen.h>
@@ -55,14 +56,15 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
-void asm_cpu_bringup_and_idle(void);
static void cpu_bringup(void)
{
int cpu;
cr4_init();
+ cpuhp_ap_sync_alive();
cpu_init();
+ fpu__init_cpu();
touch_softlockup_watchdog();
/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
@@ -72,7 +74,6 @@ static void cpu_bringup(void)
}
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
- cpu_data(cpu).x86_max_cores = 1;
set_cpu_sibling_map(cpu);
speculative_store_bypass_ht_init();
@@ -83,7 +84,7 @@ static void cpu_bringup(void)
set_cpu_online(cpu, true);
- cpu_set_state_online(cpu); /* Implies full memory barrier. */
+ smp_mb();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
@@ -97,18 +98,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void)
void xen_smp_intr_free_pv(unsigned int cpu)
{
+ kfree(per_cpu(xen_irq_work, cpu).name);
+ per_cpu(xen_irq_work, cpu).name = NULL;
if (per_cpu(xen_irq_work, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
per_cpu(xen_irq_work, cpu).irq = -1;
- kfree(per_cpu(xen_irq_work, cpu).name);
- per_cpu(xen_irq_work, cpu).name = NULL;
}
+ kfree(per_cpu(xen_pmu_irq, cpu).name);
+ per_cpu(xen_pmu_irq, cpu).name = NULL;
if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
per_cpu(xen_pmu_irq, cpu).irq = -1;
- kfree(per_cpu(xen_pmu_irq, cpu).name);
- per_cpu(xen_pmu_irq, cpu).name = NULL;
}
}
@@ -118,6 +119,7 @@ int xen_smp_intr_init_pv(unsigned int cpu)
char *callfunc_name, *pmu_name;
callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+ per_cpu(xen_irq_work, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
cpu,
xen_irq_work_interrupt,
@@ -127,10 +129,10 @@ int xen_smp_intr_init_pv(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_irq_work, cpu).irq = rc;
- per_cpu(xen_irq_work, cpu).name = callfunc_name;
if (is_xen_pmu) {
pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+ per_cpu(xen_pmu_irq, cpu).name = pmu_name;
rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
xen_pmu_irq_handler,
IRQF_PERCPU|IRQF_NOBALANCING,
@@ -138,7 +140,6 @@ int xen_smp_intr_init_pv(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_pmu_irq, cpu).irq = rc;
- per_cpu(xen_pmu_irq, cpu).name = pmu_name;
}
return 0;
@@ -148,40 +149,18 @@ int xen_smp_intr_init_pv(unsigned int cpu)
return rc;
}
-static void __init _get_smp_config(unsigned int early)
+static void __init xen_pv_smp_config(void)
{
- int i, rc;
- unsigned int subtract = 0;
-
- if (early)
- return;
-
- num_processors = 0;
- disabled_cpus = 0;
- for (i = 0; i < nr_cpu_ids; i++) {
- rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0) {
- num_processors++;
- set_cpu_possible(i, true);
- } else {
- set_cpu_possible(i, false);
- set_cpu_present(i, false);
- subtract++;
- }
- }
-#ifdef CONFIG_HOTPLUG_CPU
- /* This is akin to using 'nr_cpus' on the Linux command line.
- * Which is OK as when we use 'dom0_max_vcpus=X' we can only
- * have up to X, while nr_cpu_ids is greater than X. This
- * normally is not a problem, except when CPU hotplugging
- * is involved and then there might be more than X CPUs
- * in the guest - which will not work as there is no
- * hypercall to expand the max number of VCPUs an already
- * running guest has. So cap it up to X. */
- if (subtract)
- nr_cpu_ids = nr_cpu_ids - subtract;
-#endif
+ u32 apicid = 0;
+ int i;
+
+ topology_register_boot_apic(apicid++);
+ for (i = 1; i < nr_cpu_ids; i++)
+ topology_register_apic(apicid++, CPU_ACPIID_INVALID, true);
+
+ /* Pretend to be a proper enumerated system */
+ smp_found_config = 1;
}
static void __init xen_pv_smp_prepare_boot_cpu(void)
@@ -209,7 +188,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
- if (skip_ioapic_setup) {
+ if (ioapic_is_disabled) {
char *m = (max_cpus == 0) ?
"The nosmp parameter is incompatible with Xen; " \
"use Xen dom0_max_vcpus=1 parameter" :
@@ -222,8 +201,6 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
smp_prepare_cpus_common();
- cpu_data(0).x86_max_cores = 1;
-
speculative_store_bypass_ht_init();
xen_pmu_init(0);
@@ -254,15 +231,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
struct desc_struct *gdt;
unsigned long gdt_mfn;
- /* used to tell cpu_init() that it can proceed with initialization */
- cpumask_set_cpu(cpu, cpu_callout_mask);
if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL) {
cpumask_clear_cpu(cpu, xen_cpu_initialized_map);
- cpumask_clear_cpu(cpu, cpu_callout_mask);
return -ENOMEM;
}
@@ -316,7 +290,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
}
-static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
+static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle)
{
int rc;
@@ -326,14 +300,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
xen_setup_runstate_info(cpu);
- /*
- * PV VCPUs are always successfully taken down (see 'while' loop
- * in xen_cpu_die()), so -EBUSY is an error.
- */
- rc = cpu_check_up_prepare(cpu);
- if (rc)
- return rc;
-
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -343,15 +309,20 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
xen_pmu_init(cpu);
- rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
- BUG_ON(rc);
-
- while (cpu_report_state(cpu) != CPU_ONLINE)
- HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+ /*
+ * Why is this a BUG? If the hypercall fails then everything can be
+ * rolled back, no?
+ */
+ BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL));
return 0;
}
+static void xen_pv_poll_sync_state(void)
+{
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
static int xen_pv_cpu_disable(void)
{
@@ -367,35 +338,26 @@ static int xen_pv_cpu_disable(void)
static void xen_pv_cpu_die(unsigned int cpu)
{
- while (HYPERVISOR_vcpu_op(VCPUOP_is_up,
- xen_vcpu_nr(cpu), NULL)) {
+ while (HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), NULL)) {
__set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(HZ/10);
}
+}
- if (common_cpu_die(cpu) == 0) {
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- xen_teardown_timer(cpu);
- xen_pmu_finish(cpu);
- }
+static void xen_pv_cleanup_dead_cpu(unsigned int cpu)
+{
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+ xen_pmu_finish(cpu);
}
-static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
+static void __noreturn xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
- cpu_bringup();
- /*
- * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
- * clears certain data that the cpu_idle loop (which called us
- * and that we return from) expects. The only way to get that
- * data back is to call:
- */
- tick_nohz_idle_enter();
- tick_nohz_idle_stop_tick_protected();
-
- cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
+ xen_cpu_bringup_again((unsigned long)task_pt_regs(current));
+ BUG();
}
#else /* !CONFIG_HOTPLUG_CPU */
@@ -409,7 +371,12 @@ static void xen_pv_cpu_die(unsigned int cpu)
BUG();
}
-static void xen_pv_play_dead(void)
+static void xen_pv_cleanup_dead_cpu(unsigned int cpu)
+{
+ BUG();
+}
+
+static void __noreturn xen_pv_play_dead(void)
{
BUG();
}
@@ -442,13 +409,29 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
+void __init xen_smp_count_cpus(void)
+{
+ unsigned int cpus;
+
+ for (cpus = 0; cpus < nr_cpu_ids; cpus++) {
+ if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpus, NULL) < 0)
+ break;
+ }
+
+ pr_info("Xen PV: Detected %u vCPUS\n", cpus);
+ if (cpus < nr_cpu_ids)
+ set_nr_cpu_ids(cpus);
+}
+
static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_pv_smp_prepare_cpus,
.smp_cpus_done = xen_smp_cpus_done,
- .cpu_up = xen_pv_cpu_up,
+ .kick_ap_alive = xen_pv_kick_ap,
.cpu_die = xen_pv_cpu_die,
+ .cleanup_dead_cpu = xen_pv_cleanup_dead_cpu,
+ .poll_sync_state = xen_pv_poll_sync_state,
.cpu_disable = xen_pv_cpu_disable,
.play_dead = xen_pv_play_dead,
@@ -464,6 +447,12 @@ void __init xen_smp_init(void)
smp_ops = xen_smp_ops;
/* Avoid searching for BIOS MP tables */
- x86_init.mpparse.find_smp_config = x86_init_noop;
- x86_init.mpparse.get_smp_config = _get_smp_config;
+ x86_init.mpparse.find_mptable = x86_init_noop;
+ x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
+
+ /* XEN/PV Dom0 has halfways sane topology information via CPUID/MADT */
+ if (xen_initial_domain())
+ x86_init.mpparse.parse_smp_cfg = x86_init_noop;
+ else
+ x86_init.mpparse.parse_smp_cfg = xen_pv_smp_config;
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 043c73dfd2c9..5c6fc16e4b92 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -75,6 +75,7 @@ void xen_init_lock_cpu(int cpu)
cpu, per_cpu(lock_kicker_irq, cpu));
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+ per_cpu(irq_name, cpu) = name;
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
@@ -85,7 +86,6 @@ void xen_init_lock_cpu(int cpu)
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
- per_cpu(irq_name, cpu) = name;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
@@ -98,6 +98,8 @@ void xen_uninit_lock_cpu(int cpu)
if (!xen_pvspin)
return;
+ kfree(per_cpu(irq_name, cpu));
+ per_cpu(irq_name, cpu) = NULL;
/*
* When booting the kernel with 'mitigations=auto,nosmt', the secondary
* CPUs are not activated, and lock_kicker_irq is not initialized.
@@ -108,8 +110,6 @@ void xen_uninit_lock_cpu(int cpu)
unbind_from_irqhandler(irq, NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
- kfree(per_cpu(irq_name, cpu));
- per_cpu(irq_name, cpu) = NULL;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c
index 9d548b0c772f..0c4f7554b7cc 100644
--- a/arch/x86/xen/suspend_hvm.c
+++ b/arch/x86/xen/suspend_hvm.c
@@ -5,6 +5,7 @@
#include <xen/hvm.h>
#include <xen/features.h>
#include <xen/interface/features.h>
+#include <xen/events.h>
#include "xen-ops.h"
@@ -14,6 +15,13 @@ void xen_hvm_post_suspend(int suspend_cancelled)
xen_hvm_init_shared_info();
xen_vcpu_restore();
}
- xen_setup_callback_vector();
+ if (xen_percpu_upcall) {
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu)
+ BUG_ON(xen_set_upcall_vector(cpu));
+ } else {
+ xen_setup_callback_vector();
+ }
xen_unplug_emulated_devices();
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 9ef0a5cca96e..52fa5609b7f6 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <asm/xen/cpuid.h>
#include <xen/events.h>
#include <xen/features.h>
@@ -60,9 +61,16 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs)
return xen_clocksource_read();
}
-static u64 xen_sched_clock(void)
+static noinstr u64 xen_sched_clock(void)
{
- return xen_clocksource_read() - xen_sched_clock_offset;
+ struct pvclock_vcpu_time_info *src;
+ u64 ret;
+
+ src = &__this_cpu_read(xen_vcpu)->time;
+ ret = pvclock_clocksource_read_nowd(src);
+ ret -= xen_sched_clock_offset;
+
+ return ret;
}
static void xen_read_wallclock(struct timespec64 *ts)
@@ -474,15 +482,47 @@ static void xen_setup_vsyscall_time_info(void)
xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}
+/*
+ * Check if it is possible to safely use the tsc as a clocksource. This is
+ * only true if the hypervisor notifies the guest that its tsc is invariant,
+ * the tsc is stable, and the tsc instruction will never be emulated.
+ */
+static int __init xen_tsc_safe_clocksource(void)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)))
+ return 0;
+
+ if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC)))
+ return 0;
+
+ if (check_tsc_unstable())
+ return 0;
+
+ /* Leaf 4, sub-leaf 0 (0x40000x03) */
+ cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);
+
+ return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE;
+}
+
static void __init xen_time_init(void)
{
struct pvclock_vcpu_time_info *pvti;
int cpu = smp_processor_id();
struct timespec64 tp;
- /* As Dom0 is never moved, no penalty on using TSC there */
+ /*
+ * As Dom0 is never moved, no penalty on using TSC there.
+ *
+ * If it is possible for the guest to determine that the tsc is a safe
+ * clocksource, then set xen_clocksource rating below that of the tsc
+ * so that the system prefers tsc instead.
+ */
if (xen_initial_domain())
xen_clocksource.rating = 275;
+ else if (xen_tsc_safe_clocksource())
+ xen_clocksource.rating = 299;
clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index 14ea32e734d5..f7547807b0bd 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -2,17 +2,15 @@
#include <linux/screen_info.h>
#include <linux/init.h>
-#include <asm/bootparam.h>
#include <asm/setup.h>
#include <xen/interface/xen.h>
#include "xen-ops.h"
-void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size,
+ struct screen_info *screen_info)
{
- struct screen_info *screen_info = &boot_params.screen_info;
-
/* This is drawn from a dump from vgacon:startup in
* standard Linux. */
screen_info->orig_video_mode = 3;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 6b4fdf6b9542..83189cf5cdce 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -28,7 +28,7 @@
* non-zero.
*/
SYM_FUNC_START(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
RET
SYM_FUNC_END(xen_irq_disable_direct)
@@ -69,7 +69,7 @@ SYM_FUNC_END(check_events)
SYM_FUNC_START(xen_irq_enable_direct)
FRAME_BEGIN
/* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
/*
* Preempt here doesn't matter because that will deal with any
@@ -78,7 +78,7 @@ SYM_FUNC_START(xen_irq_enable_direct)
*/
/* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_pending)
jz 1f
call check_events
@@ -97,7 +97,7 @@ SYM_FUNC_END(xen_irq_enable_direct)
* x86 use opposite senses (mask vs enable).
*/
SYM_FUNC_START(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
setz %ah
addb %ah, %ah
RET
@@ -113,7 +113,7 @@ SYM_FUNC_END(xen_read_cr2);
SYM_FUNC_START(xen_read_cr2_direct)
FRAME_BEGIN
- _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
+ _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX
FRAME_END
RET
SYM_FUNC_END(xen_read_cr2_direct);
@@ -148,7 +148,7 @@ xen_pv_trap asm_exc_page_fault
xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
xen_pv_trap asm_exc_control_protection
#endif
#ifdef CONFIG_X86_MCE
@@ -156,7 +156,7 @@ xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
-xen_pv_trap entry_INT80_compat
+xen_pv_trap asm_int80_emulation
#endif
xen_pv_trap asm_exc_xen_unknown_trap
xen_pv_trap asm_exc_xen_hypervisor_callback
@@ -165,7 +165,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback
SYM_CODE_START(xen_early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ENDBR
pop %rcx
pop %r11
@@ -193,7 +193,7 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
* rsp->rax }
*/
SYM_CODE_START(xen_iret)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
pushq $0
jmp hypercall_iret
@@ -262,10 +262,10 @@ SYM_CODE_START(xen_entry_SYSCALL_compat)
/*
* Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * CS were. The kernel expects __USER_DS and __USER32_CS, so
* report those values even though Xen will guess its own values.
*/
- movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER_DS, 4*8(%rsp)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSCALL_compat_after_hwframe
@@ -284,10 +284,10 @@ SYM_CODE_START(xen_entry_SYSENTER_compat)
/*
* Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * CS were. The kernel expects __USER_DS and __USER32_CS, so
* report those values even though Xen will guess its own values.
*/
- movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER_DS, 4*8(%rsp)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSENTER_compat_after_hwframe
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ffaa62167f6e..04101b984f24 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -45,11 +45,11 @@ SYM_CODE_END(hypercall_page)
#ifdef CONFIG_XEN_PV
__INIT
SYM_CODE_START(startup_xen)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
cld
- mov initial_stack(%rip), %rsp
+ leaq (__end_init_task - TOP_OF_KERNEL_STACK_PADDING - PTREGS_SIZE)(%rip), %rsp
/* Set up %gs.
*
@@ -71,11 +71,18 @@ SYM_CODE_END(startup_xen)
#ifdef CONFIG_XEN_PV_SMP
.pushsection .text
SYM_CODE_START(asm_cpu_bringup_and_idle)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ENDBR
call cpu_bringup_and_idle
SYM_CODE_END(asm_cpu_bringup_and_idle)
+
+SYM_CODE_START(xen_cpu_bringup_again)
+ UNWIND_HINT_FUNC
+ mov %rdi, %rsp
+ UNWIND_HINT_REGS
+ call cpu_bringup_and_idle
+SYM_CODE_END(xen_cpu_bringup_again)
.popsection
#endif
#endif
@@ -83,30 +90,35 @@ SYM_CODE_END(asm_cpu_bringup_and_idle)
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
-#ifdef CONFIG_X86_32
- ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
-#else
+#ifdef CONFIG_XEN_PV
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
/* Map the p2m table to a 512GB-aligned user address. */
ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD))
-#endif
-#ifdef CONFIG_XEN_PV
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
-#endif
- ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
- ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,
- .ascii "!writable_page_tables|pae_pgdir_above_4gb")
- ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
- .long (1 << XENFEAT_writable_page_tables) | \
- (1 << XENFEAT_dom0) | \
- (1 << XENFEAT_linux_rsdp_unrestricted))
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
- ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
- ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1)
- ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
+# define FEATURES_PV (1 << XENFEAT_writable_page_tables)
+#else
+# define FEATURES_PV 0
+#endif
+#ifdef CONFIG_XEN_PVH
+# define FEATURES_PVH (1 << XENFEAT_linux_rsdp_unrestricted)
+#else
+# define FEATURES_PVH 0
+#endif
+#ifdef CONFIG_XEN_DOM0
+# define FEATURES_DOM0 (1 << XENFEAT_dom0)
+#else
+# define FEATURES_DOM0 0
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
+ .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0)
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
#endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9a8bb972193d..79cf93f2c92f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -21,7 +21,7 @@ extern void *xen_initial_gdt;
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
-DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
@@ -72,8 +72,6 @@ void xen_restore_time_memory_area(void);
void xen_init_time_ops(void);
void xen_hvm_init_time_ops(void);
-irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
-
bool xen_vcpu_stolen(int vcpu);
void xen_vcpu_setup(int cpu);
@@ -108,11 +106,12 @@ static inline void xen_uninit_lock_cpu(int cpu)
struct dom0_vga_console_info;
-#ifdef CONFIG_XEN_PV_DOM0
-void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size,
+ struct screen_info *);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
- size_t size)
+ size_t size, struct screen_info *si)
{
}
#endif
@@ -147,9 +146,12 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
void xen_pin_vcpu(int cpu);
void xen_emergency_restart(void);
+void xen_force_evtchn_callback(void);
+
#ifdef CONFIG_XEN_PV
void xen_pv_pre_suspend(void);
void xen_pv_post_suspend(int suspend_cancelled);
+void xen_start_kernel(struct start_info *si);
#else
static inline void xen_pv_pre_suspend(void) {}
static inline void xen_pv_post_suspend(int suspend_cancelled) {}
@@ -161,4 +163,18 @@ void xen_hvm_post_suspend(int suspend_cancelled);
static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
#endif
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns);
+
#endif /* XEN_OPS_H */