Diffstat (limited to 'arch/x86/xen')
-rw-r--r--  arch/x86/xen/Kconfig          |  11
-rw-r--r--  arch/x86/xen/Makefile         |   2
-rw-r--r--  arch/x86/xen/apic.c           | 118
-rw-r--r--  arch/x86/xen/debugfs.c        |   2
-rw-r--r--  arch/x86/xen/debugfs.h        |   7
-rw-r--r--  arch/x86/xen/efi.c            |   4
-rw-r--r--  arch/x86/xen/enlighten.c      | 150
-rw-r--r--  arch/x86/xen/enlighten_hvm.c  |  62
-rw-r--r--  arch/x86/xen/enlighten_pv.c   | 336
-rw-r--r--  arch/x86/xen/enlighten_pvh.c  | 142
-rw-r--r--  arch/x86/xen/irq.c            |   4
-rw-r--r--  arch/x86/xen/mmu.c            |   3
-rw-r--r--  arch/x86/xen/mmu.h            |  28
-rw-r--r--  arch/x86/xen/mmu_hvm.c        |   2
-rw-r--r--  arch/x86/xen/mmu_pv.c         | 226
-rw-r--r--  arch/x86/xen/multicalls.c     | 135
-rw-r--r--  arch/x86/xen/multicalls.h     |  69
-rw-r--r--  arch/x86/xen/p2m.c            | 128
-rw-r--r--  arch/x86/xen/pmu.c            |  83
-rw-r--r--  arch/x86/xen/pmu.h            |  22
-rw-r--r--  arch/x86/xen/setup.c          | 317
-rw-r--r--  arch/x86/xen/smp.c            |  39
-rw-r--r--  arch/x86/xen/smp.h            |  43
-rw-r--r--  arch/x86/xen/smp_hvm.c        |  18
-rw-r--r--  arch/x86/xen/smp_pv.c         | 164
-rw-r--r--  arch/x86/xen/spinlock.c       |  26
-rw-r--r--  arch/x86/xen/suspend.c        |   9
-rw-r--r--  arch/x86/xen/suspend_hvm.c    |  10
-rw-r--r--  arch/x86/xen/time.c           |  48
-rw-r--r--  arch/x86/xen/vga.c            |   6
-rw-r--r--  arch/x86/xen/xen-asm.S        |  79
-rw-r--r--  arch/x86/xen/xen-head.S       | 164
-rw-r--r--  arch/x86/xen/xen-ops.h        | 190
33 files changed, 1741 insertions(+), 906 deletions(-)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 85246dd9faa1..98d8a50d2aed 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -9,6 +9,7 @@ config XEN
select PARAVIRT_CLOCK
select X86_HV_CALLBACK_VECTOR
depends on X86_64 || (X86_32 && X86_PAE)
+ depends on X86_64 || (X86_GENERIC || MPENTIUM4 || MATOM)
depends on X86_LOCAL_APIC && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
@@ -80,7 +81,6 @@ config XEN_PVH
bool "Xen PVH guest support"
depends on XEN && XEN_PVHVM && ACPI
select PVH
- def_bool n
help
Support for running as a Xen PVH guest.
@@ -92,3 +92,12 @@ config XEN_DOM0
select X86_X2APIC if XEN_PVH && X86_64
help
Support running as a Xen Dom0 guest.
+
+config XEN_PV_MSR_SAFE
+ bool "Always use safe MSR accesses in PV guests"
+ default y
+ depends on XEN_PV
+ help
+ Use safe (not faulting) MSR access functions even if the MSR access
+ should not fault anyway.
+ The default can be changed by using the "xen_msr_safe" boot parameter.
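(Usage note, not part of the patch itself: the new default can be overridden per boot by appending xen_msr_safe=0 or xen_msr_safe=1 to the PV guest's kernel command line; parse_xen_msr_safe() in the enlighten_pv.c hunk below feeds the value through kstrtobool() into the xen_msr_safe flag consulted by xen_read_msr()/xen_write_msr().)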
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 3c5b52fbe4a7..a9ec8c9f5c5d 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -45,6 +45,6 @@ obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
-obj-$(CONFIG_XEN_PV_DOM0) += vga.o
+obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_XEN_EFI) += efi.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 62d34b6611c5..bb0f3f368446 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -10,8 +10,6 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
-#include "pmu.h"
-#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
{
@@ -33,13 +31,7 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
return 0xfd;
}
-static u32 xen_set_apic_id(unsigned int x)
-{
- WARN_ON(1);
- return x;
-}
-
-static unsigned int xen_get_apic_id(unsigned long x)
+static u32 xen_get_apic_id(u32 x)
{
return ((x)>>24) & 0xFFu;
}
@@ -49,20 +41,20 @@ static u32 xen_apic_read(u32 reg)
struct xen_platform_op op = {
.cmd = XENPF_get_cpuinfo,
.interface_version = XENPF_INTERFACE_VERSION,
- .u.pcpu_info.xen_cpuid = 0,
};
- int ret;
-
- /* Shouldn't need this as APIC is turned off for PV, and we only
- * get called on the bootup processor. But just in case. */
- if (!xen_initial_domain() || smp_processor_id())
- return 0;
+ int ret, cpu;
if (reg == APIC_LVR)
return 0x14;
if (reg != APIC_ID)
return 0;
+ cpu = smp_processor_id();
+ if (!xen_initial_domain())
+ return cpu ? cpuid_to_apicid[cpu] << 24 : 0;
+
+ op.u.pcpu_info.xen_cpuid = cpu;
+
ret = HYPERVISOR_platform_op(&op);
if (ret)
op.u.pcpu_info.apic_id = BAD_APICID;
@@ -81,6 +73,11 @@ static void xen_apic_write(u32 reg, u32 val)
WARN(1,"register: %x, value: %x\n", reg, val);
}
+static void xen_apic_eoi(void)
+{
+ WARN_ON_ONCE(1);
+}
+
static u64 xen_apic_icr_read(void)
{
return 0;
@@ -92,11 +89,6 @@ static void xen_apic_icr_write(u32 low, u32 id)
WARN_ON(1);
}
-static u32 xen_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
static int xen_apic_probe_pv(void)
{
if (xen_pv_domain())
@@ -110,99 +102,47 @@ static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
return xen_pv_domain();
}
-static int xen_id_always_valid(u32 apicid)
-{
- return 1;
-}
-
-static int xen_id_always_registered(void)
-{
- return 1;
-}
-
-static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
-{
- return initial_apic_id >> index_msb;
-}
-
-static void xen_noop(void)
-{
-}
-
-static void xen_silent_inquire(int apicid)
-{
-}
-
-static int xen_cpu_present_to_apicid(int cpu)
+static u32 xen_cpu_present_to_apicid(int cpu)
{
if (cpu_present(cpu))
- return cpu_data(cpu).apicid;
+ return cpu_data(cpu).topo.apicid;
else
return BAD_APICID;
}
-static struct apic xen_pv_apic = {
- .name = "Xen PV",
- .probe = xen_apic_probe_pv,
+static struct apic xen_pv_apic __ro_after_init = {
+ .name = "Xen PV",
+ .probe = xen_apic_probe_pv,
.acpi_madt_oem_check = xen_madt_oem_check,
- .apic_id_valid = xen_id_always_valid,
- .apic_id_registered = xen_id_always_registered,
/* .delivery_mode and .dest_mode_logical not used by XENPV */
.disable_esr = 0,
- .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
- .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */
- .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */
- .setup_apic_routing = NULL,
.cpu_present_to_apicid = xen_cpu_present_to_apicid,
- .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */
- .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */
- .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */
- .get_apic_id = xen_get_apic_id,
- .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
+ .max_apic_id = UINT_MAX,
+ .get_apic_id = xen_get_apic_id,
.calc_dest_apicid = apic_flat_calc_apicid,
#ifdef CONFIG_SMP
- .send_IPI_mask = xen_send_IPI_mask,
- .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
- .send_IPI_allbutself = xen_send_IPI_allbutself,
- .send_IPI_all = xen_send_IPI_all,
- .send_IPI_self = xen_send_IPI_self,
+ .send_IPI_mask = xen_send_IPI_mask,
+ .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = xen_send_IPI_allbutself,
+ .send_IPI_all = xen_send_IPI_all,
+ .send_IPI_self = xen_send_IPI_self,
#endif
- /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */
- .inquire_remote_apic = xen_silent_inquire,
-
.read = xen_apic_read,
.write = xen_apic_write,
- .eoi_write = xen_apic_write,
+ .eoi = xen_apic_eoi,
- .icr_read = xen_apic_icr_read,
- .icr_write = xen_apic_icr_write,
- .wait_icr_idle = xen_noop,
- .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
+ .icr_read = xen_apic_icr_read,
+ .icr_write = xen_apic_icr_write,
};
+apic_driver(xen_pv_apic);
-static void __init xen_apic_check(void)
-{
- if (apic == &xen_pv_apic)
- return;
-
- pr_info("Switched APIC routing from %s to %s.\n", apic->name,
- xen_pv_apic.name);
- apic = &xen_pv_apic;
-}
void __init xen_init_apic(void)
{
x86_apic_ops.io_apic_read = xen_io_apic_read;
- /* On PV guests the APIC CPUID bit is disabled so none of the
- * routines end up executing. */
- if (!xen_initial_domain())
- apic = &xen_pv_apic;
-
- x86_platform.apic_post_init = xen_apic_check;
}
-apic_driver(xen_pv_apic);
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 532410998684..b8c9f2a7d9b6 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -3,7 +3,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
-#include "debugfs.h"
+#include "xen-ops.h"
static struct dentry *d_xen_debug;
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
deleted file mode 100644
index 6b813ad1091c..000000000000
--- a/arch/x86/xen/debugfs.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_DEBUGFS_H
-#define _XEN_DEBUGFS_H
-
-struct dentry * __init xen_init_debugfs(void);
-
-#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 7d7ffb9c826a..7250d0e0e1a9 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -16,6 +16,8 @@
#include <asm/setup.h>
#include <asm/xen/hypercall.h>
+#include "xen-ops.h"
+
static efi_char16_t vendor[100] __initdata;
static efi_system_table_t efi_systab_xen __initdata = {
@@ -136,7 +138,7 @@ void __init xen_efi_init(struct boot_params *boot_params)
if (efi_systab_xen == NULL)
return;
- strncpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen",
+ strscpy((char *)&boot_params->efi_info.efi_loader_signature, "Xen",
sizeof(boot_params->efi_info.efi_loader_signature));
boot_params->efi_info.efi_systab = (__u32)__pa(efi_systab_xen);
boot_params->efi_info.efi_systab_hi = (__u32)(__pa(efi_systab_xen) >> 32);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 30c6e986a6cd..53282dc7d5ac 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1,11 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-#include <linux/memblock.h>
-#endif
#include <linux/console.h>
#include <linux/cpu.h>
+#include <linux/instrumentation.h>
#include <linux/kexec.h>
+#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/panic_notifier.h>
@@ -22,20 +21,22 @@
#include <asm/setup.h>
#include "xen-ops.h"
-#include "smp.h"
-#include "pmu.h"
-EXPORT_SYMBOL_GPL(hypercall_page);
+DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm);
+EXPORT_STATIC_CALL_TRAMP(xen_hypercall);
/*
* Pointer to the xen_vcpu_info structure or
* &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
* and xen_vcpu_setup for details. By default it points to share_info->vcpu_info
* but during boot it is switched to point to xen_vcpu_info.
- * The pointer is used in __xen_evtchn_do_upcall to acknowledge pending events.
+ * The pointer is used in xen_evtchn_do_upcall to acknowledge pending events.
+ * Make sure that xen_vcpu_info doesn't cross a page boundary by making it
+ * cache-line aligned (the struct is guaranteed to have a size of 64 bytes,
+ * which matches the cache line size of 64-bit x86 processors).
*/
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
-DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DEFINE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
/* Linux <-> Xen vCPU id mapping */
DEFINE_PER_CPU(uint32_t, xen_vcpu_id);
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(xen_start_info);
struct shared_info xen_dummy_shared_info;
-__read_mostly int xen_have_vector_callback;
+__read_mostly bool xen_have_vector_callback = true;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);
/*
@@ -69,6 +70,65 @@ EXPORT_SYMBOL(xen_start_flags);
*/
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
+/* Number of pages released from the initial allocation. */
+unsigned long xen_released_pages;
+
+static __ref void xen_get_vendor(void)
+{
+ init_cpu_devs();
+ cpu_detect(&boot_cpu_data);
+ get_cpu_vendor(&boot_cpu_data);
+}
+
+void xen_hypercall_setfunc(void)
+{
+ if (static_call_query(xen_hypercall) != xen_hypercall_hvm)
+ return;
+
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON))
+ static_call_update(xen_hypercall, xen_hypercall_amd);
+ else
+ static_call_update(xen_hypercall, xen_hypercall_intel);
+}
+
+/*
+ * Evaluate processor vendor in order to select the correct hypercall
+ * function for HVM/PVH guests.
+ * Might be called very early in boot before vendor has been set by
+ * early_cpu_init().
+ */
+noinstr void *__xen_hypercall_setfunc(void)
+{
+ void (*func)(void);
+
+ /*
+ * Note that __xen_hypercall_setfunc() is noinstr only due to a nasty
+ * dependency chain: it is being called via the xen_hypercall static
+ * call when running as a PVH or HVM guest. Hypercalls need to be
+ * noinstr due to PV guests using hypercalls in noinstr code. So we
+ * can safely tag the function body as "instrumentation ok", since
+ * the PV guest requirement is not of interest here (xen_get_vendor()
+ * calls noinstr functions, and static_call_update_early() might do
+ * so, too).
+ */
+ instrumentation_begin();
+
+ xen_get_vendor();
+
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+ boot_cpu_data.x86_vendor == X86_VENDOR_HYGON))
+ func = xen_hypercall_amd;
+ else
+ func = xen_hypercall_intel;
+
+ static_call_update_early(xen_hypercall, func);
+
+ instrumentation_end();
+
+ return func;
+}
+
static int xen_cpu_up_online(unsigned int cpu)
{
xen_init_lock_cpu(cpu);
@@ -160,6 +220,7 @@ void xen_vcpu_setup(int cpu)
int err;
struct vcpu_info *vcpup;
+ BUILD_BUG_ON(sizeof(*vcpup) > SMP_CACHE_BYTES);
BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
/*
@@ -346,3 +407,74 @@ void xen_arch_unregister_cpu(int num)
}
EXPORT_SYMBOL(xen_arch_unregister_cpu);
#endif
+
+/* Amount of extra memory space we add to the e820 ranges */
+struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
+
+void __init xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns)
+{
+ unsigned int i;
+
+ /*
+ * No need to check for zero size, should happen rarely and will only
+ * write a new entry regarded to be unused due to zero size.
+ */
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ /* Add new region. */
+ if (xen_extra_mem[i].n_pfns == 0) {
+ xen_extra_mem[i].start_pfn = start_pfn;
+ xen_extra_mem[i].n_pfns = n_pfns;
+ break;
+ }
+ /* Append to existing region. */
+ if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
+ start_pfn) {
+ xen_extra_mem[i].n_pfns += n_pfns;
+ break;
+ }
+ }
+ if (i == XEN_EXTRA_MEM_MAX_REGIONS)
+ printk(KERN_WARNING "Warning: not enough extra memory regions\n");
+
+ memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
+}
+
+#ifdef CONFIG_XEN_UNPOPULATED_ALLOC
+int __init arch_xen_unpopulated_init(struct resource **res)
+{
+ unsigned int i;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ /* Must be set strictly before calling xen_free_unpopulated_pages(). */
+ *res = &iomem_resource;
+
+ /*
+ * Initialize with pages from the extra memory regions (see
+ * arch/x86/xen/setup.c).
+ */
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ unsigned int j;
+
+ for (j = 0; j < xen_extra_mem[i].n_pfns; j++) {
+ struct page *pg =
+ pfn_to_page(xen_extra_mem[i].start_pfn + j);
+
+ xen_free_unpopulated_pages(1, &pg);
+ }
+
+ /*
+ * Account for the region being in the physmap but unpopulated.
+ * The value in xen_released_pages is used by the balloon
+ * driver to know how much of the physmap is unpopulated and
+ * set an accurate initial memory target.
+ */
+ xen_released_pages += xen_extra_mem[i].n_pfns;
+ /* Zero so region is not also added to the balloon driver. */
+ xen_extra_mem[i].n_pfns = 0;
+ }
+
+ return 0;
+}
+#endif
diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c
index 8b71b1dd7639..fe57ff85d004 100644
--- a/arch/x86/xen/enlighten_hvm.c
+++ b/arch/x86/xen/enlighten_hvm.c
@@ -4,9 +4,12 @@
#include <linux/cpu.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
+#include <linux/virtio_anchor.h>
#include <xen/features.h>
#include <xen/events.h>
+#include <xen/hvm.h>
+#include <xen/interface/hvm/hvm_op.h>
#include <xen/interface/memory.h>
#include <asm/apic.h>
@@ -25,11 +28,12 @@
#include <asm/xen/page.h>
#include "xen-ops.h"
-#include "mmu.h"
-#include "smp.h"
static unsigned long shared_info_pfn;
+__ro_after_init bool xen_percpu_upcall;
+EXPORT_SYMBOL_GPL(xen_percpu_upcall);
+
void xen_hvm_init_shared_info(void)
{
struct xen_add_to_physmap xatp;
@@ -102,15 +106,8 @@ static void __init init_hvm_pv_info(void)
/* PVH set up hypercall page in xen_prepare_pvh(). */
if (xen_pvh_domain())
pv_info.name = "Xen PVH";
- else {
- u64 pfn;
- uint32_t msr;
-
+ else
pv_info.name = "Xen HVM";
- msr = cpuid_ebx(base + 2);
- pfn = __pa(hypercall_page);
- wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
- }
xen_setup_features();
@@ -125,9 +122,12 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_xen_hvm_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
+ if (xen_percpu_upcall)
+ apic_eoi();
+
inc_irq_stat(irq_hv_callback_count);
- xen_hvm_evtchn_do_upcall();
+ xen_evtchn_do_upcall();
set_irq_regs(old_regs);
}
@@ -139,7 +139,9 @@ static void xen_hvm_shutdown(void)
if (kexec_in_progress)
xen_reboot(SHUTDOWN_soft_reset);
}
+#endif
+#ifdef CONFIG_CRASH_DUMP
static void xen_hvm_crash_shutdown(struct pt_regs *regs)
{
native_machine_crash_shutdown(regs);
@@ -152,15 +154,14 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
int rc = 0;
/*
- * This can happen if CPU was offlined earlier and
- * offlining timed out in common_cpu_die().
+ * If a CPU was offlined earlier and offlining timed out then the
+ * lock mechanism is still initialized. Uninit it unconditionally
+ * as it's safe to call even if already uninited. Interrupts and
+ * timer have already been handled in xen_cpu_dead_hvm().
*/
- if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- }
+ xen_uninit_lock_cpu(cpu);
- if (cpu_acpi_id(cpu) != U32_MAX)
+ if (cpu_acpi_id(cpu) != CPU_ACPIID_INVALID)
per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu);
else
per_cpu(xen_vcpu_id, cpu) = cpu;
@@ -168,6 +169,15 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu)
if (!xen_have_vector_callback)
return 0;
+ if (xen_percpu_upcall) {
+ rc = xen_set_upcall_vector(cpu);
+ if (rc) {
+ WARN(1, "HVMOP_set_evtchn_upcall_vector"
+ " for CPU %d failed: %d\n", cpu, rc);
+ return rc;
+ }
+ }
+
if (xen_feature(XENFEAT_hvm_safe_pvclock))
xen_setup_timer(cpu);
@@ -188,14 +198,13 @@ static int xen_cpu_dead_hvm(unsigned int cpu)
return 0;
}
-static bool no_vector_callback __initdata;
-
static void __init xen_hvm_guest_init(void)
{
if (xen_pv_domain())
return;
- xen_set_restricted_virtio_memory_access();
+ if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT))
+ virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
init_hvm_pv_info();
@@ -211,9 +220,6 @@ static void __init xen_hvm_guest_init(void)
xen_panic_handler_init();
- if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector))
- xen_have_vector_callback = 1;
-
xen_hvm_smp_init();
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm));
xen_unplug_emulated_devices();
@@ -223,6 +229,8 @@ static void __init xen_hvm_guest_init(void)
#ifdef CONFIG_KEXEC_CORE
machine_ops.shutdown = xen_hvm_shutdown;
+#endif
+#ifdef CONFIG_CRASH_DUMP
machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
#endif
}
@@ -239,7 +247,7 @@ early_param("xen_nopv", xen_parse_nopv);
static __init int xen_parse_no_vector_callback(char *arg)
{
- no_vector_callback = true;
+ xen_have_vector_callback = false;
return 0;
}
early_param("xen_no_vector_callback", xen_parse_no_vector_callback);
@@ -285,6 +293,10 @@ static uint32_t __init xen_platform_hvm(void)
if (xen_pv_domain())
return 0;
+ /* Set correct hypercall function. */
+ if (xen_domain)
+ xen_hypercall_setfunc();
+
if (xen_pvh_domain() && nopv) {
/* Guest booting via the Xen-PVH boot entry goes here */
pr_info("\"nopv\" parameter is ignored in PVH guest\n");
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 70fb2ea85e90..26bbaf4b7330 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -23,6 +23,7 @@
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
+#include <linux/kstrtox.h>
#include <linux/memblock.h>
#include <linux/export.h>
#include <linux/mm.h>
@@ -31,6 +32,8 @@
#include <linux/gfp.h>
#include <linux/edd.h>
#include <linux/reboot.h>
+#include <linux/virtio_anchor.h>
+#include <linux/stackprotector.h>
#include <xen/xen.h>
#include <xen/events.h>
@@ -46,6 +49,7 @@
#include <xen/hvc-console.h>
#include <xen/acpi.h>
+#include <asm/cpuid/api.h>
#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
@@ -57,18 +61,20 @@
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
+#include <asm/msr.h>
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
-#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mach_traps.h>
+#include <asm/mtrr.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
+#include <asm/irq_stack.h>
#ifdef CONFIG_X86_IOPL_IOPERM
#include <asm/io_bitmap.h>
#endif
@@ -76,16 +82,12 @@
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
-#include <acpi/pdc_intel.h>
+#include <acpi/proc_cap_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif
#include "xen-ops.h"
-#include "mmu.h"
-#include "smp.h"
-#include "multicalls.h"
-#include "pmu.h"
#include "../kernel/cpu/cpu.h" /* get_cpu_cap() */
@@ -94,10 +96,58 @@ void *xen_initial_gdt;
static int xen_cpu_up_prepare_pv(unsigned int cpu);
static int xen_cpu_dead_pv(unsigned int cpu);
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+
+#else
+
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+
+#endif
+
struct tls_descs {
struct desc_struct desc[3];
};
+DEFINE_PER_CPU(enum xen_lazy_mode, xen_lazy_mode) = XEN_LAZY_NONE;
+
+enum xen_lazy_mode xen_get_lazy_mode(void)
+{
+ if (in_interrupt())
+ return XEN_LAZY_NONE;
+
+ return this_cpu_read(xen_lazy_mode);
+}
+
/*
* Updating the 3 TLS descriptors in the GDT on every task switch is
* surprisingly expensive so we avoid updating them if they haven't
@@ -107,9 +157,69 @@ struct tls_descs {
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+static __read_mostly bool xen_msr_safe = IS_ENABLED(CONFIG_XEN_PV_MSR_SAFE);
+
+static int __init parse_xen_msr_safe(char *str)
+{
+ if (str)
+ return kstrtobool(str, &xen_msr_safe);
+ return -EINVAL;
+}
+early_param("xen_msr_safe", parse_xen_msr_safe);
+
+/* Get MTRR settings from Xen and put them into mtrr_state. */
+static void __init xen_set_mtrr_data(void)
+{
+#ifdef CONFIG_MTRR
+ struct xen_platform_op op = {
+ .cmd = XENPF_read_memtype,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ };
+ unsigned int reg;
+ unsigned long mask;
+ uint32_t eax, width;
+ static struct mtrr_var_range var[MTRR_MAX_VAR_RANGES] __initdata;
+
+ /* Get physical address width (only 64-bit cpus supported). */
+ width = 36;
+ eax = cpuid_eax(0x80000000);
+ if ((eax >> 16) == 0x8000 && eax >= 0x80000008) {
+ eax = cpuid_eax(0x80000008);
+ width = eax & 0xff;
+ }
+
+ for (reg = 0; reg < MTRR_MAX_VAR_RANGES; reg++) {
+ op.u.read_memtype.reg = reg;
+ if (HYPERVISOR_platform_op(&op))
+ break;
+
+ /*
+ * Only called in dom0, which has all RAM PFNs mapped at
+ * RAM MFNs, and all PCI space etc. is identity mapped.
+ * This means we can treat MFN == PFN regarding MTRR settings.
+ */
+ var[reg].base_lo = op.u.read_memtype.type;
+ var[reg].base_lo |= op.u.read_memtype.mfn << PAGE_SHIFT;
+ var[reg].base_hi = op.u.read_memtype.mfn >> (32 - PAGE_SHIFT);
+ mask = ~((op.u.read_memtype.nr_mfns << PAGE_SHIFT) - 1);
+ mask &= (1UL << width) - 1;
+ if (mask)
+ mask |= MTRR_PHYSMASK_V;
+ var[reg].mask_lo = mask;
+ var[reg].mask_hi = mask >> 32;
+ }
+
+ /* Only overwrite MTRR state if any MTRR could be got from Xen. */
+ if (reg)
+ guest_force_mtrr_state(var, reg, MTRR_TYPE_UNCACHABLE);
+#endif
+}
+
static void __init xen_pv_init_platform(void)
{
- xen_set_restricted_virtio_memory_access();
+ /* PV guests can't operate virtio devices without grants. */
+ if (IS_ENABLED(CONFIG_XEN_VIRTIO))
+ virtio_set_mem_acc_cb(xen_virtio_restricted_mem_acc);
populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
@@ -121,6 +231,14 @@ static void __init xen_pv_init_platform(void)
/* pvclock is in shared info area */
xen_init_time_ops();
+
+ if (xen_initial_domain())
+ xen_set_mtrr_data();
+ else
+ guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
+
+ /* Adjust nr_cpu_ids before "enumeration" happens */
+ xen_smp_count_cpus();
}
static void __init xen_pv_guest_late_init(void)
@@ -137,14 +255,22 @@ static __read_mostly unsigned int cpuid_leaf5_edx_val;
static void xen_cpuid(unsigned int *ax, unsigned int *bx,
unsigned int *cx, unsigned int *dx)
{
- unsigned maskebx = ~0;
+ unsigned int maskebx = ~0;
+ unsigned int or_ebx = 0;
/*
* Mask out inconvenient features, to try and disable as many
* unsupported kernel subsystems as possible.
*/
switch (*ax) {
- case CPUID_MWAIT_LEAF:
+ case 0x1:
+ /* Replace initial APIC ID in bits 24-31 of EBX. */
+ /* See xen_pv_smp_config() for related topology preparations. */
+ maskebx = 0x00ffffff;
+ or_ebx = smp_processor_id() << 24;
+ break;
+
+ case CPUID_LEAF_MWAIT:
/* Synthesize the values.. */
*ax = 0;
*bx = 0;
@@ -166,6 +292,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
: "0" (*ax), "2" (*cx));
*bx &= maskebx;
+ *bx |= or_ebx;
}
static bool __init xen_check_mwait(void)
@@ -213,24 +340,24 @@ static bool __init xen_check_mwait(void)
* ecx and edx. The hypercall provides only partial information.
*/
- ax = CPUID_MWAIT_LEAF;
+ ax = CPUID_LEAF_MWAIT;
bx = 0;
cx = 0;
dx = 0;
native_cpuid(&ax, &bx, &cx, &dx);
- /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
+ /* Ask the Hypervisor whether to clear ACPI_PROC_CAP_C_C2C3_FFH. If so,
* don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
*/
buf[0] = ACPI_PDC_REVISION_ID;
buf[1] = 1;
- buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);
+ buf[2] = (ACPI_PROC_CAP_C_CAPABILITY_SMP | ACPI_PROC_CAP_EST_CAPABILITY_SWSMP);
set_xen_guest_handle(op.u.set_pminfo.pdc, buf);
if ((HYPERVISOR_platform_op(&op) == 0) &&
- (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
+ (buf[2] & (ACPI_PROC_CAP_C_C1_FFH | ACPI_PROC_CAP_C_C2C3_FFH))) {
cpuid_leaf5_ecx_val = cx;
cpuid_leaf5_edx_val = dx;
}
@@ -262,6 +389,7 @@ static void __init xen_init_capabilities(void)
setup_clear_cpu_cap(X86_FEATURE_ACC);
setup_clear_cpu_cap(X86_FEATURE_X2APIC);
setup_clear_cpu_cap(X86_FEATURE_SME);
+ setup_clear_cpu_cap(X86_FEATURE_LKGS);
/*
* Xen PV would need some work to support PCID: CR3 handling as well
@@ -293,10 +421,25 @@ static noinstr unsigned long xen_get_debugreg(int reg)
return HYPERVISOR_get_debugreg(reg);
}
+static void xen_start_context_switch(struct task_struct *prev)
+{
+ BUG_ON(preemptible());
+
+ if (this_cpu_read(xen_lazy_mode) == XEN_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+ }
+ enter_lazy(XEN_LAZY_CPU);
+}
+
static void xen_end_context_switch(struct task_struct *next)
{
+ BUG_ON(preemptible());
+
xen_mc_flush();
- paravirt_end_context_switch(next);
+ leave_lazy(XEN_LAZY_CPU);
+ if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+ arch_enter_lazy_mmu_mode();
}
static unsigned long xen_store_tr(void)
@@ -403,7 +546,7 @@ static void xen_set_ldt(const void *addr, unsigned entries)
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_load_gdt(const struct desc_ptr *dtr)
@@ -454,7 +597,7 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
BUG_ON(size > PAGE_SIZE);
BUG_ON(va & ~PAGE_MASK);
- pfn = virt_to_pfn(va);
+ pfn = virt_to_pfn((void *)va);
mfn = pfn_to_mfn(pfn);
pte = pfn_pte(pfn, PAGE_KERNEL_RO);
@@ -499,7 +642,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
* exception between the new %fs descriptor being loaded and
* %fs being effectively cleared at __switch_to().
*/
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ if (xen_get_lazy_mode() == XEN_LAZY_CPU)
loadsegment(fs, 0);
xen_mc_batch();
@@ -508,7 +651,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
load_TLS_descriptor(t, cpu, 1);
load_TLS_descriptor(t, cpu, 2);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_load_gs_index(unsigned int idx)
@@ -583,6 +726,36 @@ DEFINE_IDTENTRY_RAW(xenpv_exc_machine_check)
}
#endif
+static void __xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_evtchn_do_upcall();
+
+ set_irq_regs(old_regs);
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ irqentry_state_t state = irqentry_enter(regs);
+ bool inhcall;
+
+ instrumentation_begin();
+ run_sysvec_on_irqstack_cond(__xen_pv_evtchn_do_upcall, regs);
+
+ inhcall = get_and_clear_inhcall();
+ if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
+ irqentry_exit_cond_resched();
+ instrumentation_end();
+ restore_inhcall(inhcall);
+ } else {
+ instrumentation_end();
+ irqentry_exit(regs, state);
+ }
+}
+
struct trap_array_entry {
void (*orig)(void);
void (*xen)(void);
@@ -609,7 +782,7 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_int3, false ),
TRAP_ENTRY(exc_overflow, false ),
#ifdef CONFIG_IA32_EMULATION
- { entry_INT80_compat, xen_entry_INT80_compat, false },
+ TRAP_ENTRY(int80_emulation, false ),
#endif
TRAP_ENTRY(exc_page_fault, false ),
TRAP_ENTRY(exc_divide_error, false ),
@@ -625,7 +798,7 @@ static struct trap_array_entry trap_array[] = {
TRAP_ENTRY(exc_coprocessor_error, false ),
TRAP_ENTRY(exc_alignment_check, false ),
TRAP_ENTRY(exc_simd_coprocessor_error, false ),
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
TRAP_ENTRY(exc_control_protection, false ),
#endif
};
@@ -762,6 +935,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
{
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
+ static const struct trap_info zero = { };
unsigned out;
trace_xen_cpu_load_idt(desc);
@@ -771,7 +945,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
out = xen_convert_trap_info(desc, traps, false);
- memset(&traps[out], 0, sizeof(traps[0]));
+ traps[out] = zero;
xen_mc_flush();
if (HYPERVISOR_set_trap_table(traps))
@@ -839,7 +1013,7 @@ static void xen_load_sp0(unsigned long sp0)
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0);
}
@@ -903,7 +1077,7 @@ static void xen_write_cr0(unsigned long cr0)
MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
}
static void xen_write_cr4(unsigned long cr4)
@@ -913,39 +1087,54 @@ static void xen_write_cr4(unsigned long cr4)
native_write_cr4(cr4);
}
-static u64 xen_read_msr_safe(unsigned int msr, int *err)
+static u64 xen_do_read_msr(u32 msr, int *err)
{
- u64 val;
+ u64 val = 0; /* Avoid uninitialized value for safe variant. */
- if (pmu_msr_read(msr, &val, err))
+ if (pmu_msr_chk_emulated(msr, &val, true))
return val;
- val = native_read_msr_safe(msr, err);
+ if (err)
+ *err = native_read_msr_safe(msr, &val);
+ else
+ val = native_read_msr(msr);
+
switch (msr) {
case MSR_IA32_APICBASE:
val &= ~X2APIC_ENABLE;
+ if (smp_processor_id() == 0)
+ val |= MSR_IA32_APICBASE_BSP;
+ else
+ val &= ~MSR_IA32_APICBASE_BSP;
break;
}
return val;
}
-static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+static void set_seg(u32 which, u64 base)
{
- int ret;
- unsigned int which;
- u64 base;
-
- ret = 0;
+ if (HYPERVISOR_set_segment_base(which, base))
+ WARN(1, "Xen set_segment_base(%u, %llx) failed\n", which, base);
+}
+/*
+ * Support write_msr_safe() and write_msr() semantics.
+ * With err == NULL write_msr() semantics are selected.
+ * Supplying an err pointer requires err to be pre-initialized with 0.
+ */
+static void xen_do_write_msr(u32 msr, u64 val, int *err)
+{
switch (msr) {
- case MSR_FS_BASE: which = SEGBASE_FS; goto set;
- case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
- case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
-
- set:
- base = ((u64)high << 32) | low;
- if (HYPERVISOR_set_segment_base(which, base) != 0)
- ret = -EIO;
+ case MSR_FS_BASE:
+ set_seg(SEGBASE_FS, val);
+ break;
+
+ case MSR_KERNEL_GS_BASE:
+ set_seg(SEGBASE_GS_USER, val);
+ break;
+
+ case MSR_GS_BASE:
+ set_seg(SEGBASE_GS_KERNEL, val);
break;
case MSR_STAR:
@@ -961,31 +1150,45 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
break;
default:
- if (!pmu_msr_write(msr, low, high, &ret))
- ret = native_write_msr_safe(msr, low, high);
+ if (pmu_msr_chk_emulated(msr, &val, false))
+ return;
+
+ if (err)
+ *err = native_write_msr_safe(msr, val);
+ else
+ native_write_msr(msr, val);
}
+}
- return ret;
+static int xen_read_msr_safe(u32 msr, u64 *val)
+{
+ int err = 0;
+
+ *val = xen_do_read_msr(msr, &err);
+ return err;
}
-static u64 xen_read_msr(unsigned int msr)
+static int xen_write_msr_safe(u32 msr, u64 val)
{
- /*
- * This will silently swallow a #GP from RDMSR. It may be worth
- * changing that.
- */
- int err;
+ int err = 0;
+
+ xen_do_write_msr(msr, val, &err);
- return xen_read_msr_safe(msr, &err);
+ return err;
}
-static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
+static u64 xen_read_msr(u32 msr)
{
- /*
- * This will silently swallow a #GP from WRMSR. It may be worth
- * changing that.
- */
- xen_write_msr_safe(msr, low, high);
+ int err = 0;
+
+ return xen_do_read_msr(msr, xen_msr_safe ? &err : NULL);
+}
+
+static void xen_write_msr(u32 msr, u64 val)
+{
+ int err;
+
+ xen_do_write_msr(msr, val, xen_msr_safe ? &err : NULL);
}
/* This is called once we have the cpu_possible_mask */
@@ -1022,8 +1225,6 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
.write_cr4 = xen_write_cr4,
- .wbinvd = native_wbinvd,
-
.read_msr = xen_read_msr,
.write_msr = xen_write_msr,
@@ -1055,7 +1256,7 @@ static const typeof(pv_ops) xen_cpu_ops __initconst = {
#endif
.io_delay = xen_io_delay,
- .start_context_switch = paravirt_start_context_switch,
+ .start_context_switch = xen_start_context_switch,
.end_context_switch = xen_end_context_switch,
},
};
@@ -1164,7 +1365,7 @@ static void __init xen_setup_gdt(int cpu)
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry_boot;
pv_ops.cpu.load_gdt = xen_load_gdt_boot;
- switch_to_new_gdt(cpu);
+ switch_gdt_and_percpu_base(cpu);
pv_ops.cpu.write_gdt_entry = xen_write_gdt_entry;
pv_ops.cpu.load_gdt = xen_load_gdt;
@@ -1202,6 +1403,9 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
xen_domain_type = XEN_PV_DOMAIN;
xen_start_flags = xen_start_info->flags;
+ /* Interrupts are guaranteed to be off initially. */
+ early_boot_irqs_disabled = true;
+ static_call_update_early(xen_hypercall, xen_hypercall_pv);
xen_setup_features();
@@ -1220,10 +1424,12 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
xen_vcpu_info_reset(0);
x86_platform.get_nmi_reason = xen_get_nmi_reason;
+ x86_platform.realmode_reserve = x86_init_noop;
+ x86_platform.realmode_init = x86_init_noop;
x86_init.resources.memory_setup = xen_memory_setup;
x86_init.irqs.intr_mode_select = x86_init_noop;
- x86_init.irqs.intr_mode_init = x86_init_noop;
+ x86_init.irqs.intr_mode_init = x86_64_probe_apic;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
x86_init.hyper.init_platform = xen_pv_init_platform;
@@ -1263,12 +1469,10 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
xen_init_capabilities();
-#ifdef CONFIG_X86_LOCAL_APIC
/*
* set up the basic apic ops.
*/
xen_init_apic();
-#endif
machine_ops = xen_machine_ops;
@@ -1292,7 +1496,6 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));
local_irq_disable();
- early_boot_irqs_disabled = true;
xen_raw_console_write("mapping kernel into physical memory\n");
xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
@@ -1341,7 +1544,8 @@ asmlinkage __visible void __init xen_start_kernel(struct start_info *si)
x86_platform.set_legacy_features =
xen_dom0_set_legacy_features;
- xen_init_vga(info, xen_start_info->console.dom0.info_size);
+ xen_init_vga(info, xen_start_info->console.dom0.info_size,
+ &boot_params.screen_info);
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
diff --git a/arch/x86/xen/enlighten_pvh.c b/arch/x86/xen/enlighten_pvh.c
index bcae606bbc5c..9d25d9373945 100644
--- a/arch/x86/xen/enlighten_pvh.c
+++ b/arch/x86/xen/enlighten_pvh.c
@@ -1,12 +1,18 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/acpi.h>
+#include <linux/cpufreq.h>
+#include <linux/cpuidle.h>
#include <linux/export.h>
+#include <linux/mm.h>
#include <xen/hvc-console.h>
+#include <xen/acpi.h>
+#include <asm/bootparam.h>
#include <asm/io_apic.h>
#include <asm/hypervisor.h>
#include <asm/e820/api.h>
+#include <asm/setup.h>
#include <xen/xen.h>
#include <asm/xen/interface.h>
@@ -25,24 +31,142 @@
bool __ro_after_init xen_pvh;
EXPORT_SYMBOL_GPL(xen_pvh);
-void __init xen_pvh_init(struct boot_params *boot_params)
+#ifdef CONFIG_XEN_DOM0
+int xen_pvh_setup_gsi(int gsi, int trigger, int polarity)
+{
+ int ret;
+ struct physdev_setup_gsi setup_gsi;
+
+ setup_gsi.gsi = gsi;
+ setup_gsi.triggering = (trigger == ACPI_EDGE_SENSITIVE ? 0 : 1);
+ setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+ ret = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+ if (ret == -EEXIST) {
+ xen_raw_printk("Already setup the GSI :%d\n", gsi);
+ ret = 0;
+ } else if (ret)
+ xen_raw_printk("Fail to setup GSI (%d)!\n", gsi);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(xen_pvh_setup_gsi);
+#endif
+
+/*
+ * Reserve e820 UNUSABLE regions to inflate the memory balloon.
+ *
+ * On PVH dom0 the host memory map is used, RAM regions available to dom0 are
+ * located as the same place as in the native memory map, but since dom0 gets
+ * less memory than the total amount of host RAM the ranges that can't be
+ * populated are converted from RAM -> UNUSABLE. Use such regions (up to the
+ * ratio signaled in EXTRA_MEM_RATIO) in order to inflate the balloon driver at
+ * boot. Doing so prevents the guest (even if just temporary) from using holes
+ * in the memory map in order to map grants or foreign addresses, and
+ * hopefully limits the risk of a clash with a device MMIO region. Ideally the
+ * hypervisor should notify us which memory ranges are suitable for creating
+ * foreign mappings, but that's not yet implemented.
+ */
+static void __init pvh_reserve_extra_memory(void)
{
- u32 msr;
- u64 pfn;
+ struct boot_params *bootp = &boot_params;
+ unsigned int i, ram_pages = 0, extra_pages;
+
+ for (i = 0; i < bootp->e820_entries; i++) {
+ struct boot_e820_entry *e = &bootp->e820_table[i];
+
+ if (e->type != E820_TYPE_RAM)
+ continue;
+ ram_pages += PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr);
+ }
+
+ /* Max amount of extra memory. */
+ extra_pages = EXTRA_MEM_RATIO * ram_pages;
+
+ /*
+ * Convert UNUSABLE ranges to RAM and reserve them for foreign mapping
+ * purposes.
+ */
+ for (i = 0; i < bootp->e820_entries && extra_pages; i++) {
+ struct boot_e820_entry *e = &bootp->e820_table[i];
+ unsigned long pages;
+
+ if (e->type != E820_TYPE_UNUSABLE)
+ continue;
+
+ pages = min(extra_pages,
+ PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr));
+
+ if (pages != (PFN_DOWN(e->addr + e->size) - PFN_UP(e->addr))) {
+ struct boot_e820_entry *next;
+ if (bootp->e820_entries ==
+ ARRAY_SIZE(bootp->e820_table))
+ /* No space left to split - skip region. */
+ continue;
+
+ /* Split entry. */
+ next = e + 1;
+ memmove(next, e,
+ (bootp->e820_entries - i) * sizeof(*e));
+ bootp->e820_entries++;
+ next->addr = PAGE_ALIGN(e->addr) + PFN_PHYS(pages);
+ e->size = next->addr - e->addr;
+ next->size -= e->size;
+ }
+ e->type = E820_TYPE_RAM;
+ extra_pages -= pages;
+
+ xen_add_extra_mem(PFN_UP(e->addr), pages);
+ }
+}
+
+static void __init pvh_arch_setup(void)
+{
+ pvh_reserve_extra_memory();
+
+ if (xen_initial_domain()) {
+ xen_add_preferred_consoles();
+
+ /*
+ * Disable usage of CPU idle and frequency drivers: when
+ * running as hardware domain the exposed native ACPI tables
+ * causes idle and/or frequency drivers to attach and
+ * malfunction. It's Xen the entity that controls the idle and
+ * frequency states.
+ *
+ * For unprivileged domains the exposed ACPI tables are
+ * fabricated and don't contain such data.
+ */
+ disable_cpuidle();
+ disable_cpufreq();
+ WARN_ON(xen_set_default_idle());
+ }
+}
+
+void __init xen_pvh_init(struct boot_params *boot_params)
+{
xen_pvh = 1;
xen_domain_type = XEN_HVM_DOMAIN;
xen_start_flags = pvh_start_info.flags;
- msr = cpuid_ebx(xen_cpuid_base() + 2);
- pfn = __pa(hypercall_page);
- wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
- if (xen_initial_domain())
- x86_init.oem.arch_setup = xen_add_preferred_consoles;
+ x86_init.oem.arch_setup = pvh_arch_setup;
x86_init.oem.banner = xen_banner;
xen_efi_init(boot_params);
+
+ if (xen_initial_domain()) {
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_dom0_console,
+ };
+ int ret = HYPERVISOR_platform_op(&op);
+
+ if (ret > 0)
+ xen_init_vga(&op.u.dom0_console,
+ min(ret * sizeof(char),
+ sizeof(op.u.dom0_console)),
+ &boot_params->screen_info);
+ }
}
void __init mem_map_via_hcall(struct boot_params *boot_params_p)
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 06c3c2fb4b06..39982f955cfe 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -24,7 +24,7 @@ noinstr void xen_force_evtchn_callback(void)
(void)HYPERVISOR_xen_version(0, NULL);
}
-static void xen_safe_halt(void)
+static noinstr void xen_safe_halt(void)
{
/* Blocking includes an implicit local_irq_enable(). */
if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
@@ -45,7 +45,7 @@ static const typeof(pv_ops) xen_irq_ops __initconst = {
/* Initial interrupt flag handling only called while interrupts off. */
.save_fl = __PV_IS_CALLEE_SAVE(paravirt_ret0),
.irq_disable = __PV_IS_CALLEE_SAVE(paravirt_nop),
- .irq_enable = __PV_IS_CALLEE_SAVE(paravirt_BUG),
+ .irq_enable = __PV_IS_CALLEE_SAVE(BUG_func),
.safe_halt = xen_safe_halt,
.halt = xen_halt,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 60e9c37fd79f..c4c479373249 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -5,8 +5,7 @@
#include <asm/xen/hypercall.h>
#include <xen/interface/memory.h>
-#include "multicalls.h"
-#include "mmu.h"
+#include "xen-ops.h"
unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
deleted file mode 100644
index 6e4c6bd62203..000000000000
--- a/arch/x86/xen/mmu.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_MMU_H
-
-#include <linux/linkage.h>
-#include <asm/page.h>
-
-enum pt_level {
- PT_PGD,
- PT_P4D,
- PT_PUD,
- PT_PMD,
- PT_PTE
-};
-
-
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-
-void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep);
-void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t pte);
-
-unsigned long xen_read_cr2_direct(void);
-
-extern void xen_init_mmu_ops(void);
-extern void xen_hvm_init_mmu_ops(void);
-#endif /* _XEN_MMU_H */
diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c
index 509bdee3ab90..337955652202 100644
--- a/arch/x86/xen/mmu_hvm.c
+++ b/arch/x86/xen/mmu_hvm.c
@@ -5,7 +5,7 @@
#include <xen/interface/xen.h>
#include <xen/hvm.h>
-#include "mmu.h"
+#include "xen-ops.h"
#ifdef CONFIG_PROC_VMCORE
/*
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index ee29fb558f2e..2a4a8deaf612 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -34,7 +34,7 @@
* would need to validate the whole pagetable before going on.
* Naturally, this is quite slow. The solution is to "pin" a
* pagetable, which enforces all the constraints on the pagetable even
- * when it is not actively in use. This menas that Xen can be assured
+ * when it is not actively in use. This means that Xen can be assured
* that it is still valid when you do load it into %cr3, and doesn't
* need to revalidate it.
*
@@ -82,9 +82,23 @@
#include <xen/hvc-console.h>
#include <xen/swiotlb-xen.h>
-#include "multicalls.h"
-#include "mmu.h"
-#include "debugfs.h"
+#include "xen-ops.h"
+
+/*
+ * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order
+ * to avoid warnings with "-Wmissing-prototypes".
+ */
+pteval_t xen_pte_val(pte_t pte);
+pgdval_t xen_pgd_val(pgd_t pgd);
+pmdval_t xen_pmd_val(pmd_t pmd);
+pudval_t xen_pud_val(pud_t pud);
+p4dval_t xen_p4d_val(p4d_t p4d);
+pte_t xen_make_pte(pteval_t pte);
+pgd_t xen_make_pgd(pgdval_t pgd);
+pmd_t xen_make_pmd(pmdval_t pmd);
+pud_t xen_make_pud(pudval_t pud);
+p4d_t xen_make_p4d(p4dval_t p4d);
+pte_t xen_make_pte_init(pteval_t pte);
#ifdef CONFIG_X86_VSYSCALL_EMULATION
/* l3 pud for userspace vsyscall mapping */
@@ -97,6 +111,51 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
*/
static DEFINE_SPINLOCK(xen_reservation_lock);
+/* Protected by xen_reservation_lock. */
+#define MIN_CONTIG_ORDER 9 /* 2MB */
+static unsigned int discontig_frames_order = MIN_CONTIG_ORDER;
+static unsigned long discontig_frames_early[1UL << MIN_CONTIG_ORDER] __initdata;
+static unsigned long *discontig_frames __refdata = discontig_frames_early;
+static bool discontig_frames_dyn;
+
+static int alloc_discontig_frames(unsigned int order)
+{
+ unsigned long *new_array, *old_array;
+ unsigned int old_order;
+ unsigned long flags;
+
+ BUG_ON(order < MIN_CONTIG_ORDER);
+ BUILD_BUG_ON(sizeof(discontig_frames_early) != PAGE_SIZE);
+
+ new_array = (unsigned long *)__get_free_pages(GFP_KERNEL,
+ order - MIN_CONTIG_ORDER);
+ if (!new_array)
+ return -ENOMEM;
+
+ spin_lock_irqsave(&xen_reservation_lock, flags);
+
+ old_order = discontig_frames_order;
+
+ if (order > discontig_frames_order || !discontig_frames_dyn) {
+ if (!discontig_frames_dyn)
+ old_array = NULL;
+ else
+ old_array = discontig_frames;
+
+ discontig_frames = new_array;
+ discontig_frames_order = order;
+ discontig_frames_dyn = true;
+ } else {
+ old_array = new_array;
+ }
+
+ spin_unlock_irqrestore(&xen_reservation_lock, flags);
+
+ free_pages((unsigned long)old_array, old_order - MIN_CONTIG_ORDER);
+
+ return 0;
+}
+
/*
* Note about cr3 (pagetable base) values:
*
@@ -112,7 +171,7 @@ static DEFINE_SPINLOCK(xen_reservation_lock);
* looking at another vcpu's cr3 value, it should use this variable.
*/
DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
+static DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
static phys_addr_t xen_pt_base, xen_pt_size __initdata;
@@ -150,7 +209,7 @@ void make_lowmem_page_readwrite(void *vaddr)
if (pte == NULL)
return; /* vaddr missing */
- ptev = pte_mkwrite(*pte);
+ ptev = pte_mkwrite_novma(*pte);
if (HYPERVISOR_update_va_mapping(address, ptev, 0))
BUG();
@@ -220,7 +279,7 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
u.val = pmd_val_ma(val);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -254,7 +313,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
{
struct mmu_update u;
- if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
+ if (xen_get_lazy_mode() != XEN_LAZY_MMU)
return false;
xen_mc_batch();
@@ -263,7 +322,7 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
u.val = pte_val_ma(pteval);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
return true;
}
@@ -289,16 +348,17 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval)
__xen_set_pte(ptep, pteval);
}
-pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
+static pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
{
/* Just return the pte as-is. We preserve the bits on commit */
trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep);
return *ptep;
}
-void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
- pte_t *ptep, pte_t pte)
+static void xen_ptep_modify_prot_commit(struct vm_area_struct *vma,
+ unsigned long addr,
+ pte_t *ptep, pte_t pte)
{
struct mmu_update u;
@@ -309,7 +369,7 @@ void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
u.val = pte_val_ma(pte);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
/* Assume pteval_t is equivalent to all the other *val_t types. */
@@ -403,7 +463,7 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
u.val = pud_val_ma(val);
xen_extend_mmu_update(&u);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -483,7 +543,7 @@ static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
__xen_set_p4d_hyper(ptr, val);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -515,10 +575,9 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
if (user_ptr)
__xen_set_p4d_hyper((p4d_t *)user_ptr, val);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
-#if CONFIG_PGTABLE_LEVELS >= 5
__visible p4dval_t xen_p4d_val(p4d_t p4d)
{
return pte_mfn_to_pfn(p4d.p4d);
@@ -532,7 +591,6 @@ __visible p4d_t xen_make_p4d(p4dval_t p4d)
return native_make_p4d(p4d);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d);
-#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
void (*func)(struct mm_struct *mm, struct page *,
@@ -650,8 +708,8 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
spinlock_t *ptl = NULL;
-#if USE_SPLIT_PTE_PTLOCKS
- ptl = ptlock_ptr(page);
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
+ ptl = ptlock_ptr(page_ptdesc(page));
spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif
@@ -766,6 +824,7 @@ void xen_mm_pin_all(void)
{
struct page *page;
+ spin_lock(&init_mm.page_table_lock);
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
@@ -776,6 +835,7 @@ void xen_mm_pin_all(void)
}
spin_unlock(&pgd_lock);
+ spin_unlock(&init_mm.page_table_lock);
}
static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
@@ -797,6 +857,9 @@ static void __init xen_after_bootmem(void)
SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
+
+ if (alloc_discontig_frames(MIN_CONTIG_ORDER))
+ BUG();
}
static void xen_unpin_page(struct mm_struct *mm, struct page *page,
@@ -872,6 +935,7 @@ void xen_mm_unpin_all(void)
{
struct page *page;
+ spin_lock(&init_mm.page_table_lock);
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
@@ -883,16 +947,10 @@ void xen_mm_unpin_all(void)
}
spin_unlock(&pgd_lock);
+ spin_unlock(&init_mm.page_table_lock);
}
-static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
-{
- spin_lock(&next->page_table_lock);
- xen_pgd_pin(next);
- spin_unlock(&next->page_table_lock);
-}
-
-static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static void xen_enter_mmap(struct mm_struct *mm)
{
spin_lock(&mm->page_table_lock);
xen_pgd_pin(mm);
@@ -904,7 +962,7 @@ static void drop_mm_ref_this_cpu(void *info)
struct mm_struct *mm = info;
if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
- leave_mm(smp_processor_id());
+ leave_mm();
/*
* If this cpu still has a stale cr3 reference, then make sure
@@ -1050,7 +1108,7 @@ static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
pte_t *pte_tbl;
int i;
- if (pmd_large(*pmd)) {
+ if (pmd_leaf(*pmd)) {
pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PMD_SIZE);
return;
@@ -1073,7 +1131,7 @@ static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
pmd_t *pmd_tbl;
int i;
- if (pud_large(*pud)) {
+ if (pud_leaf(*pud)) {
pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PUD_SIZE);
return;
@@ -1095,7 +1153,7 @@ static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
pud_t *pud_tbl;
int i;
- if (p4d_large(*p4d)) {
+ if (p4d_leaf(*p4d)) {
pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, P4D_SIZE);
return;
@@ -1236,7 +1294,7 @@ static noinline void xen_flush_tlb(void)
op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -1256,7 +1314,7 @@ static void xen_flush_tlb_one_user(unsigned long addr)
op->arg1.linear_addr = addr & PAGE_MASK;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -1293,7 +1351,7 @@ static void xen_flush_tlb_multi(const struct cpumask *cpus,
MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
static unsigned long xen_read_cr3(void)
@@ -1352,7 +1410,7 @@ static void xen_write_cr3(unsigned long cr3)
else
__xen_write_cr3(false, 0);
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+ xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */
}
/*
@@ -1387,7 +1445,7 @@ static void __init xen_write_cr3_init(unsigned long cr3)
__xen_write_cr3(true, cr3);
- xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
+ xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */
}
static int xen_pgd_alloc(struct mm_struct *mm)
@@ -1545,10 +1603,11 @@ static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
__set_pfn_prot(pfn, PAGE_KERNEL_RO);
- if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned)
+ if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS) &&
+ !pinned)
__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
}
}
@@ -1573,12 +1632,12 @@ static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
if (pinned) {
xen_mc_batch();
- if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
+ if (level == PT_PTE && IS_ENABLED(CONFIG_SPLIT_PTE_PTLOCKS))
__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
__set_pfn_prot(pfn, PAGE_KERNEL);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
ClearPagePinned(page);
}
@@ -1795,7 +1854,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
*/
xen_mc_batch();
__xen_write_cr3(true, __pa(init_top_pgt));
- xen_mc_issue(PARAVIRT_LAZY_CPU);
+ xen_mc_issue(XEN_LAZY_CPU);
/* We can't that easily rip out L3 and L2, as the Xen pagetables are
* set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
@@ -1854,7 +1913,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
if (!pud_present(pud))
return 0;
pa = pud_val(pud) & PTE_PFN_MASK;
- if (pud_large(pud))
+ if (pud_leaf(pud))
return pa + (vaddr & ~PUD_MASK);
pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
@@ -1862,7 +1921,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
if (!pmd_present(pmd))
return 0;
pa = pmd_val(pmd) & PTE_PFN_MASK;
- if (pmd_large(pmd))
+ if (pmd_leaf(pmd))
return pa + (vaddr & ~PMD_MASK);
pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
@@ -2010,10 +2069,7 @@ void __init xen_reserve_special_pages(void)
void __init xen_pt_check_e820(void)
{
- if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
- xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
- BUG();
- }
+ xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table");
}
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
@@ -2074,6 +2130,23 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
}
+static void xen_enter_lazy_mmu(void)
+{
+ enter_lazy(XEN_LAZY_MMU);
+}
+
+static void xen_flush_lazy_mmu(void)
+{
+ preempt_disable();
+
+ if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
+ arch_leave_lazy_mmu_mode();
+ arch_enter_lazy_mmu_mode();
+ }
+
+ preempt_enable();
+}
+
static void __init xen_post_allocator_init(void)
{
pv_ops.mmu.set_pte = xen_set_pte;
@@ -2098,7 +2171,7 @@ static void xen_leave_lazy_mmu(void)
{
preempt_disable();
xen_mc_flush();
- paravirt_leave_lazy_mmu();
+ leave_lazy(XEN_LAZY_MMU);
preempt_enable();
}
@@ -2114,7 +2187,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = {
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_multi = xen_flush_tlb_multi,
- .tlb_remove_table = tlb_remove_table,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
@@ -2148,19 +2220,16 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = {
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
-#if CONFIG_PGTABLE_LEVELS >= 5
.p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
-#endif
- .activate_mm = xen_activate_mm,
- .dup_mmap = xen_dup_mmap,
+ .enter_mmap = xen_enter_mmap,
.exit_mmap = xen_exit_mmap,
.lazy_mode = {
- .enter = paravirt_enter_lazy_mmu,
+ .enter = xen_enter_lazy_mmu,
.leave = xen_leave_lazy_mmu,
- .flush = paravirt_flush_lazy_mmu,
+ .flush = xen_flush_lazy_mmu,
},
.set_fixmap = xen_set_fixmap,
@@ -2177,10 +2246,6 @@ void __init xen_init_mmu_ops(void)
memset(dummy_mapping, 0xff, PAGE_SIZE);
}
-/* Protected by xen_reservation_lock. */
-#define MAX_CONTIG_ORDER 9 /* 2MB */
-static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
-
#define VOID_PTE (mfn_pte(0, __pgprot(0)))
static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
unsigned long *in_frames,
@@ -2194,13 +2259,13 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
mcs = __xen_mc_entry(0);
if (in_frames)
- in_frames[i] = virt_to_mfn(vaddr);
+ in_frames[i] = virt_to_mfn((void *)vaddr);
MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
- __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
+ __set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY);
if (out_frames)
- out_frames[i] = virt_to_pfn(vaddr);
+ out_frames[i] = virt_to_pfn((void *)vaddr);
}
xen_mc_issue(0);
}
@@ -2242,7 +2307,7 @@ static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
MULTI_update_va_mapping(mcs.mc, vaddr,
mfn_pte(mfn, PAGE_KERNEL), flags);
- set_phys_to_machine(virt_to_pfn(vaddr), mfn);
+ set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn);
}
xen_mc_issue(0);
@@ -2297,29 +2362,30 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
unsigned int address_bits,
dma_addr_t *dma_handle)
{
- unsigned long *in_frames = discontig_frames, out_frame;
+ unsigned long *in_frames, out_frame;
unsigned long flags;
int success;
unsigned long vstart = (unsigned long)phys_to_virt(pstart);
- /*
- * Currently an auto-translated guest will not perform I/O, nor will
- * it require PAE page directories below 4GB. Therefore any calls to
- * this function are redundant and can be ignored.
- */
+ if (unlikely(order > discontig_frames_order)) {
+ if (!discontig_frames_dyn)
+ return -ENOMEM;
- if (unlikely(order > MAX_CONTIG_ORDER))
- return -ENOMEM;
+ if (alloc_discontig_frames(order))
+ return -ENOMEM;
+ }
memset((void *) vstart, 0, PAGE_SIZE << order);
spin_lock_irqsave(&xen_reservation_lock, flags);
+ in_frames = discontig_frames;
+
/* 1. Zap current PTEs, remembering MFNs. */
xen_zap_pfn_range(vstart, order, in_frames, NULL);
/* 2. Get a new contiguous memory extent. */
- out_frame = virt_to_pfn(vstart);
+ out_frame = virt_to_pfn((void *)vstart);
success = xen_exchange_memory(1UL << order, 0, in_frames,
1, order, &out_frame,
address_bits);
@@ -2338,12 +2404,12 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
{
- unsigned long *out_frames = discontig_frames, in_frame;
+ unsigned long *out_frames, in_frame;
unsigned long flags;
int success;
unsigned long vstart;
- if (unlikely(order > MAX_CONTIG_ORDER))
+ if (unlikely(order > discontig_frames_order))
return;
vstart = (unsigned long)phys_to_virt(pstart);
@@ -2351,8 +2417,10 @@ void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
spin_lock_irqsave(&xen_reservation_lock, flags);
+ out_frames = discontig_frames;
+
/* 1. Find start MFN of contiguous extent. */
- in_frame = virt_to_mfn(vstart);
+ in_frame = virt_to_mfn((void *)vstart);
/* 2. Zap current PTEs. */
xen_zap_pfn_range(vstart, order, NULL, out_frames);
@@ -2383,7 +2451,7 @@ static noinline void xen_flush_tlb_all(void)
op->cmd = MMUEXT_TLB_FLUSH_ALL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
- xen_mc_issue(PARAVIRT_LAZY_MMU);
+ xen_mc_issue(XEN_LAZY_MMU);
preempt_enable();
}
@@ -2501,7 +2569,7 @@ out:
}
EXPORT_SYMBOL_GPL(xen_remap_pfn);
-#ifdef CONFIG_KEXEC_CORE
+#ifdef CONFIG_VMCORE_INFO
phys_addr_t paddr_vmcoreinfo_note(void)
{
if (xen_pv_domain())
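
Aside: the rename from PARAVIRT_LAZY_MMU to XEN_LAZY_MMU throughout the hunks above keeps the same batching behaviour — xen_mc_issue() only flushes the multicall buffer when the requested lazy mode is not currently active. The standalone C sketch below models just that decision; it is illustrative only, and none of the names here (lazy_mode, mc_issue, mc_flush) are kernel symbols.

#include <stdio.h>

enum lazy_mode { LAZY_NONE = 0, LAZY_MMU = 1, LAZY_CPU = 2 };

static enum lazy_mode lazy_mode = LAZY_NONE;
static int batched;

static void mc_flush(void)
{
        if (batched)
                printf("flushing %d batched call(s)\n", batched);
        batched = 0;
}

static void mc_issue(enum lazy_mode mode)
{
        /* Flush immediately unless the matching lazy mode is active. */
        if ((lazy_mode & mode) == 0)
                mc_flush();
}

int main(void)
{
        batched = 3;
        mc_issue(LAZY_MMU);     /* not lazy: flushes now */

        lazy_mode = LAZY_MMU;
        batched = 2;
        mc_issue(LAZY_MMU);     /* lazy MMU mode active: stays batched */

        lazy_mode = LAZY_NONE;
        mc_flush();             /* leaving lazy mode flushes the rest */
        return 0;
}
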
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 07054572297f..7237d56a9d3f 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -23,26 +23,21 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/debugfs.h>
+#include <linux/jump_label.h>
+#include <linux/printk.h>
#include <asm/xen/hypercall.h>
-#include "multicalls.h"
-#include "debugfs.h"
+#include "xen-ops.h"
#define MC_BATCH 32
-#define MC_DEBUG 0
-
#define MC_ARGS (MC_BATCH * 16)
struct mc_buffer {
unsigned mcidx, argidx, cbidx;
struct multicall_entry entries[MC_BATCH];
-#if MC_DEBUG
- struct multicall_entry debug[MC_BATCH];
- void *caller[MC_BATCH];
-#endif
unsigned char args[MC_ARGS];
struct callback {
void (*fn)(void *);
@@ -50,13 +45,105 @@ struct mc_buffer {
} callbacks[MC_BATCH];
};
+struct mc_debug_data {
+ struct multicall_entry entries[MC_BATCH];
+ void *caller[MC_BATCH];
+ size_t argsz[MC_BATCH];
+ unsigned long *args[MC_BATCH];
+};
+
static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+static struct mc_debug_data mc_debug_data_early __initdata;
+static struct mc_debug_data __percpu *mc_debug_data_ptr;
DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
+static struct static_key mc_debug __ro_after_init;
+static bool mc_debug_enabled __initdata;
+
+static struct mc_debug_data * __ref get_mc_debug(void)
+{
+ if (!mc_debug_data_ptr)
+ return &mc_debug_data_early;
+
+ return this_cpu_ptr(mc_debug_data_ptr);
+}
+
+static int __init xen_parse_mc_debug(char *arg)
+{
+ mc_debug_enabled = true;
+ static_key_slow_inc(&mc_debug);
+
+ return 0;
+}
+early_param("xen_mc_debug", xen_parse_mc_debug);
+
+static int __init mc_debug_enable(void)
+{
+ unsigned long flags;
+ struct mc_debug_data __percpu *mcdb;
+
+ if (!mc_debug_enabled)
+ return 0;
+
+ mcdb = alloc_percpu(struct mc_debug_data);
+ if (!mcdb) {
+ pr_err("xen_mc_debug inactive\n");
+ static_key_slow_dec(&mc_debug);
+ return -ENOMEM;
+ }
+
+ /* Be careful when switching to percpu debug data. */
+ local_irq_save(flags);
+ xen_mc_flush();
+ mc_debug_data_ptr = mcdb;
+ local_irq_restore(flags);
+
+ pr_info("xen_mc_debug active\n");
+
+ return 0;
+}
+early_initcall(mc_debug_enable);
+
+/* Number of parameters of hypercalls used via multicalls. */
+static const uint8_t hpcpars[] = {
+ [__HYPERVISOR_mmu_update] = 4,
+ [__HYPERVISOR_stack_switch] = 2,
+ [__HYPERVISOR_fpu_taskswitch] = 1,
+ [__HYPERVISOR_update_descriptor] = 2,
+ [__HYPERVISOR_update_va_mapping] = 3,
+ [__HYPERVISOR_mmuext_op] = 4,
+};
+
+static void print_debug_data(struct mc_buffer *b, struct mc_debug_data *mcdb,
+ int idx)
+{
+ unsigned int arg;
+ unsigned int opidx = mcdb->entries[idx].op & 0xff;
+ unsigned int pars = 0;
+
+ pr_err(" call %2d: op=%lu result=%ld caller=%pS ", idx + 1,
+ mcdb->entries[idx].op, b->entries[idx].result,
+ mcdb->caller[idx]);
+ if (opidx < ARRAY_SIZE(hpcpars))
+ pars = hpcpars[opidx];
+ if (pars) {
+ pr_cont("pars=");
+ for (arg = 0; arg < pars; arg++)
+ pr_cont("%lx ", mcdb->entries[idx].args[arg]);
+ }
+ if (mcdb->argsz[idx]) {
+ pr_cont("args=");
+ for (arg = 0; arg < mcdb->argsz[idx] / 8; arg++)
+ pr_cont("%lx ", mcdb->args[idx][arg]);
+ }
+ pr_cont("\n");
+}
+
void xen_mc_flush(void)
{
struct mc_buffer *b = this_cpu_ptr(&mc_buffer);
struct multicall_entry *mc;
+ struct mc_debug_data *mcdb = NULL;
int ret = 0;
unsigned long flags;
int i;
@@ -69,10 +156,11 @@ void xen_mc_flush(void)
trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
-#if MC_DEBUG
- memcpy(b->debug, b->entries,
- b->mcidx * sizeof(struct multicall_entry));
-#endif
+ if (static_key_false(&mc_debug)) {
+ mcdb = get_mc_debug();
+ memcpy(mcdb->entries, b->entries,
+ b->mcidx * sizeof(struct multicall_entry));
+ }
switch (b->mcidx) {
case 0:
@@ -103,21 +191,14 @@ void xen_mc_flush(void)
pr_err("%d of %d multicall(s) failed: cpu %d\n",
ret, b->mcidx, smp_processor_id());
for (i = 0; i < b->mcidx; i++) {
- if (b->entries[i].result < 0) {
-#if MC_DEBUG
- pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pS\n",
- i + 1,
- b->debug[i].op,
- b->debug[i].args[0],
- b->entries[i].result,
- b->caller[i]);
-#else
+ if (static_key_false(&mc_debug)) {
+ print_debug_data(b, mcdb, i);
+ } else if (b->entries[i].result < 0) {
pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\n",
i + 1,
b->entries[i].op,
b->entries[i].args[0],
b->entries[i].result);
-#endif
}
}
}
@@ -155,9 +236,13 @@ struct multicall_space __xen_mc_entry(size_t args)
}
ret.mc = &b->entries[b->mcidx];
-#if MC_DEBUG
- b->caller[b->mcidx] = __builtin_return_address(0);
-#endif
+ if (static_key_false(&mc_debug)) {
+ struct mc_debug_data *mcdb = get_mc_debug();
+
+ mcdb->caller[b->mcidx] = __builtin_return_address(0);
+ mcdb->argsz[b->mcidx] = args;
+ mcdb->args[b->mcidx] = (unsigned long *)(&b->args[argidx]);
+ }
b->mcidx++;
ret.args = &b->args[argidx];
b->argidx = argidx + args;
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
deleted file mode 100644
index 1c51b2c87f30..000000000000
--- a/arch/x86/xen/multicalls.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_MULTICALLS_H
-#define _XEN_MULTICALLS_H
-
-#include <trace/events/xen.h>
-
-#include "xen-ops.h"
-
-/* Multicalls */
-struct multicall_space
-{
- struct multicall_entry *mc;
- void *args;
-};
-
-/* Allocate room for a multicall and its args */
-struct multicall_space __xen_mc_entry(size_t args);
-
-DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
-
-/* Call to start a batch of multiple __xen_mc_entry()s. Must be
- paired with xen_mc_issue() */
-static inline void xen_mc_batch(void)
-{
- unsigned long flags;
-
- /* need to disable interrupts until this entry is complete */
- local_irq_save(flags);
- trace_xen_mc_batch(paravirt_get_lazy_mode());
- __this_cpu_write(xen_mc_irq_flags, flags);
-}
-
-static inline struct multicall_space xen_mc_entry(size_t args)
-{
- xen_mc_batch();
- return __xen_mc_entry(args);
-}
-
-/* Flush all pending multicalls */
-void xen_mc_flush(void);
-
-/* Issue a multicall if we're not in a lazy mode */
-static inline void xen_mc_issue(unsigned mode)
-{
- trace_xen_mc_issue(mode);
-
- if ((paravirt_get_lazy_mode() & mode) == 0)
- xen_mc_flush();
-
- /* restore flags saved in xen_mc_batch */
- local_irq_restore(this_cpu_read(xen_mc_irq_flags));
-}
-
-/* Set up a callback to be called when the current batch is flushed */
-void xen_mc_callback(void (*fn)(void *), void *data);
-
-/*
- * Try to extend the arguments of the previous multicall command. The
- * previous command's op must match. If it does, then it attempts to
- * extend the argument space allocated to the multicall entry by
- * arg_size bytes.
- *
- * The returned multicall_space will return with mc pointing to the
- * command on success, or NULL on failure, and args pointing to the
- * newly allocated space.
- */
-struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
-
-#endif /* _XEN_MULTICALLS_H */
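
Aside: with multicalls.h folded into xen-ops.h, the batching API itself is unchanged — callers reserve entry and argument space with __xen_mc_entry(), and the buffer is flushed either when it fills up or when xen_mc_issue() runs outside a lazy section. A minimal userspace model of the flush-when-full part, with made-up sizes and names (BATCH, ARGS, entry, flush are not the kernel's):

#include <stdio.h>
#include <string.h>

#define BATCH 4
#define ARGS  64

static struct {
        unsigned int nent, argoff;
        unsigned char args[ARGS];
} buf;

static void flush(void)
{
        printf("flush: %u entries, %u arg bytes\n", buf.nent, buf.argoff);
        buf.nent = 0;
        buf.argoff = 0;
}

static void *entry(size_t argsz)
{
        if (buf.nent == BATCH || buf.argoff + argsz > ARGS)
                flush();        /* no room left: submit what we have first */

        void *p = &buf.args[buf.argoff];
        buf.nent++;
        buf.argoff += argsz;
        return p;
}

int main(void)
{
        for (int i = 0; i < 10; i++)
                memset(entry(16), 0, 16);       /* forces a flush every 4 entries */
        flush();
        return 0;
}
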
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 58db86f7b384..56914e21e303 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -70,6 +70,7 @@
#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/acpi.h>
#include <asm/cache.h>
#include <asm/setup.h>
@@ -80,8 +81,8 @@
#include <asm/xen/hypervisor.h>
#include <xen/balloon.h>
#include <xen/grant_table.h>
+#include <xen/hvc-console.h>
-#include "multicalls.h"
#include "xen-ops.h"
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
@@ -134,11 +135,6 @@ static inline unsigned p2m_mid_index(unsigned long pfn)
return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
-static inline unsigned p2m_index(unsigned long pfn)
-{
- return pfn % P2M_PER_PAGE;
-}
-
static void p2m_top_mfn_init(unsigned long *top)
{
unsigned i;
@@ -182,13 +178,7 @@ static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
static void * __ref alloc_p2m_page(void)
{
if (unlikely(!slab_is_available())) {
- void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-
- if (!ptr)
- panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
- __func__, PAGE_SIZE, PAGE_SIZE);
-
- return ptr;
+ return memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
}
return (void *)__get_free_page(GFP_KERNEL);
@@ -560,7 +550,6 @@ int xen_alloc_p2m_entry(unsigned long pfn)
/* Separately check the mid mfn level */
unsigned long missing_mfn;
unsigned long mid_mfn_mfn;
- unsigned long old_mfn;
mid_mfn = alloc_p2m_page();
if (!mid_mfn)
@@ -570,12 +559,12 @@ int xen_alloc_p2m_entry(unsigned long pfn)
missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
mid_mfn_mfn = virt_to_mfn(mid_mfn);
- old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
- if (old_mfn != missing_mfn) {
- free_p2m_page(mid_mfn);
- mid_mfn = mfn_to_virt(old_mfn);
- } else {
+ /* try_cmpxchg() updates missing_mfn on failure. */
+ if (try_cmpxchg(top_mfn_p, &missing_mfn, mid_mfn_mfn)) {
p2m_top_mfn_p[topidx] = mid_mfn;
+ } else {
+ free_p2m_page(mid_mfn);
+ mid_mfn = mfn_to_virt(missing_mfn);
}
}
} else {
@@ -736,7 +725,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
* immediate unmapping.
*/
map_ops[i].status = GNTST_general_error;
- unmap[0].host_addr = map_ops[i].host_addr,
+ unmap[0].host_addr = map_ops[i].host_addr;
unmap[0].handle = map_ops[i].handle;
map_ops[i].handle = INVALID_GRANT_HANDLE;
if (map_ops[i].flags & GNTMAP_device_map)
@@ -746,7 +735,7 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
if (kmap_ops) {
kmap_ops[i].status = GNTST_general_error;
- unmap[1].host_addr = kmap_ops[i].host_addr,
+ unmap[1].host_addr = kmap_ops[i].host_addr;
unmap[1].handle = kmap_ops[i].handle;
kmap_ops[i].handle = INVALID_GRANT_HANDLE;
if (kmap_ops[i].flags & GNTMAP_device_map)
@@ -799,9 +788,104 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
return ret;
}
+/* Remapped non-RAM areas */
+#define NR_NONRAM_REMAP 4
+static struct nonram_remap {
+ phys_addr_t maddr;
+ phys_addr_t paddr;
+ size_t size;
+} xen_nonram_remap[NR_NONRAM_REMAP] __ro_after_init;
+static unsigned int nr_nonram_remap __ro_after_init;
+
+/*
+ * Do the real remapping of non-RAM regions as specified in the
+ * xen_nonram_remap[] array.
+ * In case of an error just crash the system.
+ */
+void __init xen_do_remap_nonram(void)
+{
+ unsigned int i;
+ unsigned int remapped = 0;
+ const struct nonram_remap *remap = xen_nonram_remap;
+ unsigned long pfn, mfn, end_pfn;
+
+ for (i = 0; i < nr_nonram_remap; i++) {
+ end_pfn = PFN_UP(remap->paddr + remap->size);
+ pfn = PFN_DOWN(remap->paddr);
+ mfn = PFN_DOWN(remap->maddr);
+ while (pfn < end_pfn) {
+ if (!set_phys_to_machine(pfn, mfn))
+ panic("Failed to set p2m mapping for pfn=%lx mfn=%lx\n",
+ pfn, mfn);
+
+ pfn++;
+ mfn++;
+ remapped++;
+ }
+
+ remap++;
+ }
+
+ pr_info("Remapped %u non-RAM page(s)\n", remapped);
+}
+
+#ifdef CONFIG_ACPI
+/*
+ * Xen variant of acpi_os_ioremap() taking potentially remapped non-RAM
+ * regions into account.
+ * Any attempt to map an area crossing a remap boundary will produce a
+ * WARN() splat.
+ * phys is related to remap->maddr on input and will be rebased to remap->paddr.
+ */
+static void __iomem *xen_acpi_os_ioremap(acpi_physical_address phys,
+ acpi_size size)
+{
+ unsigned int i;
+ const struct nonram_remap *remap = xen_nonram_remap;
+
+ for (i = 0; i < nr_nonram_remap; i++) {
+ if (phys + size > remap->maddr &&
+ phys < remap->maddr + remap->size) {
+ WARN_ON(phys < remap->maddr ||
+ phys + size > remap->maddr + remap->size);
+ phys += remap->paddr - remap->maddr;
+ break;
+ }
+ }
+
+ return x86_acpi_os_ioremap(phys, size);
+}
+#endif /* CONFIG_ACPI */
+
+/*
+ * Add a new non-RAM remap entry.
+ * In case of no free entry found, just crash the system.
+ */
+void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr,
+ unsigned long size)
+{
+ BUG_ON((maddr & ~PAGE_MASK) != (paddr & ~PAGE_MASK));
+
+ if (nr_nonram_remap == NR_NONRAM_REMAP) {
+ xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n");
+ BUG();
+ }
+
+#ifdef CONFIG_ACPI
+ /* Switch to the Xen acpi_os_ioremap() variant. */
+ if (nr_nonram_remap == 0)
+ acpi_os_ioremap = xen_acpi_os_ioremap;
+#endif
+
+ xen_nonram_remap[nr_nonram_remap].maddr = maddr;
+ xen_nonram_remap[nr_nonram_remap].paddr = paddr;
+ xen_nonram_remap[nr_nonram_remap].size = size;
+
+ nr_nonram_remap++;
+}
+
#ifdef CONFIG_XEN_DEBUG_FS
#include <linux/debugfs.h>
-#include "debugfs.h"
static int p2m_dump_show(struct seq_file *m, void *v)
{
static const char * const type_name[] = {
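
Aside: the new xen_acpi_os_ioremap() path above rebases a machine address that falls inside a remapped non-RAM region onto its new guest-physical location before handing it to the regular ioremap. A self-contained sketch of just that address arithmetic; the table contents and the rebase() helper are invented for illustration:

#include <stdio.h>
#include <stdint.h>

struct remap { uint64_t maddr, paddr, size; };

static const struct remap remaps[] = {
        { .maddr = 0x80000000, .paddr = 0xc0000000, .size = 0x10000 },
};

static uint64_t rebase(uint64_t phys, uint64_t size)
{
        for (unsigned int i = 0; i < sizeof(remaps) / sizeof(remaps[0]); i++) {
                const struct remap *r = &remaps[i];

                /* Overlap with a remapped region: shift maddr -> paddr. */
                if (phys + size > r->maddr && phys < r->maddr + r->size)
                        return phys + r->paddr - r->maddr;
        }
        return phys;    /* untouched regions map through unchanged */
}

int main(void)
{
        printf("%#llx\n", (unsigned long long)rebase(0x80001000, 0x1000)); /* 0xc0001000 */
        printf("%#llx\n", (unsigned long long)rebase(0x70000000, 0x1000)); /* unchanged */
        return 0;
}
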
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 21ecbe754cb2..8f89ce0b67e3 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -2,6 +2,7 @@
#include <linux/types.h>
#include <linux/interrupt.h>
+#include <asm/msr.h>
#include <asm/xen/hypercall.h>
#include <xen/xen.h>
#include <xen/page.h>
@@ -10,7 +11,6 @@
#include <xen/interface/xenpmu.h>
#include "xen-ops.h"
-#include "pmu.h"
/* x86_pmu.handle_irq definition */
#include "../events/perf_event.h"
@@ -129,8 +129,12 @@ static inline uint32_t get_fam15h_addr(u32 addr)
return addr;
}
-static inline bool is_amd_pmu_msr(unsigned int msr)
+static bool is_amd_pmu_msr(u32 msr)
{
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
+ return false;
+
if ((msr >= MSR_F15H_PERF_CTL &&
msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) ||
(msr >= MSR_K7_EVNTSEL0 &&
@@ -140,10 +144,15 @@ static inline bool is_amd_pmu_msr(unsigned int msr)
return false;
}
-static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
+static bool is_intel_pmu_msr(u32 msr_index, int *type, int *index)
{
u32 msr_index_pmc;
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_ZHAOXIN)
+ return false;
+
switch (msr_index) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
case MSR_IA32_DS_AREA:
@@ -186,8 +195,7 @@ static int is_intel_pmu_msr(u32 msr_index, int *type, int *index)
}
}
-static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
- int index, bool is_read)
+static bool xen_intel_pmu_emulate(u32 msr, u64 *val, int type, int index, bool is_read)
{
uint64_t *reg = NULL;
struct xen_pmu_intel_ctxt *ctxt;
@@ -249,7 +257,7 @@ static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type,
return false;
}
-static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
+static bool xen_amd_pmu_emulate(u32 msr, u64 *val, bool is_read)
{
uint64_t *reg = NULL;
int i, off = 0;
@@ -290,51 +298,20 @@ static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read)
return false;
}
-bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err)
-{
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
- if (is_amd_pmu_msr(msr)) {
- if (!xen_amd_pmu_emulate(msr, val, 1))
- *val = native_read_msr_safe(msr, err);
- return true;
- }
- } else {
- int type, index;
-
- if (is_intel_pmu_msr(msr, &type, &index)) {
- if (!xen_intel_pmu_emulate(msr, val, type, index, 1))
- *val = native_read_msr_safe(msr, err);
- return true;
- }
- }
-
- return false;
-}
-
-bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err)
+bool pmu_msr_chk_emulated(u32 msr, u64 *val, bool is_read)
{
- uint64_t val = ((uint64_t)high << 32) | low;
+ int type, index = 0;
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
- if (is_amd_pmu_msr(msr)) {
- if (!xen_amd_pmu_emulate(msr, &val, 0))
- *err = native_write_msr_safe(msr, low, high);
- return true;
- }
- } else {
- int type, index;
+ if (is_amd_pmu_msr(msr))
+ return xen_amd_pmu_emulate(msr, val, is_read);
- if (is_intel_pmu_msr(msr, &type, &index)) {
- if (!xen_intel_pmu_emulate(msr, &val, type, index, 0))
- *err = native_write_msr_safe(msr, low, high);
- return true;
- }
- }
+ if (is_intel_pmu_msr(msr, &type, &index))
+ return xen_intel_pmu_emulate(msr, val, type, index, is_read);
return false;
}
-static unsigned long long xen_amd_read_pmc(int counter)
+static u64 xen_amd_read_pmc(int counter)
{
struct xen_pmu_amd_ctxt *ctxt;
uint64_t *counter_regs;
@@ -342,11 +319,12 @@ static unsigned long long xen_amd_read_pmc(int counter)
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
- uint32_t msr;
- int err;
+ u32 msr;
+ u64 val;
msr = amd_counters_base + (counter * amd_msr_step);
- return native_read_msr_safe(msr, &err);
+ native_read_msr_safe(msr, &val);
+ return val;
}
ctxt = &xenpmu_data->pmu.c.amd;
@@ -354,7 +332,7 @@ static unsigned long long xen_amd_read_pmc(int counter)
return counter_regs[counter];
}
-static unsigned long long xen_intel_read_pmc(int counter)
+static u64 xen_intel_read_pmc(int counter)
{
struct xen_pmu_intel_ctxt *ctxt;
uint64_t *fixed_counters;
@@ -363,15 +341,16 @@ static unsigned long long xen_intel_read_pmc(int counter)
uint8_t xenpmu_flags = get_xenpmu_flags();
if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) {
- uint32_t msr;
- int err;
+ u32 msr;
+ u64 val;
if (counter & (1 << INTEL_PMC_TYPE_SHIFT))
msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff);
else
msr = MSR_IA32_PERFCTR0 + counter;
- return native_read_msr_safe(msr, &err);
+ native_read_msr_safe(msr, &val);
+ return val;
}
ctxt = &xenpmu_data->pmu.c.intel;
@@ -384,7 +363,7 @@ static unsigned long long xen_intel_read_pmc(int counter)
return arch_cntr_pair[counter].counter;
}
-unsigned long long xen_read_pmc(int counter)
+u64 xen_read_pmc(int counter)
{
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
return xen_amd_read_pmc(counter);
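
Aside: pmu_msr_chk_emulated() above replaces the separate pmu_msr_read()/pmu_msr_write() helpers; the vendor checks now live inside is_amd_pmu_msr()/is_intel_pmu_msr(), so the caller simply tries both and falls back to a real MSR access when neither matches. A toy model of that dispatch — the MSR ranges and the emulate() helper below are invented:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_amd_pmu_msr(uint32_t msr)
{
        return msr >= 0xc0010000 && msr < 0xc0010008;   /* illustrative range */
}

static bool is_intel_pmu_msr(uint32_t msr, int *type, int *index)
{
        (void)type; (void)index;
        return msr >= 0x00000186 && msr < 0x0000018a;   /* illustrative range */
}

static bool emulate(uint32_t msr, uint64_t *val, bool is_read)
{
        printf("emulating %s of MSR %#x\n", is_read ? "read" : "write", msr);
        *val = 0;
        return true;
}

static bool pmu_msr_chk_emulated(uint32_t msr, uint64_t *val, bool is_read)
{
        int type, index = 0;

        if (is_amd_pmu_msr(msr))
                return emulate(msr, val, is_read);
        if (is_intel_pmu_msr(msr, &type, &index))
                return emulate(msr, val, is_read);
        return false;   /* not a PMU MSR: caller does a real MSR access */
}

int main(void)
{
        uint64_t v;

        printf("%d\n", pmu_msr_chk_emulated(0xc0010004, &v, true));
        printf("%d\n", pmu_msr_chk_emulated(0x10, &v, true));
        return 0;
}
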
diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h
deleted file mode 100644
index 65c58894fc79..000000000000
--- a/arch/x86/xen/pmu.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef __XEN_PMU_H
-#define __XEN_PMU_H
-
-#include <xen/interface/xenpmu.h>
-
-extern bool is_xen_pmu;
-
-irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
-#ifdef CONFIG_XEN_HAVE_VPMU
-void xen_pmu_init(int cpu);
-void xen_pmu_finish(int cpu);
-#else
-static inline void xen_pmu_init(int cpu) {}
-static inline void xen_pmu_finish(int cpu) {}
-#endif
-bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err);
-bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
-int pmu_apic_update(uint32_t reg);
-unsigned long long xen_read_pmc(int counter);
-
-#endif /* __XEN_PMU_H */
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index cfa99e8f054b..3823e52aef52 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -6,19 +6,21 @@
*/
#include <linux/init.h>
+#include <linux/iscsi_ibft.h>
#include <linux/sched.h>
+#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/pm.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>
#include <linux/memory_hotplug.h>
+#include <linux/acpi.h>
#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820/api.h>
#include <asm/setup.h>
-#include <asm/acpi.h>
#include <asm/numa.h>
#include <asm/idtentry.h>
#include <asm/xen/hypervisor.h>
@@ -32,19 +34,18 @@
#include <xen/features.h>
#include <xen/hvc-console.h>
#include "xen-ops.h"
-#include "mmu.h"
#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024)
-/* Amount of extra memory space we add to the e820 ranges */
-struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
-
-/* Number of pages released from the initial allocation. */
-unsigned long xen_released_pages;
+/* Memory map would allow PCI passthrough. */
+bool xen_pv_pci_possible;
/* E820 map used during setting up memory. */
static struct e820_table xen_e820_table __initdata;
+/* Number of initially usable memory pages. */
+static unsigned long ini_nr_pages __initdata;
+
/*
* Buffer used to remap identity mapped pages. We only need the virtual space.
* The physical page behind this address is remapped as needed to different
@@ -59,18 +60,6 @@ static struct {
} xen_remap_buf __initdata __aligned(PAGE_SIZE);
static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
-/*
- * The maximum amount of extra memory compared to the base size. The
- * main scaling factor is the size of struct page. At extreme ratios
- * of base:extra, all the base memory can be filled with page
- * structures for the extra memory, leaving no space for anything
- * else.
- *
- * 10x seems like a reasonable balance between scaling flexibility and
- * leaving a practically usable system.
- */
-#define EXTRA_MEM_RATIO (10)
-
static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB);
static void __init xen_parse_512gb(void)
@@ -85,41 +74,12 @@ static void __init xen_parse_512gb(void)
arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit=");
if (!arg)
val = true;
- else if (strtobool(arg + strlen("xen_512gb_limit="), &val))
+ else if (kstrtobool(arg + strlen("xen_512gb_limit="), &val))
return;
xen_512gb_limit = val;
}
-static void __init xen_add_extra_mem(unsigned long start_pfn,
- unsigned long n_pfns)
-{
- int i;
-
- /*
- * No need to check for zero size, should happen rarely and will only
- * write a new entry regarded to be unused due to zero size.
- */
- for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
- /* Add new region. */
- if (xen_extra_mem[i].n_pfns == 0) {
- xen_extra_mem[i].start_pfn = start_pfn;
- xen_extra_mem[i].n_pfns = n_pfns;
- break;
- }
- /* Append to existing region. */
- if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns ==
- start_pfn) {
- xen_extra_mem[i].n_pfns += n_pfns;
- break;
- }
- }
- if (i == XEN_EXTRA_MEM_MAX_REGIONS)
- printk(KERN_WARNING "Warning: not enough extra memory regions\n");
-
- memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns));
-}
-
static void __init xen_del_extra_mem(unsigned long start_pfn,
unsigned long n_pfns)
{
@@ -252,7 +212,7 @@ static int __init xen_free_mfn(unsigned long mfn)
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
- unsigned long end_pfn, unsigned long nr_pages)
+ unsigned long end_pfn)
{
unsigned long pfn, end;
int ret;
@@ -260,7 +220,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
WARN_ON(start_pfn > end_pfn);
/* Release pages first. */
- end = min(end_pfn, nr_pages);
+ end = min(end_pfn, ini_nr_pages);
for (pfn = start_pfn; pfn < end; pfn++) {
unsigned long mfn = pfn_to_mfn(pfn);
@@ -338,7 +298,7 @@ static void __init xen_do_set_identity_and_remap_chunk(
WARN_ON(size == 0);
- mfn_save = virt_to_mfn(buf);
+ mfn_save = virt_to_mfn((void *)buf);
for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
ident_pfn_iter < ident_end_pfn;
@@ -381,15 +341,14 @@ static void __init xen_do_set_identity_and_remap_chunk(
* to Xen and not remapped.
*/
static unsigned long __init xen_set_identity_and_remap_chunk(
- unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
- unsigned long remap_pfn)
+ unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn)
{
unsigned long pfn;
unsigned long i = 0;
unsigned long n = end_pfn - start_pfn;
if (remap_pfn == 0)
- remap_pfn = nr_pages;
+ remap_pfn = ini_nr_pages;
while (i < n) {
unsigned long cur_pfn = start_pfn + i;
@@ -398,19 +357,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
unsigned long remap_range_size;
/* Do not remap pages beyond the current allocation */
- if (cur_pfn >= nr_pages) {
+ if (cur_pfn >= ini_nr_pages) {
/* Identity map remaining pages */
set_phys_range_identity(cur_pfn, cur_pfn + size);
break;
}
- if (cur_pfn + size > nr_pages)
- size = nr_pages - cur_pfn;
+ if (cur_pfn + size > ini_nr_pages)
+ size = ini_nr_pages - cur_pfn;
remap_range_size = xen_find_pfn_range(&remap_pfn);
if (!remap_range_size) {
pr_warn("Unable to find available pfn range, not remapping identity pages\n");
xen_set_identity_and_release_chunk(cur_pfn,
- cur_pfn + left, nr_pages);
+ cur_pfn + left);
break;
}
/* Adjust size to fit in current e820 RAM region */
@@ -437,18 +396,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
}
static unsigned long __init xen_count_remap_pages(
- unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
+ unsigned long start_pfn, unsigned long end_pfn,
unsigned long remap_pages)
{
- if (start_pfn >= nr_pages)
+ if (start_pfn >= ini_nr_pages)
return remap_pages;
- return remap_pages + min(end_pfn, nr_pages) - start_pfn;
+ return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn;
}
-static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
+static unsigned long __init xen_foreach_remap_area(
unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn,
- unsigned long nr_pages, unsigned long last_val))
+ unsigned long last_val))
{
phys_addr_t start = 0;
unsigned long ret_val = 0;
@@ -476,8 +435,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
end_pfn = PFN_UP(entry->addr);
if (start_pfn < end_pfn)
- ret_val = func(start_pfn, end_pfn, nr_pages,
- ret_val);
+ ret_val = func(start_pfn, end_pfn, ret_val);
start = end;
}
}
@@ -501,7 +459,7 @@ void __init xen_remap_memory(void)
unsigned long pfn_s = ~0UL;
unsigned long len = 0;
- mfn_save = virt_to_mfn(buf);
+ mfn_save = virt_to_mfn((void *)buf);
while (xen_remap_mfn != INVALID_P2M_ENTRY) {
/* Map the remap information */
@@ -534,6 +492,8 @@ void __init xen_remap_memory(void)
set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
pr_info("Remapped %ld page(s)\n", remapped);
+
+ xen_do_remap_nonram();
}
static unsigned long __init xen_get_pages_limit(void)
@@ -607,7 +567,7 @@ static void __init xen_ignore_unusable(void)
}
}
-bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
+static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
{
struct e820_entry *entry;
unsigned mapcnt;
@@ -665,6 +625,111 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size)
}
/*
+ * Swap a non-RAM E820 map entry with RAM above ini_nr_pages.
+ * Note that the E820 map is modified accordingly, but the P2M map isn't yet.
+ * The adaptation of the P2M must be deferred until page allocation is possible.
+ */
+static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry)
+{
+ struct e820_entry *entry;
+ unsigned int mapcnt;
+ phys_addr_t mem_end = PFN_PHYS(ini_nr_pages);
+ phys_addr_t swap_addr, swap_size, entry_end;
+
+ swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr);
+ swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size);
+ entry = xen_e820_table.entries;
+
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+ entry_end = entry->addr + entry->size;
+ if (entry->type == E820_TYPE_RAM && entry->size >= swap_size &&
+ entry_end - swap_size >= mem_end) {
+ /* Reduce RAM entry by needed space (whole pages). */
+ entry->size -= swap_size;
+
+ /* Add new entry at the end of E820 map. */
+ entry = xen_e820_table.entries +
+ xen_e820_table.nr_entries;
+ xen_e820_table.nr_entries++;
+
+ /* Fill new entry (keep size and page offset). */
+ entry->type = swap_entry->type;
+ entry->addr = entry_end - swap_size +
+ swap_addr - swap_entry->addr;
+ entry->size = swap_entry->size;
+
+ /* Convert old entry to RAM, align to pages. */
+ swap_entry->type = E820_TYPE_RAM;
+ swap_entry->addr = swap_addr;
+ swap_entry->size = swap_size;
+
+ /* Remember PFN<->MFN relation for P2M update. */
+ xen_add_remap_nonram(swap_addr, entry_end - swap_size,
+ swap_size);
+
+ /* Order E820 table and merge entries. */
+ e820__update_table(&xen_e820_table);
+
+ return;
+ }
+
+ entry++;
+ }
+
+ xen_raw_console_write("No suitable area found for required E820 entry remapping action\n");
+ BUG();
+}
+
+/*
+ * Look for non-RAM memory types in a specific guest physical area and move
+ * those away if possible (ACPI NVS only for now).
+ */
+static void __init xen_e820_resolve_conflicts(phys_addr_t start,
+ phys_addr_t size)
+{
+ struct e820_entry *entry;
+ unsigned int mapcnt;
+ phys_addr_t end;
+
+ if (!size)
+ return;
+
+ end = start + size;
+ entry = xen_e820_table.entries;
+
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+ if (entry->addr >= end)
+ return;
+
+ if (entry->addr + entry->size > start &&
+ entry->type == E820_TYPE_NVS)
+ xen_e820_swap_entry_with_ram(entry);
+
+ entry++;
+ }
+}
+
+/*
+ * Check for an area in physical memory to be usable for non-movable purposes.
+ * An area is considered usable if the used E820 map lists it as RAM or
+ * some other type which can be moved to higher PFNs while keeping the MFNs.
+ * In case the area is not usable, crash the system with an error message.
+ */
+void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size,
+ const char *component)
+{
+ xen_e820_resolve_conflicts(start, size);
+
+ if (!xen_is_e820_reserved(start, size))
+ return;
+
+ xen_raw_console_write("Xen hypervisor allocated ");
+ xen_raw_console_write(component);
+ xen_raw_console_write(" memory conflicts with E820 map\n");
+ BUG();
+}
+
+/*
* Like memcpy, but with physical addresses for dest and src.
*/
static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src,
@@ -723,20 +788,20 @@ static void __init xen_reserve_xen_mfnlist(void)
**/
char * __init xen_memory_setup(void)
{
- unsigned long max_pfn, pfn_s, n_pfns;
+ unsigned long pfn_s, n_pfns;
phys_addr_t mem_end, addr, size, chunk_size;
u32 type;
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
unsigned long extra_pages = 0;
+ unsigned long maxmem_pages;
int i;
int op;
xen_parse_512gb();
- max_pfn = xen_get_pages_limit();
- max_pfn = min(max_pfn, xen_start_info->nr_pages);
- mem_end = PFN_PHYS(max_pfn);
+ ini_nr_pages = min(xen_get_pages_limit(), xen_start_info->nr_pages);
+ mem_end = PFN_PHYS(ini_nr_pages);
memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
@@ -763,27 +828,58 @@ char * __init xen_memory_setup(void)
BUG_ON(memmap.nr_entries == 0);
xen_e820_table.nr_entries = memmap.nr_entries;
- /*
- * Xen won't allow a 1:1 mapping to be created to UNUSABLE
- * regions, so if we're using the machine memory map leave the
- * region as RAM as it is in the pseudo-physical map.
- *
- * UNUSABLE regions in domUs are not handled and will need
- * a patch in the future.
- */
- if (xen_initial_domain())
+ if (xen_initial_domain()) {
+ /*
+ * Xen won't allow a 1:1 mapping to be created to UNUSABLE
+ * regions, so if we're using the machine memory map leave the
+ * region as RAM as it is in the pseudo-physical map.
+ *
+ * UNUSABLE regions in domUs are not handled and will need
+ * a patch in the future.
+ */
xen_ignore_unusable();
+#ifdef CONFIG_ISCSI_IBFT_FIND
+ /* Reserve 0.5 MiB to 1 MiB region so iBFT can be found */
+ xen_e820_table.entries[xen_e820_table.nr_entries].addr = IBFT_START;
+ xen_e820_table.entries[xen_e820_table.nr_entries].size = IBFT_END - IBFT_START;
+ xen_e820_table.entries[xen_e820_table.nr_entries].type = E820_TYPE_RESERVED;
+ xen_e820_table.nr_entries++;
+#endif
+ }
+
/* Make sure the Xen-supplied memory map is well-ordered. */
e820__update_table(&xen_e820_table);
+ /*
+ * Check whether the kernel itself conflicts with the target E820 map.
+ * Failing now is better than running into weird problems later due
+ * to relocating (and even reusing) pages with kernel text or data.
+ */
+ xen_chk_is_e820_usable(__pa_symbol(_text),
+ __pa_symbol(_end) - __pa_symbol(_text),
+ "kernel");
+
+ /*
+ * Check for a conflict of the xen_start_info memory with the target
+ * E820 map.
+ */
+ xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info),
+ "xen_start_info");
+
+ /*
+ * Check for a conflict of the hypervisor supplied page tables with
+ * the target E820 map.
+ */
+ xen_pt_check_e820();
+
max_pages = xen_get_max_pages();
/* How many extra pages do we need due to remapping? */
- max_pages += xen_foreach_remap_area(max_pfn, xen_count_remap_pages);
+ max_pages += xen_foreach_remap_area(xen_count_remap_pages);
- if (max_pages > max_pfn)
- extra_pages += max_pages - max_pfn;
+ if (max_pages > ini_nr_pages)
+ extra_pages += max_pages - ini_nr_pages;
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
@@ -792,8 +888,8 @@ char * __init xen_memory_setup(void)
* Make sure we have no memory above max_pages, as this area
* isn't handled by the p2m management.
*/
- extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
- extra_pages, max_pages - max_pfn);
+ maxmem_pages = EXTRA_MEM_RATIO * min(ini_nr_pages, PFN_DOWN(MAXMEM));
+ extra_pages = min3(maxmem_pages, extra_pages, max_pages - ini_nr_pages);
i = 0;
addr = xen_e820_table.entries[0].addr;
size = xen_e820_table.entries[0].size;
@@ -803,6 +899,9 @@ char * __init xen_memory_setup(void)
chunk_size = size;
type = xen_e820_table.entries[i].type;
+ if (type == E820_TYPE_RESERVED)
+ xen_pv_pci_possible = true;
+
if (type == E820_TYPE_RAM) {
if (addr < mem_end) {
chunk_size = min(size, mem_end - addr);
@@ -846,23 +945,6 @@ char * __init xen_memory_setup(void)
e820__update_table(e820_table);
- /*
- * Check whether the kernel itself conflicts with the target E820 map.
- * Failing now is better than running into weird problems later due
- * to relocating (and even reusing) pages with kernel text or data.
- */
- if (xen_is_e820_reserved(__pa_symbol(_text),
- __pa_symbol(__bss_stop) - __pa_symbol(_text))) {
- xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n");
- BUG();
- }
-
- /*
- * Check for a conflict of the hypervisor supplied page tables with
- * the target E820 map.
- */
- xen_pt_check_e820();
-
xen_reserve_xen_mfnlist();
/* Check for a conflict of the initrd with the target E820 map. */
@@ -890,7 +972,7 @@ char * __init xen_memory_setup(void)
* Set identity map on non-RAM pages and prepare remapping the
* underlying RAM.
*/
- xen_foreach_remap_area(max_pfn, xen_set_identity_and_remap_chunk);
+ xen_foreach_remap_area(xen_set_identity_and_remap_chunk);
pr_info("Released %ld page(s)\n", xen_released_pages);
@@ -910,17 +992,9 @@ static int register_callback(unsigned type, const void *func)
void xen_enable_sysenter(void)
{
- int ret;
- unsigned sysenter_feature;
-
- sysenter_feature = X86_FEATURE_SYSENTER32;
-
- if (!boot_cpu_has(sysenter_feature))
- return;
-
- ret = register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat);
- if(ret != 0)
- setup_clear_cpu_cap(sysenter_feature);
+ if (cpu_feature_enabled(X86_FEATURE_SYSENTER32) &&
+ register_callback(CALLBACKTYPE_sysenter, xen_entry_SYSENTER_compat))
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
}
void xen_enable_syscall(void)
@@ -934,22 +1008,15 @@ void xen_enable_syscall(void)
mechanism for syscalls. */
}
- if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
- ret = register_callback(CALLBACKTYPE_syscall32,
- xen_entry_SYSCALL_compat);
- if (ret != 0)
- setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
- }
+ if (cpu_feature_enabled(X86_FEATURE_SYSCALL32) &&
+ register_callback(CALLBACKTYPE_syscall32, xen_entry_SYSCALL_compat))
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
}
static void __init xen_pvmmu_arch_setup(void)
{
- HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);
- HYPERVISOR_vm_assist(VMASST_CMD_enable,
- VMASST_TYPE_pae_extended_cr3);
-
if (register_callback(CALLBACKTYPE_event,
xen_asm_exc_xen_hypervisor_callback) ||
register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
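
Aside: the extra-memory clamp kept in the setup.c hunks above bounds extra_pages both by EXTRA_MEM_RATIO times the initially usable pages and by the hypervisor's max_pages. A tiny numeric illustration with made-up figures; the PFN_DOWN(MAXMEM) term is left out for brevity and min3ul() is a stand-in for the kernel's min3():

#include <stdio.h>

#define EXTRA_MEM_RATIO 10

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
        unsigned long m = a < b ? a : b;

        return m < c ? m : c;
}

int main(void)
{
        unsigned long ini_nr_pages = 1UL << 20;                 /* 4 GiB of 4 KiB pages */
        unsigned long max_pages    = ini_nr_pages + (1UL << 18);
        unsigned long extra_pages  = max_pages - ini_nr_pages;
        unsigned long maxmem_pages = EXTRA_MEM_RATIO * ini_nr_pages;

        extra_pages = min3ul(maxmem_pages, extra_pages, max_pages - ini_nr_pages);
        printf("extra_pages = %lu\n", extra_pages);             /* 262144: capped by max_pages */
        return 0;
}
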
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c3e1f9a7d43a..05f92c812ac8 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -9,7 +9,6 @@
#include <xen/hvc-console.h>
#include "xen-ops.h"
-#include "smp.h"
static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
@@ -32,30 +31,30 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
void xen_smp_intr_free(unsigned int cpu)
{
+ kfree(per_cpu(xen_resched_irq, cpu).name);
+ per_cpu(xen_resched_irq, cpu).name = NULL;
if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu).irq, NULL);
per_cpu(xen_resched_irq, cpu).irq = -1;
- kfree(per_cpu(xen_resched_irq, cpu).name);
- per_cpu(xen_resched_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_callfunc_irq, cpu).name);
+ per_cpu(xen_callfunc_irq, cpu).name = NULL;
if (per_cpu(xen_callfunc_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu).irq, NULL);
per_cpu(xen_callfunc_irq, cpu).irq = -1;
- kfree(per_cpu(xen_callfunc_irq, cpu).name);
- per_cpu(xen_callfunc_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_debug_irq, cpu).name);
+ per_cpu(xen_debug_irq, cpu).name = NULL;
if (per_cpu(xen_debug_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu).irq, NULL);
per_cpu(xen_debug_irq, cpu).irq = -1;
- kfree(per_cpu(xen_debug_irq, cpu).name);
- per_cpu(xen_debug_irq, cpu).name = NULL;
}
+ kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
+ per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
if (per_cpu(xen_callfuncsingle_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu).irq,
NULL);
per_cpu(xen_callfuncsingle_irq, cpu).irq = -1;
- kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
- per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
}
}
@@ -65,6 +64,9 @@ int xen_smp_intr_init(unsigned int cpu)
char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
+ if (!resched_name)
+ goto fail_mem;
+ per_cpu(xen_resched_irq, cpu).name = resched_name;
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
cpu,
xen_reschedule_interrupt,
@@ -74,9 +76,11 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_resched_irq, cpu).irq = rc;
- per_cpu(xen_resched_irq, cpu).name = resched_name;
callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
+ if (!callfunc_name)
+ goto fail_mem;
+ per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
cpu,
xen_call_function_interrupt,
@@ -86,10 +90,13 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_callfunc_irq, cpu).irq = rc;
- per_cpu(xen_callfunc_irq, cpu).name = callfunc_name;
if (!xen_fifo_events) {
debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ if (!debug_name)
+ goto fail_mem;
+
+ per_cpu(xen_debug_irq, cpu).name = debug_name;
rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu,
xen_debug_interrupt,
IRQF_PERCPU | IRQF_NOBALANCING,
@@ -97,10 +104,13 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_debug_irq, cpu).irq = rc;
- per_cpu(xen_debug_irq, cpu).name = debug_name;
}
callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
+ if (!callfunc_name)
+ goto fail_mem;
+
+ per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
cpu,
xen_call_function_single_interrupt,
@@ -110,10 +120,11 @@ int xen_smp_intr_init(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
- per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
return 0;
+ fail_mem:
+ rc = -ENOMEM;
fail:
xen_smp_intr_free(cpu);
return rc;
@@ -123,8 +134,6 @@ void __init xen_smp_cpus_done(unsigned int max_cpus)
{
if (xen_hvm_domain())
native_smp_cpus_done(max_cpus);
- else
- calculate_max_logical_packages();
}
void xen_smp_send_reschedule(int cpu)
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
deleted file mode 100644
index bd02f9d50107..000000000000
--- a/arch/x86/xen/smp.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _XEN_SMP_H
-
-#ifdef CONFIG_SMP
-extern void xen_send_IPI_mask(const struct cpumask *mask,
- int vector);
-extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
- int vector);
-extern void xen_send_IPI_allbutself(int vector);
-extern void xen_send_IPI_all(int vector);
-extern void xen_send_IPI_self(int vector);
-
-extern int xen_smp_intr_init(unsigned int cpu);
-extern void xen_smp_intr_free(unsigned int cpu);
-int xen_smp_intr_init_pv(unsigned int cpu);
-void xen_smp_intr_free_pv(unsigned int cpu);
-
-void xen_smp_cpus_done(unsigned int max_cpus);
-
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(const struct cpumask *mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
-
-struct xen_common_irq {
- int irq;
- char *name;
-};
-#else /* CONFIG_SMP */
-
-static inline int xen_smp_intr_init(unsigned int cpu)
-{
- return 0;
-}
-static inline void xen_smp_intr_free(unsigned int cpu) {}
-
-static inline int xen_smp_intr_init_pv(unsigned int cpu)
-{
- return 0;
-}
-static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
-#endif /* CONFIG_SMP */
-
-#endif
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
index b70afdff419c..485c1d8804f7 100644
--- a/arch/x86/xen/smp_hvm.c
+++ b/arch/x86/xen/smp_hvm.c
@@ -5,8 +5,6 @@
#include <xen/events.h>
#include "xen-ops.h"
-#include "smp.h"
-
static void __init xen_hvm_smp_prepare_boot_cpu(void)
{
@@ -55,18 +53,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
}
#ifdef CONFIG_HOTPLUG_CPU
-static void xen_hvm_cpu_die(unsigned int cpu)
+static void xen_hvm_cleanup_dead_cpu(unsigned int cpu)
{
- if (common_cpu_die(cpu) == 0) {
- if (xen_have_vector_callback) {
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- xen_teardown_timer(cpu);
- }
+ if (xen_have_vector_callback) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
}
}
#else
-static void xen_hvm_cpu_die(unsigned int cpu)
+static void xen_hvm_cleanup_dead_cpu(unsigned int cpu)
{
BUG();
}
@@ -77,7 +73,7 @@ void __init xen_hvm_smp_init(void)
smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
smp_ops.smp_cpus_done = xen_smp_cpus_done;
- smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.cleanup_dead_cpu = xen_hvm_cleanup_dead_cpu;
if (!xen_have_vector_callback) {
#ifdef CONFIG_PARAVIRT_SPINLOCKS
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index ba7af2eca755..9bb8ff8bff30 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -29,6 +29,7 @@
#include <asm/idtentry.h>
#include <asm/desc.h>
#include <asm/cpu.h>
+#include <asm/apic.h>
#include <asm/io_apic.h>
#include <xen/interface/xen.h>
@@ -45,9 +46,6 @@
#include <xen/hvc-console.h>
#include "xen-ops.h"
-#include "mmu.h"
-#include "smp.h"
-#include "pmu.h"
cpumask_var_t xen_cpu_initialized_map;
@@ -55,14 +53,15 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
-void asm_cpu_bringup_and_idle(void);
static void cpu_bringup(void)
{
int cpu;
cr4_init();
+ cpuhp_ap_sync_alive();
cpu_init();
+ fpu__init_cpu();
touch_softlockup_watchdog();
/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
@@ -71,8 +70,7 @@ static void cpu_bringup(void)
xen_enable_syscall();
}
cpu = smp_processor_id();
- smp_store_cpu_info(cpu);
- cpu_data(cpu).x86_max_cores = 1;
+ identify_secondary_cpu(cpu);
set_cpu_sibling_map(cpu);
speculative_store_bypass_ht_init();
@@ -83,7 +81,7 @@ static void cpu_bringup(void)
set_cpu_online(cpu, true);
- cpu_set_state_online(cpu); /* Implies full memory barrier. */
+ smp_mb();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
@@ -97,18 +95,18 @@ asmlinkage __visible void cpu_bringup_and_idle(void)
void xen_smp_intr_free_pv(unsigned int cpu)
{
+ kfree(per_cpu(xen_irq_work, cpu).name);
+ per_cpu(xen_irq_work, cpu).name = NULL;
if (per_cpu(xen_irq_work, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
per_cpu(xen_irq_work, cpu).irq = -1;
- kfree(per_cpu(xen_irq_work, cpu).name);
- per_cpu(xen_irq_work, cpu).name = NULL;
}
+ kfree(per_cpu(xen_pmu_irq, cpu).name);
+ per_cpu(xen_pmu_irq, cpu).name = NULL;
if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
per_cpu(xen_pmu_irq, cpu).irq = -1;
- kfree(per_cpu(xen_pmu_irq, cpu).name);
- per_cpu(xen_pmu_irq, cpu).name = NULL;
}
}
@@ -118,6 +116,7 @@ int xen_smp_intr_init_pv(unsigned int cpu)
char *callfunc_name, *pmu_name;
callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+ per_cpu(xen_irq_work, cpu).name = callfunc_name;
rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
cpu,
xen_irq_work_interrupt,
@@ -127,10 +126,10 @@ int xen_smp_intr_init_pv(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_irq_work, cpu).irq = rc;
- per_cpu(xen_irq_work, cpu).name = callfunc_name;
if (is_xen_pmu) {
pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+ per_cpu(xen_pmu_irq, cpu).name = pmu_name;
rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
xen_pmu_irq_handler,
IRQF_PERCPU|IRQF_NOBALANCING,
@@ -138,7 +137,6 @@ int xen_smp_intr_init_pv(unsigned int cpu)
if (rc < 0)
goto fail;
per_cpu(xen_pmu_irq, cpu).irq = rc;
- per_cpu(xen_pmu_irq, cpu).name = pmu_name;
}
return 0;
@@ -148,40 +146,18 @@ int xen_smp_intr_init_pv(unsigned int cpu)
return rc;
}
-static void __init _get_smp_config(unsigned int early)
+static void __init xen_pv_smp_config(void)
{
- int i, rc;
- unsigned int subtract = 0;
-
- if (early)
- return;
-
- num_processors = 0;
- disabled_cpus = 0;
- for (i = 0; i < nr_cpu_ids; i++) {
- rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0) {
- num_processors++;
- set_cpu_possible(i, true);
- } else {
- set_cpu_possible(i, false);
- set_cpu_present(i, false);
- subtract++;
- }
- }
-#ifdef CONFIG_HOTPLUG_CPU
- /* This is akin to using 'nr_cpus' on the Linux command line.
- * Which is OK as when we use 'dom0_max_vcpus=X' we can only
- * have up to X, while nr_cpu_ids is greater than X. This
- * normally is not a problem, except when CPU hotplugging
- * is involved and then there might be more than X CPUs
- * in the guest - which will not work as there is no
- * hypercall to expand the max number of VCPUs an already
- * running guest has. So cap it up to X. */
- if (subtract)
- nr_cpu_ids = nr_cpu_ids - subtract;
-#endif
+ u32 apicid = 0;
+ int i;
+
+ topology_register_boot_apic(apicid);
+ for (i = 0; i < nr_cpu_ids; i++)
+ topology_register_apic(apicid++, CPU_ACPIID_INVALID, true);
+
+ /* Pretend to be a proper enumerated system */
+ smp_found_config = 1;
}
static void __init xen_pv_smp_prepare_boot_cpu(void)
@@ -209,7 +185,7 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
- if (skip_ioapic_setup) {
+ if (ioapic_is_disabled) {
char *m = (max_cpus == 0) ?
"The nosmp parameter is incompatible with Xen; " \
"use Xen dom0_max_vcpus=1 parameter" :
@@ -222,8 +198,6 @@ static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
smp_prepare_cpus_common();
- cpu_data(0).x86_max_cores = 1;
-
speculative_store_bypass_ht_init();
xen_pmu_init(0);
@@ -254,15 +228,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
struct desc_struct *gdt;
unsigned long gdt_mfn;
- /* used to tell cpu_init() that it can proceed with initialization */
- cpumask_set_cpu(cpu, cpu_callout_mask);
if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
if (ctxt == NULL) {
cpumask_clear_cpu(cpu, xen_cpu_initialized_map);
- cpumask_clear_cpu(cpu, cpu_callout_mask);
return -ENOMEM;
}
@@ -316,7 +287,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
return 0;
}
-static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
+static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle)
{
int rc;
@@ -326,14 +297,6 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
xen_setup_runstate_info(cpu);
- /*
- * PV VCPUs are always successfully taken down (see 'while' loop
- * in xen_cpu_die()), so -EBUSY is an error.
- */
- rc = cpu_check_up_prepare(cpu);
- if (rc)
- return rc;
-
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -343,15 +306,20 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
xen_pmu_init(cpu);
- rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
- BUG_ON(rc);
-
- while (cpu_report_state(cpu) != CPU_ONLINE)
- HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+ /*
+ * Why is this a BUG? If the hypercall fails then everything can be
+ * rolled back, no?
+ */
+ BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL));
return 0;
}
+static void xen_pv_poll_sync_state(void)
+{
+ HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+}
+
#ifdef CONFIG_HOTPLUG_CPU
static int xen_pv_cpu_disable(void)
{
@@ -367,35 +335,26 @@ static int xen_pv_cpu_disable(void)
static void xen_pv_cpu_die(unsigned int cpu)
{
- while (HYPERVISOR_vcpu_op(VCPUOP_is_up,
- xen_vcpu_nr(cpu), NULL)) {
+ while (HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), NULL)) {
__set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(HZ/10);
}
+}
- if (common_cpu_die(cpu) == 0) {
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- xen_teardown_timer(cpu);
- xen_pmu_finish(cpu);
- }
+static void xen_pv_cleanup_dead_cpu(unsigned int cpu)
+{
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+ xen_pmu_finish(cpu);
}
-static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
+static void __noreturn xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
{
play_dead_common();
HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
- cpu_bringup();
- /*
- * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
- * clears certain data that the cpu_idle loop (which called us
- * and that we return from) expects. The only way to get that
- * data back is to call:
- */
- tick_nohz_idle_enter();
- tick_nohz_idle_stop_tick_protected();
-
- cpuhp_online_idle(CPUHP_AP_ONLINE_IDLE);
+ xen_cpu_bringup_again((unsigned long)task_pt_regs(current));
+ BUG();
}
#else /* !CONFIG_HOTPLUG_CPU */
@@ -409,7 +368,12 @@ static void xen_pv_cpu_die(unsigned int cpu)
BUG();
}
-static void xen_pv_play_dead(void)
+static void xen_pv_cleanup_dead_cpu(unsigned int cpu)
+{
+ BUG();
+}
+
+static void __noreturn xen_pv_play_dead(void)
{
BUG();
}
@@ -442,13 +406,29 @@ static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}
+void __init xen_smp_count_cpus(void)
+{
+ unsigned int cpus;
+
+ for (cpus = 0; cpus < nr_cpu_ids; cpus++) {
+ if (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpus, NULL) < 0)
+ break;
+ }
+
+ pr_info("Xen PV: Detected %u vCPUS\n", cpus);
+ if (cpus < nr_cpu_ids)
+ set_nr_cpu_ids(cpus);
+}
+
static const struct smp_ops xen_smp_ops __initconst = {
.smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
.smp_prepare_cpus = xen_pv_smp_prepare_cpus,
.smp_cpus_done = xen_smp_cpus_done,
- .cpu_up = xen_pv_cpu_up,
+ .kick_ap_alive = xen_pv_kick_ap,
.cpu_die = xen_pv_cpu_die,
+ .cleanup_dead_cpu = xen_pv_cleanup_dead_cpu,
+ .poll_sync_state = xen_pv_poll_sync_state,
.cpu_disable = xen_pv_cpu_disable,
.play_dead = xen_pv_play_dead,
@@ -464,6 +444,12 @@ void __init xen_smp_init(void)
smp_ops = xen_smp_ops;
/* Avoid searching for BIOS MP tables */
- x86_init.mpparse.find_smp_config = x86_init_noop;
- x86_init.mpparse.get_smp_config = _get_smp_config;
+ x86_init.mpparse.find_mptable = x86_init_noop;
+ x86_init.mpparse.early_parse_smp_cfg = x86_init_noop;
+
+ /* XEN/PV Dom0 has halfways sane topology information via CPUID/MADT */
+ if (xen_initial_domain())
+ x86_init.mpparse.parse_smp_cfg = x86_init_noop;
+ else
+ x86_init.mpparse.parse_smp_cfg = xen_pv_smp_config;
}
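
Aside: xen_smp_count_cpus() above sizes nr_cpu_ids by probing vCPU ids with VCPUOP_is_up until the hypervisor rejects one. A standalone model of that loop, with vcpu_is_up() standing in for the hypercall and a fixed 4-vCPU guest assumed:

#include <stdio.h>

#define NR_CPU_IDS 8

static int vcpu_is_up(unsigned int cpu)
{
        return cpu < 4 ? 0 : -1;        /* pretend the guest has 4 vCPUs */
}

int main(void)
{
        unsigned int cpus, nr_cpu_ids = NR_CPU_IDS;

        for (cpus = 0; cpus < nr_cpu_ids; cpus++) {
                if (vcpu_is_up(cpus) < 0)
                        break;
        }

        printf("Detected %u vCPUs\n", cpus);
        if (cpus < nr_cpu_ids)
                nr_cpu_ids = cpus;      /* mirrors set_nr_cpu_ids(cpus) */

        printf("nr_cpu_ids = %u\n", nr_cpu_ids);
        return 0;
}
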
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 043c73dfd2c9..8e4efe0fb6f9 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -18,7 +18,6 @@
static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
static DEFINE_PER_CPU(char *, irq_name);
static DEFINE_PER_CPU(atomic_t, xen_qlock_wait_nest);
-static bool xen_pvspin = true;
static void xen_qlock_kick(int cpu)
{
@@ -68,13 +67,14 @@ void xen_init_lock_cpu(int cpu)
int irq;
char *name;
- if (!xen_pvspin)
+ if (nopvspin)
return;
WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n",
cpu, per_cpu(lock_kicker_irq, cpu));
name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
+ per_cpu(irq_name, cpu) = name;
irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
cpu,
dummy_handler,
@@ -85,7 +85,6 @@ void xen_init_lock_cpu(int cpu)
if (irq >= 0) {
disable_irq(irq); /* make sure it's never delivered */
per_cpu(lock_kicker_irq, cpu) = irq;
- per_cpu(irq_name, cpu) = name;
}
printk("cpu %d spinlock event irq %d\n", cpu, irq);
@@ -95,9 +94,11 @@ void xen_uninit_lock_cpu(int cpu)
{
int irq;
- if (!xen_pvspin)
+ if (nopvspin)
return;
+ kfree(per_cpu(irq_name, cpu));
+ per_cpu(irq_name, cpu) = NULL;
/*
* When booting the kernel with 'mitigations=auto,nosmt', the secondary
* CPUs are not activated, and lock_kicker_irq is not initialized.
@@ -108,8 +109,6 @@ void xen_uninit_lock_cpu(int cpu)
unbind_from_irqhandler(irq, NULL);
per_cpu(lock_kicker_irq, cpu) = -1;
- kfree(per_cpu(irq_name, cpu));
- per_cpu(irq_name, cpu) = NULL;
}
PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
@@ -125,10 +124,10 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen);
void __init xen_init_spinlocks(void)
{
/* Don't need to use pvqspinlock code if there is only 1 vCPU. */
- if (num_possible_cpus() == 1 || nopvspin)
- xen_pvspin = false;
+ if (num_possible_cpus() == 1)
+ nopvspin = true;
- if (!xen_pvspin) {
+ if (nopvspin) {
printk(KERN_DEBUG "xen: PV spinlocks disabled\n");
static_branch_disable(&virt_spin_lock_key);
return;
@@ -143,12 +142,3 @@ void __init xen_init_spinlocks(void)
pv_ops.lock.kick = xen_qlock_kick;
pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
}
-
-static __init int xen_parse_nopvspin(char *arg)
-{
- pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n");
- xen_pvspin = false;
- return 0;
-}
-early_param("xen_nopvspin", xen_parse_nopvspin);
-
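The spinlock hunks above do two things: they drop the private xen_pvspin flag in favour of the generic nopvspin switch, and they move the per-CPU irq_name bookkeeping so the kasprintf() allocation is recorded before the IPI bind and freed unconditionally on teardown. A minimal sketch of that lifetime pattern, with illustrative names (xen_lock_name and the *_sketch helpers are not kernel symbols):

#include <linux/percpu.h>
#include <linux/slab.h>

static DEFINE_PER_CPU(char *, xen_lock_name);

static void lock_cpu_init_sketch(int cpu)
{
	char *name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);

	/* Record the allocation before the IPI bind can fail ... */
	per_cpu(xen_lock_name, cpu) = name;
	/* ... so an error from bind_ipi_to_irqhandler() no longer leaks it. */
}

static void lock_cpu_uninit_sketch(int cpu)
{
	/* Teardown frees unconditionally; kfree(NULL) is a no-op. */
	kfree(per_cpu(xen_lock_name, cpu));
	per_cpu(xen_lock_name, cpu) = NULL;
}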
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 1d83152c761b..ba2f17e64321 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -13,10 +13,9 @@
#include <asm/xen/hypercall.h>
#include <asm/xen/page.h>
#include <asm/fixmap.h>
+#include <asm/msr.h>
#include "xen-ops.h"
-#include "mmu.h"
-#include "pmu.h"
static DEFINE_PER_CPU(u64, spec_ctrl);
@@ -41,7 +40,7 @@ void xen_arch_post_suspend(int cancelled)
static void xen_vcpu_notify_restore(void *data)
{
if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL))
- wrmsrl(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl));
+ wrmsrq(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl));
/* Boot processor notified via generic timekeeping_resume() */
if (smp_processor_id() == 0)
@@ -57,9 +56,9 @@ static void xen_vcpu_notify_suspend(void *data)
tick_suspend_local();
if (xen_pv_domain() && boot_cpu_has(X86_FEATURE_SPEC_CTRL)) {
- rdmsrl(MSR_IA32_SPEC_CTRL, tmp);
+ rdmsrq(MSR_IA32_SPEC_CTRL, tmp);
this_cpu_write(spec_ctrl, tmp);
- wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+ wrmsrq(MSR_IA32_SPEC_CTRL, 0);
}
}
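The two notifier hunks above pair up: each vCPU saves its SPEC_CTRL value and clears the MSR on suspend, and writes the saved value back on resume, using the rdmsrq()/wrmsrq() names that replaced rdmsrl()/wrmsrl(). Condensed into one place the pattern looks roughly like this sketch; the per-CPU variable and function names are illustrative:

#include <linux/percpu.h>
#include <asm/msr.h>

static DEFINE_PER_CPU(u64, spec_ctrl_save);

static void spec_ctrl_suspend_sketch(void)	/* per vCPU, before going down */
{
	u64 val;

	rdmsrq(MSR_IA32_SPEC_CTRL, val);
	this_cpu_write(spec_ctrl_save, val);
	wrmsrq(MSR_IA32_SPEC_CTRL, 0);
}

static void spec_ctrl_resume_sketch(void)	/* per vCPU, after coming back */
{
	wrmsrq(MSR_IA32_SPEC_CTRL, this_cpu_read(spec_ctrl_save));
}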
diff --git a/arch/x86/xen/suspend_hvm.c b/arch/x86/xen/suspend_hvm.c
index 9d548b0c772f..0c4f7554b7cc 100644
--- a/arch/x86/xen/suspend_hvm.c
+++ b/arch/x86/xen/suspend_hvm.c
@@ -5,6 +5,7 @@
#include <xen/hvm.h>
#include <xen/features.h>
#include <xen/interface/features.h>
+#include <xen/events.h>
#include "xen-ops.h"
@@ -14,6 +15,13 @@ void xen_hvm_post_suspend(int suspend_cancelled)
xen_hvm_init_shared_info();
xen_vcpu_restore();
}
- xen_setup_callback_vector();
+ if (xen_percpu_upcall) {
+ unsigned int cpu;
+
+ for_each_online_cpu(cpu)
+ BUG_ON(xen_set_upcall_vector(cpu));
+ } else {
+ xen_setup_callback_vector();
+ }
xen_unplug_emulated_devices();
}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 9ef0a5cca96e..96521b1874ac 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -20,6 +20,7 @@
#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <asm/xen/cpuid.h>
#include <xen/events.h>
#include <xen/features.h>
@@ -29,7 +30,7 @@
#include "xen-ops.h"
/* Minimum amount of time until next clock event fires */
-#define TIMER_SLOP 100000
+#define TIMER_SLOP 1
static u64 xen_sched_clock_offset __read_mostly;
@@ -60,9 +61,16 @@ static u64 xen_clocksource_get_cycles(struct clocksource *cs)
return xen_clocksource_read();
}
-static u64 xen_sched_clock(void)
+static noinstr u64 xen_sched_clock(void)
{
- return xen_clocksource_read() - xen_sched_clock_offset;
+ struct pvclock_vcpu_time_info *src;
+ u64 ret;
+
+ src = &__this_cpu_read(xen_vcpu)->time;
+ ret = pvclock_clocksource_read_nowd(src);
+ ret -= xen_sched_clock_offset;
+
+ return ret;
}
static void xen_read_wallclock(struct timespec64 *ts)
@@ -474,15 +482,47 @@ static void xen_setup_vsyscall_time_info(void)
xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}
+/*
+ * Check if it is possible to safely use the tsc as a clocksource. This is
+ * only true if the hypervisor notifies the guest that its tsc is invariant,
+ * the tsc is stable, and the tsc instruction will never be emulated.
+ */
+static int __init xen_tsc_safe_clocksource(void)
+{
+ u32 eax, ebx, ecx, edx;
+
+ if (!(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)))
+ return 0;
+
+ if (!(boot_cpu_has(X86_FEATURE_NONSTOP_TSC)))
+ return 0;
+
+ if (check_tsc_unstable())
+ return 0;
+
+ /* Leaf 4, sub-leaf 0 (0x40000x03) */
+ cpuid_count(xen_cpuid_base() + 3, 0, &eax, &ebx, &ecx, &edx);
+
+ return ebx == XEN_CPUID_TSC_MODE_NEVER_EMULATE;
+}
+
static void __init xen_time_init(void)
{
struct pvclock_vcpu_time_info *pvti;
int cpu = smp_processor_id();
struct timespec64 tp;
- /* As Dom0 is never moved, no penalty on using TSC there */
+ /*
+ * As Dom0 is never moved, no penalty on using TSC there.
+ *
+ * If it is possible for the guest to determine that the tsc is a safe
+ * clocksource, then set xen_clocksource rating below that of the tsc
+ * so that the system prefers tsc instead.
+ */
if (xen_initial_domain())
xen_clocksource.rating = 275;
+ else if (xen_tsc_safe_clocksource())
+ xen_clocksource.rating = 299;
clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
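The rating changes above decide which clocksource the timekeeping core prefers; it simply picks the highest-rated one registered. From my reading of mainline (treat the exact numbers as assumptions): the TSC clocksource registers with rating 300 and xen_clocksource defaults to 400, so dropping the Xen rating to 299 (guests where xen_tsc_safe_clocksource() is true) or 275 (Dom0, which never migrates) puts it below the TSC and flips the preference:

/*
 * clocksource      rating   preferred in a Xen guest?
 * ---------------  ------   ------------------------------------------
 * xen (default)     400     yes, the TSC may be emulated or unstable
 * tsc               300     yes, once xen is rated 299 or 275
 * xen (safe TSC)    299     no, invariant/non-emulated TSC wins
 * xen (Dom0)        275     no, Dom0 always prefers the TSC
 */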
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
index 14ea32e734d5..f7547807b0bd 100644
--- a/arch/x86/xen/vga.c
+++ b/arch/x86/xen/vga.c
@@ -2,17 +2,15 @@
#include <linux/screen_info.h>
#include <linux/init.h>
-#include <asm/bootparam.h>
#include <asm/setup.h>
#include <xen/interface/xen.h>
#include "xen-ops.h"
-void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size,
+ struct screen_info *screen_info)
{
- struct screen_info *screen_info = &boot_params.screen_info;
-
/* This is drawn from a dump from vgacon:startup in
* standard Linux. */
screen_info->orig_video_mode = 3;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 6b4fdf6b9542..461bb1526502 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -20,15 +20,39 @@
#include <linux/init.h>
#include <linux/linkage.h>
+#include <linux/objtool.h>
#include <../entry/calling.h>
.pushsection .noinstr.text, "ax"
/*
+ * PV hypercall interface to the hypervisor.
+ *
+ * Called via inline asm(), so better preserve %rcx and %r11.
+ *
+ * Input:
+ * %eax: hypercall number
+ * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall
+ * Output: %rax
+ */
+SYM_FUNC_START(xen_hypercall_pv)
+ ANNOTATE_NOENDBR
+ push %rcx
+ push %r11
+ UNWIND_HINT_SAVE
+ syscall
+ UNWIND_HINT_RESTORE
+ pop %r11
+ pop %rcx
+ RET
+SYM_FUNC_END(xen_hypercall_pv)
+
+/*
* Disabling events is simply a matter of making the event mask
* non-zero.
*/
SYM_FUNC_START(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ ENDBR
+ movb $1, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
RET
SYM_FUNC_END(xen_irq_disable_direct)
@@ -67,9 +91,10 @@ SYM_FUNC_END(check_events)
* then enter the hypervisor to get them handled.
*/
SYM_FUNC_START(xen_irq_enable_direct)
+ ENDBR
FRAME_BEGIN
/* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ movb $0, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
/*
* Preempt here doesn't matter because that will deal with any
@@ -78,7 +103,7 @@ SYM_FUNC_START(xen_irq_enable_direct)
*/
/* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_pending)
jz 1f
call check_events
@@ -97,13 +122,15 @@ SYM_FUNC_END(xen_irq_enable_direct)
* x86 use opposite senses (mask vs enable).
*/
SYM_FUNC_START(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_mask
+ ENDBR
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_mask)
setz %ah
addb %ah, %ah
RET
SYM_FUNC_END(xen_save_fl_direct)
SYM_FUNC_START(xen_read_cr2)
+ ENDBR
FRAME_BEGIN
_ASM_MOV PER_CPU_VAR(xen_vcpu), %_ASM_AX
_ASM_MOV XEN_vcpu_info_arch_cr2(%_ASM_AX), %_ASM_AX
@@ -112,8 +139,9 @@ SYM_FUNC_START(xen_read_cr2)
SYM_FUNC_END(xen_read_cr2);
SYM_FUNC_START(xen_read_cr2_direct)
+ ENDBR
FRAME_BEGIN
- _ASM_MOV PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_arch_cr2, %_ASM_AX
+ _ASM_MOV PER_CPU_VAR(xen_vcpu_info + XEN_vcpu_info_arch_cr2), %_ASM_AX
FRAME_END
RET
SYM_FUNC_END(xen_read_cr2_direct);
@@ -148,7 +176,7 @@ xen_pv_trap asm_exc_page_fault
xen_pv_trap asm_exc_spurious_interrupt_bug
xen_pv_trap asm_exc_coprocessor_error
xen_pv_trap asm_exc_alignment_check
-#ifdef CONFIG_X86_KERNEL_IBT
+#ifdef CONFIG_X86_CET
xen_pv_trap asm_exc_control_protection
#endif
#ifdef CONFIG_X86_MCE
@@ -156,7 +184,7 @@ xen_pv_trap asm_xenpv_exc_machine_check
#endif /* CONFIG_X86_MCE */
xen_pv_trap asm_exc_simd_coprocessor_error
#ifdef CONFIG_IA32_EMULATION
-xen_pv_trap entry_INT80_compat
+xen_pv_trap asm_int80_emulation
#endif
xen_pv_trap asm_exc_xen_unknown_trap
xen_pv_trap asm_exc_xen_hypervisor_callback
@@ -165,7 +193,7 @@ xen_pv_trap asm_exc_xen_hypervisor_callback
SYM_CODE_START(xen_early_idt_handler_array)
i = 0
.rept NUM_EXCEPTION_VECTORS
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ENDBR
pop %rcx
pop %r11
@@ -176,7 +204,6 @@ SYM_CODE_START(xen_early_idt_handler_array)
SYM_CODE_END(xen_early_idt_handler_array)
__FINIT
-hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
* Xen64 iret frame:
*
@@ -186,17 +213,26 @@ hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
* cs
* rip <-- standard iret frame
*
- * flags
+ * flags <-- xen_iret must push from here on
*
- * rcx }
- * r11 }<-- pushed by hypercall page
- * rsp->rax }
+ * rcx
+ * r11
+ * rsp->rax
*/
+.macro xen_hypercall_iret
+ pushq $0 /* Flags */
+ push %rcx
+ push %r11
+ push %rax
+ mov $__HYPERVISOR_iret, %eax
+ syscall /* Do the IRET. */
+ ud2 /* The SYSCALL should never return. */
+.endm
+
SYM_CODE_START(xen_iret)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_UNDEFINED
ANNOTATE_NOENDBR
- pushq $0
- jmp hypercall_iret
+ xen_hypercall_iret
SYM_CODE_END(xen_iret)
/*
@@ -262,10 +298,10 @@ SYM_CODE_START(xen_entry_SYSCALL_compat)
/*
* Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * CS were. The kernel expects __USER_DS and __USER32_CS, so
* report those values even though Xen will guess its own values.
*/
- movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER_DS, 4*8(%rsp)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSCALL_compat_after_hwframe
@@ -284,10 +320,10 @@ SYM_CODE_START(xen_entry_SYSENTER_compat)
/*
* Neither Xen nor the kernel really knows what the old SS and
- * CS were. The kernel expects __USER32_DS and __USER32_CS, so
+ * CS were. The kernel expects __USER_DS and __USER32_CS, so
* report those values even though Xen will guess its own values.
*/
- movq $__USER32_DS, 4*8(%rsp)
+ movq $__USER_DS, 4*8(%rsp)
movq $__USER32_CS, 1*8(%rsp)
jmp entry_SYSENTER_compat_after_hwframe
@@ -301,8 +337,7 @@ SYM_CODE_START(xen_entry_SYSENTER_compat)
ENDBR
lea 16(%rsp), %rsp /* strip %rcx, %r11 */
mov $-ENOSYS, %rax
- pushq $0
- jmp hypercall_iret
+ xen_hypercall_iret
SYM_CODE_END(xen_entry_SYSENTER_compat)
SYM_CODE_END(xen_entry_SYSCALL_compat)
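xen_hypercall_pv above replaces the old hypercall page for PV guests: the hypercall number goes in %eax, up to five arguments in %rdi/%rsi/%rdx/%r10/%r8, the result comes back in %rax, and the stub itself preserves %rcx and %r11 because callers reach it from inline asm(). A hedged two-argument sketch of such a caller follows; the kernel's real wrappers are the macros in arch/x86/include/asm/xen/hypercall.h, and this helper name is made up:

static inline long xen_hypercall2_sketch(unsigned int op,
					 unsigned long a1, unsigned long a2)
{
	long ret;
	register unsigned long _a1 asm("rdi") = a1;
	register unsigned long _a2 asm("rsi") = a2;

	asm volatile("call xen_hypercall_pv"
		     : "=a" (ret), "+r" (_a1), "+r" (_a2)
		     : "0" ((unsigned long)op)	/* hypercall number in %rax */
		     : "memory");
	return ret;
}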
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index ffaa62167f6e..5dad6c51cdc3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -6,9 +6,11 @@
#include <linux/elfnote.h>
#include <linux/init.h>
+#include <linux/instrumentation.h>
#include <asm/boot.h>
#include <asm/asm.h>
+#include <asm/frame.h>
#include <asm/msr.h>
#include <asm/page_types.h>
#include <asm/percpu.h>
@@ -20,47 +22,23 @@
#include <xen/interface/xen-mca.h>
#include <asm/xen/interface.h>
-.pushsection .noinstr.text, "ax"
- .balign PAGE_SIZE
-SYM_CODE_START(hypercall_page)
- .rept (PAGE_SIZE / 32)
- UNWIND_HINT_FUNC
- ANNOTATE_NOENDBR
- ANNOTATE_UNRET_SAFE
- ret
- /*
- * Xen will write the hypercall page, and sort out ENDBR.
- */
- .skip 31, 0xcc
- .endr
-
-#define HYPERCALL(n) \
- .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
- .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
-#include <asm/xen-hypercalls.h>
-#undef HYPERCALL
-SYM_CODE_END(hypercall_page)
-.popsection
-
#ifdef CONFIG_XEN_PV
__INIT
SYM_CODE_START(startup_xen)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR
cld
- mov initial_stack(%rip), %rsp
+ leaq __top_init_kernel_stack(%rip), %rsp
- /* Set up %gs.
- *
- * The base of %gs always points to fixed_percpu_data. If the
- * stack protector canary is enabled, it is located at %gs:40.
+ /*
+ * Set up GSBASE.
* Note that, on SMP, the boot cpu uses init data section until
* the per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax
- cdq
+ xorl %eax, %eax
+ xorl %edx, %edx
wrmsr
mov %rsi, %rdi
@@ -71,42 +49,132 @@ SYM_CODE_END(startup_xen)
#ifdef CONFIG_XEN_PV_SMP
.pushsection .text
SYM_CODE_START(asm_cpu_bringup_and_idle)
- UNWIND_HINT_EMPTY
+ UNWIND_HINT_END_OF_STACK
ENDBR
call cpu_bringup_and_idle
SYM_CODE_END(asm_cpu_bringup_and_idle)
+
+SYM_CODE_START(xen_cpu_bringup_again)
+ UNWIND_HINT_FUNC
+ mov %rdi, %rsp
+ UNWIND_HINT_REGS
+ call cpu_bringup_and_idle
+SYM_CODE_END(xen_cpu_bringup_again)
.popsection
#endif
#endif
+ .pushsection .noinstr.text, "ax"
+/*
+ * Xen hypercall interface to the hypervisor.
+ *
+ * Input:
+ * %eax: hypercall number
+ * 32-bit:
+ * %ebx, %ecx, %edx, %esi, %edi: args 1..5 for the hypercall
+ * 64-bit:
+ * %rdi, %rsi, %rdx, %r10, %r8: args 1..5 for the hypercall
+ * Output: %[er]ax
+ */
+SYM_FUNC_START(xen_hypercall_hvm)
+ ENDBR
+ FRAME_BEGIN
+ /* Save all relevant registers (caller save and arguments). */
+#ifdef CONFIG_X86_32
+ push %eax
+ push %ebx
+ push %ecx
+ push %edx
+ push %esi
+ push %edi
+#else
+ push %rax
+ push %rcx
+ push %rdx
+ push %rdi
+ push %rsi
+ push %r11
+ push %r10
+ push %r9
+ push %r8
+#endif
+ /* Set the vendor specific function. */
+ call __xen_hypercall_setfunc
+ /* Set ZF = 1 if AMD, restore saved registers. */
+#ifdef CONFIG_X86_32
+ lea xen_hypercall_amd, %ebx
+ cmp %eax, %ebx
+ pop %edi
+ pop %esi
+ pop %edx
+ pop %ecx
+ pop %ebx
+ pop %eax
+#else
+ lea xen_hypercall_amd(%rip), %rcx
+ cmp %rax, %rcx
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+ pop %rsi
+ pop %rdi
+ pop %rdx
+ pop %rcx
+ pop %rax
+#endif
+ FRAME_END
+ /* Use correct hypercall function. */
+ jz xen_hypercall_amd
+ jmp xen_hypercall_intel
+SYM_FUNC_END(xen_hypercall_hvm)
+
+SYM_FUNC_START(xen_hypercall_amd)
+ ANNOTATE_NOENDBR
+ vmmcall
+ RET
+SYM_FUNC_END(xen_hypercall_amd)
+
+SYM_FUNC_START(xen_hypercall_intel)
+ ANNOTATE_NOENDBR
+ vmcall
+ RET
+SYM_FUNC_END(xen_hypercall_intel)
+ .popsection
+
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
-#ifdef CONFIG_X86_32
- ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
-#else
+#ifdef CONFIG_XEN_PV
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
/* Map the p2m table to a 512GB-aligned user address. */
ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD))
-#endif
-#ifdef CONFIG_XEN_PV
- ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
-#endif
- ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
- ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,
- .ascii "!writable_page_tables|pae_pgdir_above_4gb")
- ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
- .long (1 << XENFEAT_writable_page_tables) | \
- (1 << XENFEAT_dom0) | \
- (1 << XENFEAT_linux_rsdp_unrestricted))
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .globl xen_elfnote_entry;
+ xen_elfnote_entry: _ASM_PTR xen_elfnote_entry_value - .)
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
- ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
- ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1)
- ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
+# define FEATURES_PV (1 << XENFEAT_writable_page_tables)
+#else
+# define FEATURES_PV 0
+#endif
+#ifdef CONFIG_XEN_PVH
+# define FEATURES_PVH (1 << XENFEAT_linux_rsdp_unrestricted)
+#else
+# define FEATURES_PVH 0
+#endif
+#ifdef CONFIG_XEN_DOM0
+# define FEATURES_DOM0 (1 << XENFEAT_dom0)
+#else
+# define FEATURES_DOM0 0
+#endif
+ ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
+ .long FEATURES_PV | FEATURES_PVH | FEATURES_DOM0)
+ ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
+ ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
#endif /*CONFIG_XEN */
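The reworked SUPPORTED_FEATURES note above only advertises the bits the configured guest modes can actually honour. As a worked illustration, grounded in the removed unconditional note: with CONFIG_XEN_PV, CONFIG_XEN_PVH and CONFIG_XEN_DOM0 all enabled the mask expands to the same value as before, while a PVH-only or non-Dom0 build now drops the bits it cannot support (the macro name here is illustrative):

/* With CONFIG_XEN_PV, CONFIG_XEN_PVH and CONFIG_XEN_DOM0 all set: */
#define SUPPORTED_FEATURES_SKETCH			\
	((1 << XENFEAT_writable_page_tables) |		\
	 (1 << XENFEAT_linux_rsdp_unrestricted) |	\
	 (1 << XENFEAT_dom0))
/* ... which matches the mask the removed unconditional note advertised. */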
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 9a8bb972193d..090349baec09 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -5,8 +5,15 @@
#include <linux/init.h>
#include <linux/clocksource.h>
#include <linux/irqreturn.h>
+#include <linux/linkage.h>
+
+#include <xen/interface/xenpmu.h>
#include <xen/xen-ops.h>
+#include <asm/page.h>
+
+#include <trace/events/xen.h>
+
/* These are code, but not functions. Defined in entry.S */
extern const char xen_failsafe_callback[];
@@ -21,16 +28,13 @@ extern void *xen_initial_gdt;
struct trap_info;
void xen_copy_trap_info(struct trap_info *traps);
-DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info);
DECLARE_PER_CPU(unsigned long, xen_cr3);
-DECLARE_PER_CPU(unsigned long, xen_current_cr3);
extern struct start_info *xen_start_info;
extern struct shared_info xen_dummy_shared_info;
extern struct shared_info *HYPERVISOR_shared_info;
-extern bool xen_fifo_events;
-
void xen_setup_mfn_list_list(void);
void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
@@ -43,8 +47,12 @@ void xen_mm_unpin_all(void);
#ifdef CONFIG_X86_64
void __init xen_relocate_p2m(void);
#endif
+void __init xen_do_remap_nonram(void);
+void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr,
+ unsigned long size);
-bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size);
+void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size,
+ const char *component);
unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
void __init xen_inv_extra_mem(void);
void __init xen_remap_memory(void);
@@ -72,8 +80,6 @@ void xen_restore_time_memory_area(void);
void xen_init_time_ops(void);
void xen_hvm_init_time_ops(void);
-irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
-
bool xen_vcpu_stolen(int vcpu);
void xen_vcpu_setup(int cpu);
@@ -108,11 +114,12 @@ static inline void xen_uninit_lock_cpu(int cpu)
struct dom0_vga_console_info;
-#ifdef CONFIG_XEN_PV_DOM0
-void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size,
+ struct screen_info *);
#else
static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
- size_t size)
+ size_t size, struct screen_info *si)
{
}
#endif
@@ -147,9 +154,12 @@ int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int),
void xen_pin_vcpu(int cpu);
void xen_emergency_restart(void);
+void xen_force_evtchn_callback(void);
+
#ifdef CONFIG_XEN_PV
void xen_pv_pre_suspend(void);
void xen_pv_post_suspend(int suspend_cancelled);
+void xen_start_kernel(struct start_info *si);
#else
static inline void xen_pv_pre_suspend(void) {}
static inline void xen_pv_post_suspend(int suspend_cancelled) {}
@@ -161,4 +171,164 @@ void xen_hvm_post_suspend(int suspend_cancelled);
static inline void xen_hvm_post_suspend(int suspend_cancelled) {}
#endif
+/*
+ * The maximum amount of extra memory compared to the base size. The
+ * main scaling factor is the size of struct page. At extreme ratios
+ * of base:extra, all the base memory can be filled with page
+ * structures for the extra memory, leaving no space for anything
+ * else.
+ *
+ * 10x seems like a reasonable balance between scaling flexibility and
+ * leaving a practically usable system.
+ */
+#define EXTRA_MEM_RATIO (10)
+
+void xen_add_extra_mem(unsigned long start_pfn, unsigned long n_pfns);
+
+struct dentry * __init xen_init_debugfs(void);
+
+enum pt_level {
+ PT_PGD,
+ PT_P4D,
+ PT_PUD,
+ PT_PMD,
+ PT_PTE
+};
+
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+unsigned long xen_read_cr2_direct(void);
+void xen_init_mmu_ops(void);
+void xen_hvm_init_mmu_ops(void);
+
+/* Multicalls */
+struct multicall_space
+{
+ struct multicall_entry *mc;
+ void *args;
+};
+
+/* Allocate room for a multicall and its args */
+struct multicall_space __xen_mc_entry(size_t args);
+
+DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
+
+/* Call to start a batch of multiple __xen_mc_entry()s. Must be
+ paired with xen_mc_issue() */
+static inline void xen_mc_batch(void)
+{
+ unsigned long flags;
+
+ /* need to disable interrupts until this entry is complete */
+ local_irq_save(flags);
+ trace_xen_mc_batch(xen_get_lazy_mode());
+ __this_cpu_write(xen_mc_irq_flags, flags);
+}
+
+static inline struct multicall_space xen_mc_entry(size_t args)
+{
+ xen_mc_batch();
+ return __xen_mc_entry(args);
+}
+
+/* Flush all pending multicalls */
+void xen_mc_flush(void);
+
+/* Issue a multicall if we're not in a lazy mode */
+static inline void xen_mc_issue(unsigned mode)
+{
+ trace_xen_mc_issue(mode);
+
+ if ((xen_get_lazy_mode() & mode) == 0)
+ xen_mc_flush();
+
+ /* restore flags saved in xen_mc_batch */
+ local_irq_restore(this_cpu_read(xen_mc_irq_flags));
+}
+
+/* Set up a callback to be called when the current batch is flushed */
+void xen_mc_callback(void (*fn)(void *), void *data);
+
+/*
+ * Try to extend the arguments of the previous multicall command. The
+ * previous command's op must match. If it does, then it attempts to
+ * extend the argument space allocated to the multicall entry by
+ * arg_size bytes.
+ *
+ * On success, the returned multicall_space has mc pointing to the extended
+ * command and args pointing to the newly allocated argument space; on
+ * failure, mc is NULL.
+ */
+struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size);
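The batching helpers declared above expect the batch/entry/issue pairing spelled out in their comments. A hedged usage sketch follows, modelled on how mmu_pv.c drives the API; the MMU-update payload is illustrative rather than taken from a specific call site, and XEN_LAZY_MMU is assumed to be the current name of the lazy MMU mode:

#include <linux/string.h>
#include <xen/interface/xen.h>
#include <asm/xen/hypercall.h>

static void mc_usage_sketch(struct mmu_update *req)
{
	struct multicall_space mcs;

	xen_mc_batch();				/* open batch, interrupts off */

	mcs = __xen_mc_entry(sizeof(*req));	/* slot plus room for one request */
	memcpy(mcs.args, req, sizeof(*req));
	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);

	xen_mc_issue(XEN_LAZY_MMU);		/* flush now unless lazy MMU mode is active */
}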
+
+extern bool is_xen_pmu;
+
+irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id);
+#ifdef CONFIG_XEN_HAVE_VPMU
+void xen_pmu_init(int cpu);
+void xen_pmu_finish(int cpu);
+#else
+static inline void xen_pmu_init(int cpu) {}
+static inline void xen_pmu_finish(int cpu) {}
+#endif
+bool pmu_msr_chk_emulated(u32 msr, u64 *val, bool is_read);
+int pmu_apic_update(uint32_t reg);
+u64 xen_read_pmc(int counter);
+
+#ifdef CONFIG_SMP
+
+void asm_cpu_bringup_and_idle(void);
+asmlinkage void cpu_bringup_and_idle(void);
+
+extern void xen_send_IPI_mask(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
+ int vector);
+extern void xen_send_IPI_allbutself(int vector);
+extern void xen_send_IPI_all(int vector);
+extern void xen_send_IPI_self(int vector);
+
+extern int xen_smp_intr_init(unsigned int cpu);
+extern void xen_smp_intr_free(unsigned int cpu);
+int xen_smp_intr_init_pv(unsigned int cpu);
+void xen_smp_intr_free_pv(unsigned int cpu);
+
+void xen_smp_count_cpus(void);
+void xen_smp_cpus_done(unsigned int max_cpus);
+
+void xen_smp_send_reschedule(int cpu);
+void xen_smp_send_call_function_ipi(const struct cpumask *mask);
+void xen_smp_send_call_function_single_ipi(int cpu);
+
+void __noreturn xen_cpu_bringup_again(unsigned long stack);
+
+struct xen_common_irq {
+ int irq;
+ char *name;
+};
+#else /* CONFIG_SMP */
+
+static inline int xen_smp_intr_init(unsigned int cpu)
+{
+ return 0;
+}
+static inline void xen_smp_intr_free(unsigned int cpu) {}
+
+static inline int xen_smp_intr_init_pv(unsigned int cpu)
+{
+ return 0;
+}
+static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
+static inline void xen_smp_count_cpus(void) { }
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_XEN_PV
+void xen_hypercall_pv(void);
+#endif
+void xen_hypercall_hvm(void);
+void xen_hypercall_amd(void);
+void xen_hypercall_intel(void);
+void xen_hypercall_setfunc(void);
+void *__xen_hypercall_setfunc(void);
+
#endif /* XEN_OPS_H */