From 51c71a3bbaca868043cc45b3ad3786dd48a90235 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 26 Nov 2013 15:05:40 -0500 Subject: xen/pvhvm: If xen_platform_pci=0 is set don't blow up (v4). The user has the option of disabling the platform driver: 00:02.0 Unassigned class [ff80]: XenSource, Inc. Xen Platform Device (rev 01) which is used to unplug the emulated drivers (IDE, Realtek 8169, etc) and allow the PV drivers to take over. If the user wishes to disable that they can set: xen_platform_pci=0 (in the guest config file) or xen_emul_unplug=never (on the Linux command line) except it does not work properly. The PV drivers still try to load and since the Xen platform driver is not run - and it has not initialized the grant tables, most of the PV drivers stumble upon: input: Xen Virtual Keyboard as /devices/virtual/input/input5 input: Xen Virtual Pointer as /devices/virtual/input/input6M ------------[ cut here ]------------ kernel BUG at /home/konrad/ssd/konrad/linux/drivers/xen/grant-table.c:1206! invalid opcode: 0000 [#1] SMP Modules linked in: xen_kbdfront(+) xenfs xen_privcmd CPU: 6 PID: 1389 Comm: modprobe Not tainted 3.13.0-rc1upstream-00021-ga6c892b-dirty #1 Hardware name: Xen HVM domU, BIOS 4.4-unstable 11/26/2013 RIP: 0010:[] [] get_free_entries+0x2e0/0x300 Call Trace: [] ? evdev_connect+0x1e3/0x240 [] gnttab_grant_foreign_access+0x2e/0x70 [] xenkbd_connect_backend+0x41/0x290 [xen_kbdfront] [] xenkbd_probe+0x2f2/0x324 [xen_kbdfront] [] xenbus_dev_probe+0x77/0x130 [] xenbus_frontend_dev_probe+0x47/0x50 [] driver_probe_device+0x89/0x230 [] __driver_attach+0x9b/0xa0 [] ? driver_probe_device+0x230/0x230 [] ? driver_probe_device+0x230/0x230 [] bus_for_each_dev+0x8c/0xb0 [] driver_attach+0x19/0x20 [] bus_add_driver+0x1a0/0x220 [] driver_register+0x5f/0xf0 [] xenbus_register_driver_common+0x15/0x20 [] xenbus_register_frontend+0x23/0x40 [] ? 0xffffffffa0014fff [] xenkbd_init+0x2b/0x1000 [xen_kbdfront] [] do_one_initcall+0x49/0x170 .. snip.. which is hardly nice. This patch fixes this by having each PV driver check for: - if running in PV, then it is fine to execute (as that is their native environment). - if running in HVM, check if user wanted 'xen_emul_unplug=never', in which case bail out and don't load any PV drivers. - if running in HVM, and if PCI device 5853:0001 (xen_platform_pci) does not exist, then bail out and not load PV drivers. - (v2) if running in HVM, and if the user wanted 'xen_emul_unplug=ide-disks', then bail out for all PV devices _except_ the block one. Ditto for the network one ('nics'). - (v2) if running in HVM, and if the user wanted 'xen_emul_unplug=unnecessary' then load block PV driver, and also setup the legacy IDE paths. In (v3) make it actually load PV drivers. Reported-by: Sander Eikelenboom Reported-and-Tested-by: Fabio Fantoni Signed-off-by: Konrad Rzeszutek Wilk [v2: Add extra logic to handle the myrid ways 'xen_emul_unplug' can be used per Ian and Stefano suggestion] [v3: Make the unnecessary case work properly] [v4: s/disks/ide-disks/ spotted by Fabio] Reviewed-by: Stefano Stabellini Acked-by: Bjorn Helgaas [for PCI parts] CC: stable@vger.kernel.org --- arch/x86/xen/platform-pci-unplug.c | 74 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 0a7852483ffe..ab84ac198a9a 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -69,6 +69,80 @@ static int check_platform_magic(void) return 0; } +bool xen_has_pv_devices() +{ + if (!xen_domain()) + return false; + + /* PV domains always have them. */ + if (xen_pv_domain()) + return true; + + /* And user has xen_platform_pci=0 set in guest config as + * driver did not modify the value. */ + if (xen_platform_pci_unplug == 0) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_ALL) + return true; + + /* This is an odd one - we are going to run legacy + * and PV drivers at the same time. */ + if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) + return true; + + /* And the caller has to follow with xen_pv_{disk,nic}_devices + * to be certain which driver can load. */ + return false; +} +EXPORT_SYMBOL_GPL(xen_has_pv_devices); + +static bool __xen_has_pv_device(int state) +{ + /* HVM domains might or might not */ + if (xen_hvm_domain() && (xen_platform_pci_unplug & state)) + return true; + + return xen_has_pv_devices(); +} + +bool xen_has_pv_nic_devices(void) +{ + return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL); +} +EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices); + +bool xen_has_pv_disk_devices(void) +{ + return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS | + XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL); +} +EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices); + +/* + * This one is odd - it determines whether you want to run PV _and_ + * legacy (IDE) drivers together. This combination is only possible + * under HVM. + */ +bool xen_has_pv_and_legacy_disk_devices(void) +{ + if (!xen_domain()) + return false; + + /* N.B. This is only ever used in HVM mode */ + if (xen_pv_domain()) + return false; + + if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices); + void xen_unplug_emulated_devices(void) { int r; -- cgit v1.2.3-59-g8ed1b From 6f6c15ef912465b3aaafe709f39bd6026a8b3e72 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 11 Dec 2013 15:22:01 -0500 Subject: xen/pvhvm: Remove the xen_platform_pci int. Since we have xen_has_pv_devices,xen_has_pv_disk_devices, xen_has_pv_nic_devices, and xen_has_pv_and_legacy_disk_devices to figure out the different 'unplug' behaviors - lets use those instead of this single int. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/platform-pci-unplug.c | 5 ++--- include/xen/platform_pci.h | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index ab84ac198a9a..a8261716d58d 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -30,10 +30,9 @@ #define XEN_PLATFORM_ERR_PROTOCOL -2 #define XEN_PLATFORM_ERR_BLACKLIST -3 -/* store the value of xen_emul_unplug after the unplug is done */ -int xen_platform_pci_unplug; -EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); #ifdef CONFIG_XEN_PVHVM +/* store the value of xen_emul_unplug after the unplug is done */ +static int xen_platform_pci_unplug; static int xen_emul_unplug; static int check_platform_magic(void) diff --git a/include/xen/platform_pci.h b/include/xen/platform_pci.h index b49eeab0262e..5c52b5583917 100644 --- a/include/xen/platform_pci.h +++ b/include/xen/platform_pci.h @@ -46,8 +46,6 @@ static inline int xen_must_unplug_disks(void) { #endif } -extern int xen_platform_pci_unplug; - #if defined(CONFIG_XEN_PVHVM) extern bool xen_has_pv_devices(void); extern bool xen_has_pv_disk_devices(void); -- cgit v1.2.3-59-g8ed1b From 8785c67663b6ba023c7c6d61d37d8e08c00d86a8 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 23 Sep 2013 12:52:21 +0100 Subject: xen/x86: set VIRQ_TIMER priority to maximum Commit bee980d9e (xen/events: Handle VIRQ_TIMER before any other hardirq in event loop) effectively made the VIRQ_TIMER the highest priority event when using the 2-level ABI. Set the VIRQ_TIMER priority to the highest so this behaviour is retained when using the FIFO-based ABI. Signed-off-by: David Vrabel Reviewed-by: Konrad Rzeszutek Wilk Reviewed-by: Boris Ostrovsky --- arch/x86/xen/time.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 12a1ca707b94..7b78f88c1707 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -446,6 +446,7 @@ void xen_setup_timer(int cpu) IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER| IRQF_FORCE_RESUME, name, NULL); + (void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX); memcpy(evt, xen_clockevent, sizeof(*evt)); -- cgit v1.2.3-59-g8ed1b From fc590efe667338f7da250cf2d2eb6fdd486e1b97 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 12:09:28 -0500 Subject: xen/p2m: Check for auto-xlat when doing mfn_to_local_pfn. Most of the functions in page.h are prefaced with if (xen_feature(XENFEAT_auto_translated_physmap)) return mfn; Except the mfn_to_local_pfn. At a first sight, the function should work without this patch - as the 'mfn_to_mfn' has a similar check. But there are no such check in the 'get_phys_to_machine' function - so we would crash in there. This fixes it by following the convention of having the check for auto-xlat in these static functions. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/include/asm/xen/page.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index b913915e8e63..4a092ccdd147 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -167,7 +167,12 @@ static inline xpaddr_t machine_to_phys(xmaddr_t machine) */ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) { - unsigned long pfn = mfn_to_pfn(mfn); + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + + pfn = mfn_to_pfn(mfn); if (get_phys_to_machine(pfn) != mfn) return -1; /* force !pfn_valid() */ return pfn; -- cgit v1.2.3-59-g8ed1b From ddc416cbc4e34f52bebca027de1c099bd30795f8 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 12:39:56 -0500 Subject: xen/pvh/x86: Define what an PVH guest is (v3). Which is a PV guest with auto page translation enabled and with vector callback. It is a cross between PVHVM and PV. The Xen side defines PVH as (from docs/misc/pvh-readme.txt, with modifications): "* the guest uses auto translate: - p2m is managed by Xen - pagetables are owned by the guest - mmu_update hypercall not available * it uses event callback and not vlapic emulation, * IDT is native, so set_trap_table hcall is also N/A for a PVH guest. For a full list of hcalls supported for PVH, see pvh_hypercall64_table in arch/x86/hvm/hvm.c in xen. From the ABI prespective, it's mostly a PV guest with auto translate, although it does use hvm_op for setting callback vector." Also we use the PV cpuid, albeit we can use the HVM (native) cpuid. However, we do have a fair bit of filtering in the xen_cpuid and we can piggyback on that until the hypervisor/toolstack filters the appropiate cpuids. Once that is done we can swap over to use the native one. We setup a Kconfig entry that is disabled by default and cannot be enabled. Note that on ARM the concept of PVH is non-existent. As Ian put it: "an ARM guest is neither PV nor HVM nor PVHVM. It's a bit like PVH but is different also (it's further towards the H end of the spectrum than even PVH).". As such these options (PVHVM, PVH) are never enabled nor seen on ARM compilations. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 5 +++++ include/xen/xen.h | 14 ++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 1a3c76505649..e7d05900efac 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -51,3 +51,8 @@ config XEN_DEBUG_FS Enable statistics output and various tuning options in debugfs. Enabling this option may incur a significant performance overhead. +config XEN_PVH + bool "Support for running as a PVH guest" + depends on X86_64 && XEN && BROKEN + select XEN_PVHVM + def_bool n diff --git a/include/xen/xen.h b/include/xen/xen.h index a74d4362c4f8..0c0e3ef4c45d 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -29,4 +29,18 @@ extern enum xen_domain_type xen_domain_type; #define xen_initial_domain() (0) #endif /* CONFIG_XEN_DOM0 */ +#ifdef CONFIG_XEN_PVH +/* This functionality exists only for x86. The XEN_PVHVM support exists + * only in x86 world - hence on ARM it will be always disabled. + * N.B. ARM guests are neither PV nor HVM nor PVHVM. + * It's a bit like PVH but is different also (it's further towards the H + * end of the spectrum than even PVH). + */ +#include +#define xen_pvh_domain() (xen_pv_domain() && \ + xen_feature(XENFEAT_auto_translated_physmap) && \ + xen_have_vector_callback) +#else +#define xen_pvh_domain() (0) +#endif #endif /* _XEN_XEN_H */ -- cgit v1.2.3-59-g8ed1b From d285d68314af49c4456b71d248e355dd33ae375c Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 12:45:31 -0500 Subject: xen/pvh: Early bootup changes in PV code (v4). We don't use the filtering that 'xen_cpuid' is doing because the hypervisor treats 'XEN_EMULATE_PREFIX' as an invalid instruction. This means that all of the filtering will have to be done in the hypervisor/toolstack. Without the filtering we expose to the guest the: - cpu topology (sockets, cores, etc); - the APERF (which the generic scheduler likes to use), see 5e626254206a709c6e937f3dda69bf26c7344f6f "xen/setup: filter APERFMPERF cpuid feature out" - and the inability to figure out whether MWAIT_LEAF should be exposed or not. See df88b2d96e36d9a9e325bfcd12eb45671cbbc937 "xen/enlighten: Disable MWAIT_LEAF so that acpi-pad won't be loaded." - x2apic, see 4ea9b9aca90cfc71e6872ed3522356755162932c "xen: mask x2APIC feature in PV" We also check for vector callback early on, as it is a required feature. PVH also runs at default kernel IOPL. Finally, pure PV settings are moved to a separate function that are only called for pure PV, ie, pv with pvmmu. They are also #ifdef with CONFIG_XEN_PVMMU. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/enlighten.c | 48 ++++++++++++++++++++++++++++++++++-------------- arch/x86/xen/setup.c | 18 ++++++++++++------ 2 files changed, 46 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index fa6ade76ef3f..eb0efc2f9d3c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -262,8 +263,9 @@ static void __init xen_banner(void) struct xen_extraversion extra; HYPERVISOR_xen_version(XENVER_extraversion, &extra); - printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - pv_info.name); + pr_info("Booting paravirtualized kernel %son %s\n", + xen_feature(XENFEAT_auto_translated_physmap) ? + "with PVH extensions " : "", pv_info.name); printk(KERN_INFO "Xen version: %d.%d%s%s\n", version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); @@ -433,7 +435,7 @@ static void __init xen_init_cpuid_mask(void) ax = 1; cx = 0; - xen_cpuid(&ax, &bx, &cx, &dx); + cpuid(1, &ax, &bx, &cx, &dx); xsave_mask = (1 << (X86_FEATURE_XSAVE % 32)) | @@ -1420,6 +1422,19 @@ static void __init xen_setup_stackprotector(void) pv_cpu_ops.load_gdt = xen_load_gdt; } +static void __init xen_pvh_early_guest_init(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + return; + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + +#ifdef CONFIG_X86_32 + BUG(); /* PVH: Implement proper support. */ +#endif +} + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1431,13 +1446,16 @@ asmlinkage void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; + xen_setup_features(); + xen_pvh_early_guest_init(); xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - pv_cpu_ops = xen_cpu_ops; pv_apic_ops = xen_apic_ops; + if (!xen_pvh_domain()) + pv_cpu_ops = xen_cpu_ops; x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; @@ -1469,8 +1487,6 @@ asmlinkage void __init xen_start_kernel(void) /* Work out if we support NX */ x86_configure_nx(); - xen_setup_features(); - /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) xen_build_dynamic_phys_to_machine(); @@ -1548,14 +1564,18 @@ asmlinkage void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); - /* We used to do this in xen_arch_setup, but that is too late on AMD - * were early_cpu_init (run before ->arch_setup()) calls early_amd_init - * which pokes 0xcf8 port. - */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); + /* PVH: runs at default kernel iopl of 0 */ + if (!xen_pvh_domain()) { + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); + } #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 68c054f59de6..2137c5101dac 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -563,16 +563,13 @@ void xen_enable_nmi(void) BUG(); #endif } -void __init xen_arch_setup(void) +void __init xen_pvmmu_arch_setup(void) { - xen_panic_handler_init(); - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - HYPERVISOR_vm_assist(VMASST_CMD_enable, - VMASST_TYPE_pae_extended_cr3); + HYPERVISOR_vm_assist(VMASST_CMD_enable, + VMASST_TYPE_pae_extended_cr3); if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) @@ -581,6 +578,15 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); xen_enable_nmi(); +} + +/* This function is not called for HVM domains */ +void __init xen_arch_setup(void) +{ + xen_panic_handler_init(); + if (!xen_feature(XENFEAT_auto_translated_physmap)) + xen_pvmmu_arch_setup(); + #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); -- cgit v1.2.3-59-g8ed1b From 696fd7c5b2ecb31b339019ced4fe15a3f9e7419a Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Sun, 15 Dec 2013 12:37:46 -0500 Subject: xen/pvh: Don't setup P2M tree. P2M is not available for PVH. Fortunatly for us the P2M code already has mostly the support for auto-xlat guest thanks to commit 3d24bbd7dddbea54358a9795abaf051b0f18973c "grant-table: call set_phys_to_machine after mapping grant refs" which: " introduces set_phys_to_machine calls for auto_translated guests (even on x86) in gnttab_map_refs and gnttab_unmap_refs. translated by swiotlb-xen... " so we don't need to muck much. with above mentioned "commit you'll get set_phys_to_machine calls from gnttab_map_refs and gnttab_unmap_refs but PVH guests won't do anything with them " (Stefano Stabellini) which is OK - we want them to be NOPs. This is because we assume that an "IOMMU is always present on the plaform and Xen is going to make the appropriate IOMMU pagetable changes in the hypercall implementation of GNTTABOP_map_grant_ref and GNTTABOP_unmap_grant_ref, then eveything should be transparent from PVH priviligied point of view and DMA transfers involving foreign pages keep working with no issues[sp] Otherwise we would need a P2M (and an M2P) for PVH priviligied to track these foreign pages .. (see arch/arm/xen/p2m.c)." (Stefano Stabellini). We still have to inhibit the building of the P2M tree. That had been done in the past by not calling xen_build_dynamic_phys_to_machine (which setups the P2M tree and gives us virtual address to access them). But we are missing a check for xen_build_mfn_list_list - which was continuing to setup the P2M tree and would blow up at trying to get the virtual address of p2m_missing (which would have been setup by xen_build_dynamic_phys_to_machine). Hence a check is needed to not call xen_build_mfn_list_list when running in auto-xlat mode. Instead of replicating the check for auto-xlat in enlighten.c do it in the p2m.c code. The reason is that the xen_build_mfn_list_list is called also in xen_arch_post_suspend without any checks for auto-xlat. So for PVH or PV with auto-xlat - we would needlessly allocate space for an P2M tree. Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- arch/x86/xen/enlighten.c | 3 +-- arch/x86/xen/p2m.c | 12 ++++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index eb0efc2f9d3c..23ead298edbd 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1488,8 +1488,7 @@ asmlinkage void __init xen_start_kernel(void) x86_configure_nx(); /* Get mfn list */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_build_dynamic_phys_to_machine(); + xen_build_dynamic_phys_to_machine(); /* * Set up kernel GDT and segment registers, mainly so that diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2ae8699e8767..fb7ee0a4f9d0 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -280,6 +280,9 @@ void __ref xen_build_mfn_list_list(void) { unsigned long pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); @@ -346,10 +349,15 @@ void xen_setup_mfn_list_list(void) /* Set up p2m_top to point to the domain-builder provided p2m pages */ void __init xen_build_dynamic_phys_to_machine(void) { - unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; - unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); + unsigned long *mfn_list; + unsigned long max_pfn; unsigned long pfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + mfn_list = (unsigned long *)xen_start_info->mfn_list; + max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); xen_max_p2m_pfn = max_pfn; p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); -- cgit v1.2.3-59-g8ed1b From 32df75cd148b43e007848ddbfdb1ea25535114cb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 31 Dec 2013 12:37:52 -0500 Subject: xen/mmu/p2m: Refactor the xen_pagetable_init code (v2). The revectoring and copying of the P2M only happens when !auto-xlat and on 64-bit builds. It is not obvious from the code, so lets have seperate 32 and 64-bit functions. We also invert the check for auto-xlat to make the code flow simpler. Suggested-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 70 +++++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ce563be09cc1..c140efffe37e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1198,44 +1198,40 @@ static void __init xen_cleanhighmap(unsigned long vaddr, * instead of somewhere later and be confusing. */ xen_mc_flush(); } -#endif -static void __init xen_pagetable_init(void) +static void __init xen_pagetable_p2m_copy(void) { -#ifdef CONFIG_X86_64 unsigned long size; unsigned long addr; -#endif - paging_init(); - xen_setup_shared_info(); -#ifdef CONFIG_X86_64 - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long new_mfn_list; + unsigned long new_mfn_list; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + + /* On 32-bit, we get zero so this never gets executed. */ + new_mfn_list = xen_revector_p2m_tree(); + if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + } else + return; - /* On 32-bit, we get zero so this never gets executed. */ - new_mfn_list = xen_revector_p2m_tree(); - if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { - /* using __ka address and sticking INVALID_P2M_ENTRY! */ - memset((void *)xen_start_info->mfn_list, 0xff, size); - - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); - addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ - size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = new_mfn_list; - } else - goto skip; - } /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page @@ -1255,7 +1251,15 @@ static void __init xen_pagetable_init(void) * anything at this stage. */ xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); #endif -skip: +} +#endif + +static void __init xen_pagetable_init(void) +{ + paging_init(); + xen_setup_shared_info(); +#ifdef CONFIG_X86_64 + xen_pagetable_p2m_copy(); #endif xen_post_allocator_init(); } -- cgit v1.2.3-59-g8ed1b From b621e157ba48fb7d36945405de68c5fa25e7b73c Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 3 Jan 2014 14:08:39 -0500 Subject: xen/mmu: Cleanup xen_pagetable_p2m_copy a bit. Stefano noticed that the code runs only under 64-bit so the comments about 32-bit are pointless. Also we change the condition for xen_revector_p2m_tree returning the same value (because it could not allocate a swath of space to put the new P2M in) or it had been called once already. In such we return early from the function. Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/mmu.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c140efffe37e..9d74249542c5 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1209,29 +1209,29 @@ static void __init xen_pagetable_p2m_copy(void) size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - /* On 32-bit, we get zero so this never gets executed. */ new_mfn_list = xen_revector_p2m_tree(); - if (new_mfn_list && new_mfn_list != xen_start_info->mfn_list) { - /* using __ka address and sticking INVALID_P2M_ENTRY! */ - memset((void *)xen_start_info->mfn_list, 0xff, size); - - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); - addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ - size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); - /* And revector! Bye bye old array */ - xen_start_info->mfn_list = new_mfn_list; - } else + /* No memory or already called. */ + if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list) return; + /* using __ka address and sticking INVALID_P2M_ENTRY! */ + memset((void *)xen_start_info->mfn_list, 0xff, size); + + /* We should be in __ka space. */ + BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); + addr = xen_start_info->mfn_list; + /* We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or xen_start_info->shared_info + * they are in going to crash. Fortunatly we have already revectored + * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + size = roundup(size, PMD_SIZE); + xen_cleanhighmap(addr, addr + size); + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + memblock_free(__pa(xen_start_info->mfn_list), size); + /* And revector! Bye bye old array */ + xen_start_info->mfn_list = new_mfn_list; + /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of * the ramdisk). We continue on, erasing PMD entries that point to page -- cgit v1.2.3-59-g8ed1b From 4e44e44b0bd25bc4ed23232f06fc7275f1e4e38d Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Tue, 31 Dec 2013 12:41:27 -0500 Subject: xen/pvh: MMU changes for PVH (v2) .. which are surprisingly small compared to the amount for PV code. PVH uses mostly native mmu ops, we leave the generic (native_*) for the majority and just overwrite the baremetal with the ones we need. At startup, we are running with pre-allocated page-tables courtesy of the tool-stack. But we still need to graft them in the Linux initial pagetables. However there is no need to unpin/pin and change them to R/O or R/W. Note that the xen_pagetable_init due to 7836fec9d0994cc9c9150c5a33f0eb0eb08a335a "xen/mmu/p2m: Refactor the xen_pagetable_init code." does not need any changes - we just need to make sure that xen_post_allocator_init does not alter the pvops from the default native one. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/mmu.c | 81 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9d74249542c5..490ddb354590 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1757,6 +1757,10 @@ static void set_page_prot_flags(void *addr, pgprot_t prot, unsigned long flags) unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); + /* For PVH no need to set R/O or R/W to pin them or unpin them. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } @@ -1867,6 +1871,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * but that's enough to get __va working. We need to fill in the rest * of the physical mapping once some sort of allocator has been set * up. + * NOTE: for PVH, the page tables are native. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -1888,17 +1893,18 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); - /* Pre-constructed entries are in pfn, so convert to mfn */ - /* L4[272] -> level3_ident_pgt - * L4[511] -> level3_kernel_pgt */ - convert_pfn_mfn(init_level4_pgt); - - /* L3_i[0] -> level2_ident_pgt */ - convert_pfn_mfn(level3_ident_pgt); - /* L3_k[510] -> level2_kernel_pgt - * L3_i[511] -> level2_fixmap_pgt */ - convert_pfn_mfn(level3_kernel_pgt); - + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Pre-constructed entries are in pfn, so convert to mfn */ + /* L4[272] -> level3_ident_pgt + * L4[511] -> level3_kernel_pgt */ + convert_pfn_mfn(init_level4_pgt); + + /* L3_i[0] -> level2_ident_pgt */ + convert_pfn_mfn(level3_ident_pgt); + /* L3_k[510] -> level2_kernel_pgt + * L3_i[511] -> level2_fixmap_pgt */ + convert_pfn_mfn(level3_kernel_pgt); + } /* We get [511][511] and have Xen's version of level2_kernel_pgt */ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); @@ -1922,31 +1928,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) copy_page(level2_fixmap_pgt, l2); /* Note that we don't do anything with level1_fixmap_pgt which * we don't need. */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); - - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. - */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(init_level4_pgt)); - xen_mc_issue(PARAVIRT_LAZY_CPU); + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(init_level4_pgt)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + } else + native_write_cr3(__pa(init_level4_pgt)); /* We can't that easily rip out L3 and L2, as the Xen pagetables are * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for @@ -2107,6 +2115,9 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) static void __init xen_post_allocator_init(void) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -- cgit v1.2.3-59-g8ed1b From 76bcceff0bfbded075c9703ec5413a9bf50ef8c4 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 3 Jan 2014 09:48:08 -0500 Subject: xen/pvh/mmu: Use PV TLB instead of native. We also optimize one - the TLB flush. The native operation would needlessly IPI offline VCPUs causing extra wakeups. Using the Xen one avoids that and lets the hypervisor determine which VCPU needs the TLB flush. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 490ddb354590..c1d406f35523 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2222,6 +2222,15 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { void __init xen_init_mmu_ops(void) { x86_init.paging.pagetable_init = xen_pagetable_init; + + /* Optimization - we can use the HVM one but it has no idea which + * VCPUs are descheduled - which means that it will needlessly IPI + * them. Xen knows so let it do the job. + */ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + pv_mmu_ops.flush_tlb_others = xen_flush_tlb_others; + return; + } pv_mmu_ops = xen_mmu_ops; memset(dummy_mapping, 0xff, PAGE_SIZE); -- cgit v1.2.3-59-g8ed1b From 4dd322bc3b25be40dfa91e8ac483b846f3e8dffc Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Tue, 31 Dec 2013 14:02:44 -0500 Subject: xen/pvh: Setup up shared_info. For PVHVM the shared_info structure is provided via the same way as for normal PV guests (see include/xen/interface/xen.h). That is during bootup we get 'xen_start_info' via the %esi register in startup_xen. Then later we extract the 'shared_info' from said structure (in xen_setup_shared_info) and start using it. The 'xen_setup_shared_info' is all setup to work with auto-xlat guests, but there are two functions which it calls that are not: xen_setup_mfn_list_list and xen_setup_vcpu_info_placement. This patch modifies the P2M code (xen_setup_mfn_list_list) while the "Piggyback on PVHVM for event channels" modifies the xen_setup_vcpu_info_placement. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index fb7ee0a4f9d0..696c694986d0 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -339,6 +339,9 @@ void __ref xen_build_mfn_list_list(void) void xen_setup_mfn_list_list(void) { + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = -- cgit v1.2.3-59-g8ed1b From 8d656bbe43aee6d1be6b49fcf8acbc04588472bc Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 13:03:37 -0500 Subject: xen/pvh: Load GDT/GS in early PV bootup code for BSP. During early bootup we start life using the Xen provided GDT, which means that we are running with %cs segment set to FLAT_KERNEL_CS (FLAT_RING3_CS64 0xe033, GDT index 261). But for PVH we want to be use HVM type mechanism for segment operations. As such we need to switch to the HVM one and also reload ourselves with the __KERNEL_CS:eip to run in the proper GDT and segment. For HVM this is usually done in 'secondary_startup_64' in (head_64.S) but since we are not taking that bootup path (we start in PV - xen_start_kernel) we need to do that in the early PV bootup paths. For good measure we also zero out the %fs, %ds, and %es (not strictly needed as Xen has already cleared them for us). The %gs is loaded by 'switch_to_new_gdt'. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel --- arch/x86/xen/enlighten.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 23ead298edbd..1170d00879d5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1410,8 +1410,43 @@ static void __init xen_boot_params_init_edd(void) * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. */ -static void __init xen_setup_stackprotector(void) +static void __init xen_setup_gdt(void) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { +#ifdef CONFIG_X86_64 + unsigned long dummy; + + switch_to_new_gdt(0); /* GDT and GS set */ + + /* We are switching of the Xen provided GDT to our HVM mode + * GDT. The new GDT has __KERNEL_CS with CS.L = 1 + * and we are jumping to reload it. + */ + asm volatile ("pushq %0\n" + "leaq 1f(%%rip),%0\n" + "pushq %0\n" + "lretq\n" + "1:\n" + : "=&r" (dummy) : "0" (__KERNEL_CS)); + + /* + * While not needed, we also set the %es, %ds, and %fs + * to zero. We don't care about %ss as it is NULL. + * Strictly speaking this is not needed as Xen zeros those + * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) + * + * Linux zeros them in cpu_init() and in secondary_startup_64 + * (for BSP). + */ + loadsegment(es, 0); + loadsegment(ds, 0); + loadsegment(fs, 0); +#else + /* PVH: TODO Implement. */ + BUG(); +#endif + return; /* PVH does not need any PV GDT ops. */ + } pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; @@ -1494,7 +1529,7 @@ asmlinkage void __init xen_start_kernel(void) * Set up kernel GDT and segment registers, mainly so that * -fstack-protector code can be executed. */ - xen_setup_stackprotector(); + xen_setup_gdt(); xen_init_irq_ops(); xen_init_cpuid_mask(); -- cgit v1.2.3-59-g8ed1b From 5840c84b16aad223d5305d8a569ea55de4120d67 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 11:48:08 -0500 Subject: xen/pvh: Secondary VCPU bringup (non-bootup CPUs) The VCPU bringup protocol follows the PV with certain twists. From xen/include/public/arch-x86/xen.h: Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise for HVM and PVH guests, not all information in this structure is updated: - For HVM guests, the structures read include: fpu_ctxt (if VGCT_I387_VALID is set), flags, user_regs, debugreg[*] - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to set cr3. All other fields not used should be set to 0. This is what we do. We piggyback on the 'xen_setup_gdt' - but modify a bit - we need to call 'load_percpu_segment' so that 'switch_to_new_gdt' can load per-cpu data-structures. It has no effect on the VCPU0. We also piggyback on the %rdi register to pass in the CPU number - so that when we bootup a new CPU, the cpu_bringup_and_idle will have passed as the first parameter the CPU number (via %rdi for 64-bit). Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 11 ++++++++--- arch/x86/xen/smp.c | 49 ++++++++++++++++++++++++++++++++---------------- arch/x86/xen/xen-ops.h | 1 + 3 files changed, 42 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1170d00879d5..2eca6187fc92 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1409,14 +1409,19 @@ static void __init xen_boot_params_init_edd(void) * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. + * + * Note, that it is __ref because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions. */ -static void __init xen_setup_gdt(void) +void __ref xen_setup_gdt(int cpu) { if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_X86_64 unsigned long dummy; - switch_to_new_gdt(0); /* GDT and GS set */ + load_percpu_segment(cpu); /* We need to access per-cpu area */ + switch_to_new_gdt(cpu); /* GDT and GS set */ /* We are switching of the Xen provided GDT to our HVM mode * GDT. The new GDT has __KERNEL_CS with CS.L = 1 @@ -1529,7 +1534,7 @@ asmlinkage void __init xen_start_kernel(void) * Set up kernel GDT and segment registers, mainly so that * -fstack-protector code can be executed. */ - xen_setup_gdt(); + xen_setup_gdt(0); xen_init_irq_ops(); xen_init_cpuid_mask(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c36b325abd83..5e46190133b2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -73,9 +73,11 @@ static void cpu_bringup(void) touch_softlockup_watchdog(); preempt_disable(); - xen_enable_sysenter(); - xen_enable_syscall(); - + /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ + if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { + xen_enable_sysenter(); + xen_enable_syscall(); + } cpu = smp_processor_id(); smp_store_cpu_info(cpu); cpu_data(cpu).x86_max_cores = 1; @@ -97,8 +99,14 @@ static void cpu_bringup(void) wmb(); /* make sure everything is out */ } -static void cpu_bringup_and_idle(void) +/* Note: cpu parameter is only relevant for PVH */ +static void cpu_bringup_and_idle(int cpu) { +#ifdef CONFIG_X86_64 + if (xen_feature(XENFEAT_auto_translated_physmap) && + xen_feature(XENFEAT_supervisor_mode_kernel)) + xen_setup_gdt(cpu); +#endif cpu_bringup(); cpu_startup_entry(CPUHP_ONLINE); } @@ -274,9 +282,10 @@ static void __init xen_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); if (xen_pv_domain()) { - /* We've switched to the "real" per-cpu gdt, so make sure the - old memory can be recycled */ - make_lowmem_page_readwrite(xen_initial_gdt); + if (!xen_feature(XENFEAT_writable_page_tables)) + /* We've switched to the "real" per-cpu gdt, so make + * sure the old memory can be recycled. */ + make_lowmem_page_readwrite(xen_initial_gdt); #ifdef CONFIG_X86_32 /* @@ -360,22 +369,21 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_table(cpu); - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.ss = __KERNEL_DS; #ifdef CONFIG_X86_32 + /* Note: PVH is not yet supported on x86_32. */ ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; -#else - ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - { + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; xen_copy_trap_info(ctxt->trap_ctxt); @@ -396,18 +404,27 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; ctxt->failsafe_callback_cs = __KERNEL_CS; +#else + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); +#ifdef CONFIG_X86_32 } - ctxt->user_regs.cs = __KERNEL_CS; +#else + } else + /* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with + * %rdi having the cpu number - which means are passing in + * as the first parameter the cpu. Subtle! + */ + ctxt->user_regs.rdi = cpu; +#endif ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); - - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) BUG(); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 95f8c6142328..9059c24ed564 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -123,4 +123,5 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); +void xen_setup_gdt(int cpu); #endif /* XEN_OPS_H */ -- cgit v1.2.3-59-g8ed1b From 9103bb0f8240b2a55aac3ff7ecba9c7dcf66b08b Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Fri, 13 Dec 2013 13:02:58 -0500 Subject: xen/pvh: Update E820 to work with PVH (v2) In xen_add_extra_mem() we can skip updating P2M as it's managed by Xen. PVH maps the entire IO space, but only RAM pages need to be repopulated. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- arch/x86/xen/setup.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2137c5101dac..dd5f905e33d5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,6 +27,7 @@ #include #include #include +#include "mmu.h" #include "xen-ops.h" #include "vdso.h" @@ -81,6 +82,9 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + xen_max_p2m_pfn = PFN_DOWN(start + size); for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -103,6 +107,7 @@ static unsigned long __init xen_do_chunk(unsigned long start, .domid = DOMID_SELF }; unsigned long len = 0; + int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap); unsigned long pfn; int ret; @@ -116,7 +121,7 @@ static unsigned long __init xen_do_chunk(unsigned long start, continue; frame = mfn; } else { - if (mfn != INVALID_P2M_ENTRY) + if (!xlated_phys && mfn != INVALID_P2M_ENTRY) continue; frame = pfn; } @@ -154,6 +159,13 @@ static unsigned long __init xen_do_chunk(unsigned long start, static unsigned long __init xen_release_chunk(unsigned long start, unsigned long end) { + /* + * Xen already ballooned out the E820 non RAM regions for us + * and set them up properly in EPT. + */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return end - start; + return xen_do_chunk(start, end, true); } @@ -222,7 +234,13 @@ static void __init xen_set_identity_and_release_chunk( * (except for the ISA region which must be 1:1 mapped) to * release the refcounts (in Xen) on the original frames. */ - for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { + + /* + * PVH E820 matches the hypervisor's P2M which means we need to + * account for the proper values of *release and *identity. + */ + for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) && + pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) { pte_t pte = __pte_ma(0); if (pfn < PFN_UP(ISA_END_ADDRESS)) -- cgit v1.2.3-59-g8ed1b From 2771374d47220c7ec271281437625e9519505bb2 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Wed, 11 Dec 2013 15:36:51 -0500 Subject: xen/pvh: Piggyback on PVHVM for event channels (v2) PVH is a PV guest with a twist - there are certain things that work in it like HVM and some like PV. There is a similar mode - PVHVM where we run in HVM mode with PV code enabled - and this patch explores that. The most notable PV interfaces are the XenBus and event channels. We will piggyback on how the event channel mechanism is used in PVHVM - that is we want the normal native IRQ mechanism and we will install a vector (hvm callback) for which we will call the event channel mechanism. This means that from a pvops perspective, we can use native_irq_ops instead of the Xen PV specific. Albeit in the future we could support pirq_eoi_map. But that is a feature request that can be shared with PVHVM. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: David Vrabel Acked-by: Stefano Stabellini --- arch/x86/xen/enlighten.c | 5 +++-- arch/x86/xen/irq.c | 5 ++++- drivers/xen/events/events_base.c | 14 +++++++++----- 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2eca6187fc92..a4e2f30917a3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1144,8 +1144,9 @@ void xen_setup_vcpu_info_placement(void) xen_vcpu_setup(cpu); /* xen_vcpu_setup managed to place the vcpu_info within the - percpu area for all cpus, so make use of it */ - if (have_vcpu_info_placement) { + * percpu area for all cpus, so make use of it. Note that for + * PVH we want to use native IRQ mechanism. */ + if (have_vcpu_info_placement && !xen_pvh_domain()) { pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 0da7f863056f..76ca326105f7 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -128,6 +129,8 @@ static const struct pv_irq_ops xen_irq_ops __initconst = { void __init xen_init_irq_ops(void) { - pv_irq_ops = xen_irq_ops; + /* For PVH we use default pv_irq_ops settings. */ + if (!xen_feature(XENFEAT_hvm_callback_vector)) + pv_irq_ops = xen_irq_ops; x86_init.irqs.intr_init = xen_init_IRQ; } diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 1d16185e82b2..4672e003c0ad 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1685,8 +1685,15 @@ void __init xen_init_IRQ(void) pirq_needs_eoi = pirq_needs_eoi_flag; #ifdef CONFIG_X86 - if (xen_hvm_domain()) { + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) xen_callback_vector(); + + if (xen_hvm_domain()) { native_init_IRQ(); /* pci_xen_hvm_init must be called after native_init_IRQ so that * __acpi_register_gsi can point at the right function */ @@ -1695,13 +1702,10 @@ void __init xen_init_IRQ(void) int rc; struct physdev_pirq_eoi_gmfn eoi_gmfn; - irq_ctx_init(smp_processor_id()); - if (xen_initial_domain()) - pci_xen_initial_domain(); - pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ if (rc != 0) { free_page((unsigned long) pirq_eoi_map); pirq_eoi_map = NULL; -- cgit v1.2.3-59-g8ed1b From efaf30a3357872cf0fc7d555b1f9968ec71535d3 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 6 Jan 2014 10:40:36 -0500 Subject: xen/grant: Implement an grant frame array struct (v3). The 'xen_hvm_resume_frames' used to be an 'unsigned long' and contain the virtual address of the grants. That was OK for most architectures (PVHVM, ARM) were the grants are contiguous in memory. That however is not the case for PVH - in which case we will have to do a lookup for each virtual address for the PFN. Instead of doing that, lets make it a structure which will contain the array of PFNs, the virtual address and the count of said PFNs. Also provide a generic functions: gnttab_setup_auto_xlat_frames and gnttab_free_auto_xlat_frames to populate said structure with appropriate values for PVHVM and ARM. To round it off, change the name from 'xen_hvm_resume_frames' to a more descriptive one - 'xen_auto_xlat_grant_frames'. For PVH, in patch "xen/pvh: Piggyback on PVHVM for grant driver" we will populate the 'xen_auto_xlat_grant_frames' by ourselves. v2 moves the xen_remap in the gnttab_setup_auto_xlat_frames and also introduces xen_unmap for gnttab_free_auto_xlat_frames. Suggested-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk [v3: Based on top of 'asm/xen/page.h: remove redundant semicolon'] Acked-by: Stefano Stabellini --- arch/arm/include/asm/xen/page.h | 1 + arch/arm/xen/enlighten.c | 9 +++++-- arch/x86/include/asm/xen/page.h | 1 + drivers/xen/grant-table.c | 58 ++++++++++++++++++++++++++++++++++++----- drivers/xen/platform-pci.c | 10 ++++--- include/xen/grant_table.h | 9 ++++++- 6 files changed, 75 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index ac6789aad059..709c4b4d2f1d 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -118,5 +118,6 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) } #define xen_remap(cookie, size) ioremap_cached((cookie), (size)) +#define xen_unmap(cookie) iounmap((cookie)) #endif /* _ASM_ARM_XEN_PAGE_H */ diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 85501238b425..2162172c0ddc 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -208,6 +208,7 @@ static int __init xen_guest_init(void) const char *version = NULL; const char *xen_prefix = "xen,xen-"; struct resource res; + unsigned long grant_frames; node = of_find_compatible_node(NULL, NULL, "xen,xen"); if (!node) { @@ -224,10 +225,10 @@ static int __init xen_guest_init(void) } if (of_address_to_resource(node, GRANT_TABLE_PHYSADDR, &res)) return 0; - xen_hvm_resume_frames = res.start; + grant_frames = res.start; xen_events_irq = irq_of_parse_and_map(node, 0); pr_info("Xen %s support found, events_irq=%d gnttab_frame_pfn=%lx\n", - version, xen_events_irq, (xen_hvm_resume_frames >> PAGE_SHIFT)); + version, xen_events_irq, (grant_frames >> PAGE_SHIFT)); xen_domain_type = XEN_HVM_DOMAIN; xen_setup_features(); @@ -265,6 +266,10 @@ static int __init xen_guest_init(void) if (xen_vcpu_info == NULL) return -ENOMEM; + if (gnttab_setup_auto_xlat_frames(grant_frames)) { + free_percpu(xen_vcpu_info); + return -ENOMEM; + } gnttab_init(); if (!xen_initial_domain()) xenbus_probe(NULL); diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 4a092ccdd147..3e276eb23d1b 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -227,5 +227,6 @@ void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); #define xen_remap(cookie, size) ioremap((cookie), (size)); +#define xen_unmap(cookie) iounmap((cookie)) #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index e69c7780c208..44b75ccfbbff 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -65,8 +65,7 @@ static unsigned int nr_grant_frames; static int gnttab_free_count; static grant_ref_t gnttab_free_head; static DEFINE_SPINLOCK(gnttab_list_lock); -unsigned long xen_hvm_resume_frames; -EXPORT_SYMBOL_GPL(xen_hvm_resume_frames); +struct grant_frames xen_auto_xlat_grant_frames; static union { struct grant_entry_v1 *v1; @@ -838,6 +837,51 @@ unsigned int gnttab_max_grant_frames(void) } EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); +int gnttab_setup_auto_xlat_frames(unsigned long addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=0x%08lx)!\n", + addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + /* Handling of paged out grant targets (GNTST_eagain) */ #define MAX_DELAY 256 static inline void @@ -1068,6 +1112,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); /* * Loop backwards, so that the first hypercall has the largest * index, ensuring that the table will grow only once. @@ -1076,7 +1121,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) xatp.domid = DOMID_SELF; xatp.idx = i; xatp.space = XENMAPSPACE_grant_table; - xatp.gpfn = (xen_hvm_resume_frames >> PAGE_SHIFT) + i; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); if (rc != 0) { pr_warn("grant table add_to_physmap failed, err=%d\n", @@ -1174,11 +1219,10 @@ static int gnttab_setup(void) return -ENOSYS; if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { - gnttab_shared.addr = xen_remap(xen_hvm_resume_frames, - PAGE_SIZE * max_nr_gframes); + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; if (gnttab_shared.addr == NULL) { - pr_warn("Failed to ioremap gnttab share frames (addr=0x%08lx)!\n", - xen_hvm_resume_frames); + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); return -ENOMEM; } } diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 2f3528e93cb9..f1947ac218d9 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -108,6 +108,7 @@ static int platform_pci_init(struct pci_dev *pdev, long ioaddr; long mmio_addr, mmio_len; unsigned int max_nr_gframes; + unsigned long grant_frames; if (!xen_domain()) return -ENODEV; @@ -154,13 +155,16 @@ static int platform_pci_init(struct pci_dev *pdev, } max_nr_gframes = gnttab_max_grant_frames(); - xen_hvm_resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + if (gnttab_setup_auto_xlat_frames(grant_frames)) + goto out; ret = gnttab_init(); if (ret) - goto out; + goto grant_out; xenbus_probe(NULL); return 0; - +grant_out: + gnttab_free_auto_xlat_frames(); out: pci_release_region(pdev, 0); mem_out: diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index 694dcaf266e6..5acb1e4ac0d3 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -178,8 +178,15 @@ int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, grant_status_t **__shared); void arch_gnttab_unmap(void *shared, unsigned long nr_gframes); -extern unsigned long xen_hvm_resume_frames; +struct grant_frames { + xen_pfn_t *pfn; + unsigned int count; + void *vaddr; +}; +extern struct grant_frames xen_auto_xlat_grant_frames; unsigned int gnttab_max_grant_frames(void); +int gnttab_setup_auto_xlat_frames(unsigned long addr); +void gnttab_free_auto_xlat_frames(void); #define gnttab_map_vaddr(map) ((void *)(map.host_virt_addr)) -- cgit v1.2.3-59-g8ed1b From 6926f6d6109714aab7b26df7099b12555e36676f Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 3 Jan 2014 10:20:18 -0500 Subject: xen/pvh: Piggyback on PVHVM for grant driver (v4) In PVH the shared grant frame is the PFN and not MFN, hence its mapped via the same code path as HVM. The allocation of the grant frame is done differently - we do not use the early platform-pci driver and have an ioremap area - instead we use balloon memory and stitch all of the non-contingous pages in a virtualized area. That means when we call the hypervisor to replace the GMFN with a XENMAPSPACE_grant_table type, we need to lookup the old PFN for every iteration instead of assuming a flat contingous PFN allocation. Lastly, we only use v1 for grants. This is because PVHVM is not able to use v2 due to no XENMEM_add_to_physmap calls on the error status page (see commit 69e8f430e243d657c2053f097efebc2e2cd559f0 xen/granttable: Disable grant v2 for HVM domains.) Until that is implemented this workaround has to be in place. Also per suggestions by Stefano utilize the PVHVM paths as they share common functionality. v2 of this patch moves most of the PVH code out in the arch/x86/xen/grant-table driver and touches only minimally the generic driver. v3, v4: fixes us some of the code due to earlier patches. Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/grant-table.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++ drivers/xen/gntdev.c | 2 +- drivers/xen/grant-table.c | 9 ++++--- 3 files changed, 68 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 3a5f55d51907..2d719799fd6a 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -125,3 +125,65 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) apply_to_page_range(&init_mm, (unsigned long)shared, PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); } +#ifdef CONFIG_XEN_PVH +#include +#include +#include +static int __init xlated_setup_gnttab_pages(void) +{ + struct page **pages; + xen_pfn_t *pfns; + int rc; + unsigned int i; + unsigned long nr_grant_frames = gnttab_max_grant_frames(); + + BUG_ON(nr_grant_frames == 0); + pages = kcalloc(nr_grant_frames, sizeof(pages[0]), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + pfns = kcalloc(nr_grant_frames, sizeof(pfns[0]), GFP_KERNEL); + if (!pfns) { + kfree(pages); + return -ENOMEM; + } + rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */); + if (rc) { + pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__, + nr_grant_frames, rc); + kfree(pages); + kfree(pfns); + return rc; + } + for (i = 0; i < nr_grant_frames; i++) + pfns[i] = page_to_pfn(pages[i]); + + rc = arch_gnttab_map_shared(pfns, nr_grant_frames, nr_grant_frames, + &xen_auto_xlat_grant_frames.vaddr); + + kfree(pages); + if (rc) { + pr_warn("%s Couldn't map %ld pfns rc:%d\n", __func__, + nr_grant_frames, rc); + free_xenballooned_pages(nr_grant_frames, pages); + kfree(pfns); + return rc; + } + + xen_auto_xlat_grant_frames.pfn = pfns; + xen_auto_xlat_grant_frames.count = nr_grant_frames; + + return 0; +} + +static int __init xen_pvh_gnttab_setup(void) +{ + if (!xen_pvh_domain()) + return -ENODEV; + + return xlated_setup_gnttab_pages(); +} +/* Call it _before_ __gnttab_init as we need to initialize the + * xen_auto_xlat_grant_frames first. */ +core_initcall(xen_pvh_gnttab_setup); +#endif diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index e41c79c986ea..073b4a19a8b0 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -846,7 +846,7 @@ static int __init gntdev_init(void) if (!xen_domain()) return -ENODEV; - use_ptemod = xen_pv_domain(); + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); err = misc_register(&gntdev_miscdev); if (err != 0) { diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 44b75ccfbbff..1d5fbce4acb7 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1108,7 +1108,7 @@ static int gnttab_map(unsigned int start_idx, unsigned int end_idx) unsigned int nr_gframes = end_idx + 1; int rc; - if (xen_hvm_domain()) { + if (xen_feature(XENFEAT_auto_translated_physmap)) { struct xen_add_to_physmap xatp; unsigned int i = end_idx; rc = 0; @@ -1184,7 +1184,7 @@ static void gnttab_request_version(void) int rc; struct gnttab_set_version gsv; - if (xen_hvm_domain()) + if (xen_feature(XENFEAT_auto_translated_physmap)) gsv.version = 1; else gsv.version = 2; @@ -1327,5 +1327,6 @@ static int __gnttab_init(void) return gnttab_init(); } - -core_initcall(__gnttab_init); +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. */ +core_initcall_sync(__gnttab_init); -- cgit v1.2.3-59-g8ed1b From 4e903a20da51ed2329c1b9c182dba74f47ac2ca8 Mon Sep 17 00:00:00 2001 From: Mukesh Rathor Date: Tue, 31 Dec 2013 11:16:25 -0500 Subject: xen/pvh: Support ParaVirtualized Hardware extensions (v3). PVH allows PV linux guest to utilize hardware extended capabilities, such as running MMU updates in a HVM container. The Xen side defines PVH as (from docs/misc/pvh-readme.txt, with modifications): "* the guest uses auto translate: - p2m is managed by Xen - pagetables are owned by the guest - mmu_update hypercall not available * it uses event callback and not vlapic emulation, * IDT is native, so set_trap_table hcall is also N/A for a PVH guest. For a full list of hcalls supported for PVH, see pvh_hypercall64_table in arch/x86/hvm/hvm.c in xen. From the ABI prespective, it's mostly a PV guest with auto translate, although it does use hvm_op for setting callback vector." Use .ascii and .asciz to define xen feature string. Note, the PVH string must be in a single line (not multiple lines with \) to keep the assembler from putting null char after each string before \. This patch allows it to be configured and enabled. We also use introduce the 'XEN_ELFNOTE_SUPPORTED_FEATURES' ELF note to tell the hypervisor that 'hvm_callback_vector' is what the kernel needs. We can not put it in 'XEN_ELFNOTE_FEATURES' as older hypervisor parse fields they don't understand as errors and refuse to load the kernel. This work-around fixes the problem. Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk Acked-by: Stefano Stabellini --- arch/x86/xen/Kconfig | 2 +- arch/x86/xen/xen-head.S | 25 ++++++++++++++++++++++++- include/xen/interface/elfnote.h | 13 +++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index e7d05900efac..d88bfd66aaab 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -53,6 +53,6 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" - depends on X86_64 && XEN && BROKEN + depends on X86_64 && XEN select XEN_PVHVM def_bool n diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7faed5869e5b..485b69585540 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -11,8 +11,28 @@ #include #include +#include #include +#ifdef CONFIG_XEN_PVH +#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" +/* Note the lack of 'hvm_callback_vector'. Older hypervisor will + * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in + * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. + */ +#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ + (1 << XENFEAT_auto_translated_physmap) | \ + (1 << XENFEAT_supervisor_mode_kernel) | \ + (1 << XENFEAT_hvm_callback_vector)) +/* The XENFEAT_writable_page_tables is not stricly neccessary as we set that + * up regardless whether this CONFIG option is enabled or not, but it + * clarifies what the right flags need to be. + */ +#else +#define PVH_FEATURES_STR "" +#define PVH_FEATURES (0) +#endif + __INIT ENTRY(startup_xen) cld @@ -95,7 +115,10 @@ NEXT_HYPERCALL(arch_6) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | + (1 << XENFEAT_writable_page_tables) | + (1 << XENFEAT_dom0)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 0360b15f4883..6f4eae328ca7 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -140,6 +140,19 @@ */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 +/* + * The features supported by this kernel (numeric). + * + * Other than XEN_ELFNOTE_FEATURES on pre-4.2 Xen, this note allows a + * kernel to specify support for features that older hypervisors don't + * know about. The set of features 4.2 and newer hypervisors will + * consider supported by the kernel is the combination of the sets + * specified through this and the string note. + * + * LEGACY: FEATURES + */ +#define XEN_ELFNOTE_SUPPORTED_FEATURES 17 + #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ /* -- cgit v1.2.3-59-g8ed1b From 0869a642326e786544ef3e307175dcec150fe3dc Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 7 Jan 2014 09:56:06 -0500 Subject: xen/pvh: Fix compile issues with xen_pvh_domain() Oddly enough it compiles for my ancient compiler but with the supplied .config it does blow up. Fix is easy enough. Reported-by: kbuild test robot Reported-by: Jim Davis Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/grant-table.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 2d719799fd6a..103c93f874b2 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -128,6 +128,7 @@ void arch_gnttab_unmap(void *shared, unsigned long nr_gframes) #ifdef CONFIG_XEN_PVH #include #include +#include #include static int __init xlated_setup_gnttab_pages(void) { -- cgit v1.2.3-59-g8ed1b From 5602aba808aa5440e4b8fa4340079cb2047651ee Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 7 Jan 2014 21:37:09 +0800 Subject: xen/pvh: remove duplicated include from enlighten.c Remove duplicated include. Signed-off-by: Wei Yongjun Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a4e2f30917a3..b6d61c353fe5 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3-59-g8ed1b From 54d44eb3c7f6f893ae3b3cfb545145aa337976ca Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 10 Jan 2014 10:45:35 -0500 Subject: xen/pvh: Use 'depend' instead of 'select'. The usage of 'select' means it will enable the CONFIG options without checking their dependencies. That meant we would inadvertently turn on CONFIG_XEN_PVHM while its core dependency (CONFIG_PCI) was turned off. This patch fixes the warnings and compile failures: warning: (XEN_PVH) selects XEN_PVHVM which has unmet direct dependencies (HYPERVISOR_GUEST && XEN && PCI && X86_LOCAL_APIC) Reported-by: Jim Davis Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index d88bfd66aaab..01b90261fa38 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -53,6 +53,5 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" - depends on X86_64 && XEN - select XEN_PVHVM + depends on X86_64 && XEN && XEN_PVHVM def_bool n -- cgit v1.2.3-59-g8ed1b From c9f6e9977e38de15da96b732a8dec0ef56cbf977 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Mon, 20 Jan 2014 09:20:07 -0500 Subject: xen/pvh: Set X86_CR0_WP and others in CR0 (v2) otherwise we will get for some user-space applications that use 'clone' with CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID end up hitting an assert in glibc manifested by: general protection ip:7f80720d364c sp:7fff98fd8a80 error:0 in libc-2.13.so[7f807209e000+180000] This is due to the nature of said operations which sets and clears the PID. "In the successful one I can see that the page table of the parent process has been updated successfully to use a different physical page, so the write of the tid on that page only affects the child... On the other hand, in the failed case, the write seems to happen before the copy of the original page is done, so both the parent and the child end up with the same value (because the parent copies the page after the write of the child tid has already happened)." (Roger's analysis). The nature of this is due to the Xen's commit of 51e2cac257ec8b4080d89f0855c498cbbd76a5e5 "x86/pvh: set only minimal cr0 and cr4 flags in order to use paging" the CR0_WP was removed so COW features of the Linux kernel were not operating properly. While doing that also update the rest of the CR0 flags to be inline with what a baremetal Linux kernel would set them to. In 'secondary_startup_64' (baremetal Linux) sets: X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | X86_CR0_PG The hypervisor for HVM type guests (which PVH is a bit) sets: X86_CR0_PE | X86_CR0_ET | X86_CR0_TS For PVH it specifically sets: X86_CR0_PG Which means we need to set the rest: X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM to have full parity. Signed-off-by: Roger Pau Monne Signed-off-by: Mukesh Rathor Signed-off-by: Konrad Rzeszutek Wilk [v1: Took out the cr4 writes to be a seperate patch] [v2: 0-DAY kernel found xen_setup_gdt to be missing a static] --- arch/x86/xen/enlighten.c | 33 ++++++++++++++++++++++++++++++--- arch/x86/xen/smp.c | 2 +- arch/x86/xen/xen-ops.h | 2 +- 3 files changed, 32 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b6d61c353fe5..a4d7b647867f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1414,7 +1414,7 @@ static void __init xen_boot_params_init_edd(void) * is PVH which is not going to use xen_load_gdt_boot or other * __init functions. */ -void __ref xen_setup_gdt(int cpu) +static void __ref xen_setup_gdt(int cpu) { if (xen_feature(XENFEAT_auto_translated_physmap)) { #ifdef CONFIG_X86_64 @@ -1462,13 +1462,40 @@ void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } +/* + * A PV guest starts with default flags that are not set for PVH, set them + * here asap. + */ +static void xen_pvh_set_cr_flags(int cpu) +{ + + /* Some of these are setup in 'secondary_startup_64'. The others: + * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests + * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ + write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); +} + +/* + * Note, that it is ref - because the only caller of this after init + * is PVH which is not going to use xen_load_gdt_boot or other + * __init functions. + */ +void __ref xen_pvh_secondary_vcpu_init(int cpu) +{ + xen_setup_gdt(cpu); + xen_pvh_set_cr_flags(cpu); +} + static void __init xen_pvh_early_guest_init(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) return; - if (xen_feature(XENFEAT_hvm_callback_vector)) - xen_have_vector_callback = 1; + if (!xen_feature(XENFEAT_hvm_callback_vector)) + return; + + xen_have_vector_callback = 1; + xen_pvh_set_cr_flags(0); #ifdef CONFIG_X86_32 BUG(); /* PVH: Implement proper support. */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 5e46190133b2..a18eadd8bb40 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -105,7 +105,7 @@ static void cpu_bringup_and_idle(int cpu) #ifdef CONFIG_X86_64 if (xen_feature(XENFEAT_auto_translated_physmap) && xen_feature(XENFEAT_supervisor_mode_kernel)) - xen_setup_gdt(cpu); + xen_pvh_secondary_vcpu_init(cpu); #endif cpu_bringup(); cpu_startup_entry(CPUHP_ONLINE); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9059c24ed564..1cb6f4c37300 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -123,5 +123,5 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); -void xen_setup_gdt(int cpu); +void xen_pvh_secondary_vcpu_init(int cpu); #endif /* XEN_OPS_H */ -- cgit v1.2.3-59-g8ed1b