From 034702a64a6692a8d5d0d9630064a014fc633728 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 10 Jan 2017 14:32:51 +0100 Subject: xen/netfront: set default upper limit of tx/rx queues to 8 The default for the number of tx/rx queues of one interface is the number of vcpus of the system today. As each queue pair reserves 512 grant pages this default consumes a ridiculous number of grants for large guests. Limit the queue number to 8 as default. This value can be modified via a module parameter if required. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/net/xen-netfront.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index 8315fe73ecd0..7ce5d2f96b51 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -57,6 +57,7 @@ #include /* Module parameters */ +#define MAX_QUEUES_DEFAULT 8 static unsigned int xennet_max_queues; module_param_named(max_queues, xennet_max_queues, uint, 0644); MODULE_PARM_DESC(max_queues, @@ -2164,11 +2165,12 @@ static int __init netif_init(void) pr_info("Initialising Xen virtual ethernet driver\n"); - /* Allow as many queues as there are CPUs if user has not + /* Allow as many queues as there are CPUs but max. 8 if user has not * specified a value. */ if (xennet_max_queues == 0) - xennet_max_queues = num_online_cpus(); + xennet_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT, + num_online_cpus()); return xenbus_register_frontend(&netfront_driver); } -- cgit v1.2.3-59-g8ed1b From 56dd5af9bc23d0d5d23bb207c477715b4c2216c5 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 10 Jan 2017 14:32:52 +0100 Subject: xen/netback: set default upper limit of tx/rx queues to 8 The default for the maximum number of tx/rx queues of one interface is the number of cpus of the system today. As each queue pair reserves 512 grant pages this default consumes a ridiculous number of grants for large guests. Limit the queue number to 8 as default. This value can be modified via a module parameter if required. Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/net/xen-netback/netback.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 47b481095d77..f9bcf4a665bc 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -67,6 +67,7 @@ module_param(rx_drain_timeout_msecs, uint, 0444); unsigned int rx_stall_timeout_msecs = 60000; module_param(rx_stall_timeout_msecs, uint, 0444); +#define MAX_QUEUES_DEFAULT 8 unsigned int xenvif_max_queues; module_param_named(max_queues, xenvif_max_queues, uint, 0644); MODULE_PARM_DESC(max_queues, @@ -1622,11 +1623,12 @@ static int __init netback_init(void) if (!xen_domain()) return -ENODEV; - /* Allow as many queues as there are CPUs if user has not + /* Allow as many queues as there are CPUs but max. 8 if user has not * specified a value.
*/ if (xenvif_max_queues == 0) - xenvif_max_queues = num_online_cpus(); + xenvif_max_queues = min_t(unsigned int, MAX_QUEUES_DEFAULT, + num_online_cpus()); if (fatal_skb_slots < XEN_NETBK_LEGACY_SLOTS_MAX) { pr_info("fatal_skb_slots too small (%d), bump it to XEN_NETBK_LEGACY_SLOTS_MAX (%d)\n", -- cgit v1.2.3-59-g8ed1b From 4fed1b125eb6252bde478665fc05d4819f774fa8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2017 01:54:05 -0700 Subject: xen/manage: correct return value check on xenbus_scanf() A negative return value indicates an error; in fact the function at present won't ever return zero. Signed-off-by: Jan Beulich Reviewed-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 26e5e8507f03..357a8db859c9 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -277,7 +277,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, err = xenbus_transaction_start(&xbt); if (err) return; - if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { + if (xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key) < 0) { pr_err("Unable to read sysrq code in control/sysrq\n"); xenbus_transaction_end(xbt, 1); return; -- cgit v1.2.3-59-g8ed1b From 063334f30543597430f172bd7690d21e3590e148 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Fri, 3 Feb 2017 16:57:22 -0500 Subject: xen/x86: Remove PVH support We are replacing existing PVH guests with new implementation. We are keeping xen_pvh_domain() macro (for now set to zero) because when we introduce new PVH implementation later in this series we will reuse current PVH-specific code (xen_pvh_gnttab_setup()), and that code is conditioned by 'if (xen_pvh_domain())'. (We will also need a noop xen_pvh_domain() for !CONFIG_XEN_PVH). Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross Reviewed-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 140 ++++++--------------------------------- arch/x86/xen/mmu.c | 21 +----- arch/x86/xen/setup.c | 37 +---------- arch/x86/xen/smp.c | 78 ++++++++-------------- arch/x86/xen/smp.h | 8 --- arch/x86/xen/xen-head.S | 62 ++--------------- arch/x86/xen/xen-ops.h | 1 - drivers/xen/events/events_base.c | 1 - include/xen/xen.h | 13 +--- 9 files changed, 54 insertions(+), 307 deletions(-) (limited to 'drivers') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 51ef95232725..828f1b226f56 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1138,10 +1138,11 @@ void xen_setup_vcpu_info_placement(void) xen_vcpu_setup(cpu); } - /* xen_vcpu_setup managed to place the vcpu_info within the - * percpu area for all cpus, so make use of it. Note that for - * PVH we want to use native IRQ mechanism. */ - if (have_vcpu_info_placement && !xen_pvh_domain()) { + /* + * xen_vcpu_setup managed to place the vcpu_info within the + * percpu area for all cpus, so make use of it. + */ + if (have_vcpu_info_placement) { pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); @@ -1413,49 +1414,9 @@ static void __init xen_boot_params_init_edd(void) * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. 
- * - * Note, that it is __ref because the only caller of this after init - * is PVH which is not going to use xen_load_gdt_boot or other - * __init functions. */ -static void __ref xen_setup_gdt(int cpu) +static void xen_setup_gdt(int cpu) { - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef CONFIG_X86_64 - unsigned long dummy; - - load_percpu_segment(cpu); /* We need to access per-cpu area */ - switch_to_new_gdt(cpu); /* GDT and GS set */ - - /* We are switching of the Xen provided GDT to our HVM mode - * GDT. The new GDT has __KERNEL_CS with CS.L = 1 - * and we are jumping to reload it. - */ - asm volatile ("pushq %0\n" - "leaq 1f(%%rip),%0\n" - "pushq %0\n" - "lretq\n" - "1:\n" - : "=&r" (dummy) : "0" (__KERNEL_CS)); - - /* - * While not needed, we also set the %es, %ds, and %fs - * to zero. We don't care about %ss as it is NULL. - * Strictly speaking this is not needed as Xen zeros those - * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) - * - * Linux zeros them in cpu_init() and in secondary_startup_64 - * (for BSP). - */ - loadsegment(es, 0); - loadsegment(ds, 0); - loadsegment(fs, 0); -#else - /* PVH: TODO Implement. */ - BUG(); -#endif - return; /* PVH does not need any PV GDT ops. */ - } pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; @@ -1466,59 +1427,6 @@ static void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } -#ifdef CONFIG_XEN_PVH -/* - * A PV guest starts with default flags that are not set for PVH, set them - * here asap. - */ -static void xen_pvh_set_cr_flags(int cpu) -{ - - /* Some of these are setup in 'secondary_startup_64'. The others: - * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests - * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ - write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); - - if (!cpu) - return; - /* - * For BSP, PSE PGE are set in probe_page_size_mask(), for APs - * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). - */ - if (boot_cpu_has(X86_FEATURE_PSE)) - cr4_set_bits_and_update_boot(X86_CR4_PSE); - - if (boot_cpu_has(X86_FEATURE_PGE)) - cr4_set_bits_and_update_boot(X86_CR4_PGE); -} - -/* - * Note, that it is ref - because the only caller of this after init - * is PVH which is not going to use xen_load_gdt_boot or other - * __init functions. - */ -void __ref xen_pvh_secondary_vcpu_init(int cpu) -{ - xen_setup_gdt(cpu); - xen_pvh_set_cr_flags(cpu); -} - -static void __init xen_pvh_early_guest_init(void) -{ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - return; - - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); - - xen_pvh_early_cpu_init(0, false); - xen_pvh_set_cr_flags(0); - -#ifdef CONFIG_X86_32 - BUG(); /* PVH: Implement proper support. 
*/ -#endif -} -#endif /* CONFIG_XEN_PVH */ - static void __init xen_dom0_set_legacy_features(void) { x86_platform.legacy.rtc = 1; @@ -1555,24 +1463,17 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; xen_setup_features(); -#ifdef CONFIG_XEN_PVH - xen_pvh_early_guest_init(); -#endif + xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - if (!xen_pvh_domain()) { - pv_cpu_ops = xen_cpu_ops; + pv_cpu_ops = xen_cpu_ops; - x86_platform.get_nmi_reason = xen_get_nmi_reason; - } + x86_platform.get_nmi_reason = xen_get_nmi_reason; - if (xen_feature(XENFEAT_auto_translated_physmap)) - x86_init.resources.memory_setup = xen_auto_xlated_memory_setup; - else - x86_init.resources.memory_setup = xen_memory_setup; + x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; @@ -1665,18 +1566,15 @@ asmlinkage __visible void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); - /* PVH: runs at default kernel iopl of 0 */ - if (!xen_pvh_domain()) { - /* - * We used to do this in xen_arch_setup, but that is too late - * on AMD were early_cpu_init (run before ->arch_setup()) calls - * early_amd_init which pokes 0xcf8 port. - */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); - } + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7d5afdb417cc..f6740b5b1738 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1792,10 +1792,6 @@ static void __init set_page_prot_flags(void *addr, pgprot_t prot, unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - /* For PVH no need to set R/O or R/W to pin them or unpin them. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } @@ -1902,8 +1898,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * level2_ident_pgt, and level2_kernel_pgt. This means that only the * kernel has a physical mapping to start with - but that's enough to * get __va working. We need to fill in the rest of the physical - * mapping once some sort of allocator has been set up. NOTE: for - * PVH, the page tables are native. + * mapping once some sort of allocator has been set up. 
*/ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -2812,16 +2807,6 @@ static int do_remap_gfn(struct vm_area_struct *vma, BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef CONFIG_XEN_PVH - /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, - prot, domid, pages); -#else - return -EINVAL; -#endif - } - rmd.mfn = gfn; rmd.prot = prot; /* We use the err_ptr to indicate if there we are doing a contiguous @@ -2915,10 +2900,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) return 0; -#ifdef CONFIG_XEN_PVH - return xen_xlate_unmap_gfn_range(vma, numpgs, pages); -#else return -EINVAL; -#endif } EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f3f7b41116f7..a8c306cf8868 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -914,39 +914,6 @@ char * __init xen_memory_setup(void) return "Xen"; } -/* - * Machine specific memory setup for auto-translated guests. - */ -char * __init xen_auto_xlated_memory_setup(void) -{ - struct xen_memory_map memmap; - int i; - int rc; - - memmap.nr_entries = ARRAY_SIZE(xen_e820_map); - set_xen_guest_handle(memmap.buffer, xen_e820_map); - - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); - if (rc < 0) - panic("No memory map (%d)\n", rc); - - xen_e820_map_entries = memmap.nr_entries; - - sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), - &xen_e820_map_entries); - - for (i = 0; i < xen_e820_map_entries; i++) - e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, - xen_e820_map[i].type); - - /* Remove p2m info, it is not needed. */ - xen_start_info->mfn_list = 0; - xen_start_info->first_p2m_pfn = 0; - xen_start_info->nr_p2m_frames = 0; - - return "Xen"; -} - /* * Set the bit indicating "nosegneg" library variants should be used. * We only need to bother in pure 32-bit mode; compat 32-bit processes @@ -1032,8 +999,8 @@ void __init xen_pvmmu_arch_setup(void) void __init xen_arch_setup(void) { xen_panic_handler_init(); - if (!xen_feature(XENFEAT_auto_translated_physmap)) - xen_pvmmu_arch_setup(); + + xen_pvmmu_arch_setup(); #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 311acad7dad2..0dee6f59ea82 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -99,18 +99,8 @@ static void cpu_bringup(void) local_irq_enable(); } -/* - * Note: cpu parameter is only relevant for PVH. The reason for passing it - * is we can't do smp_processor_id until the percpu segments are loaded, for - * which we need the cpu number! So we pass it in rdi as first parameter. - */ -asmlinkage __visible void cpu_bringup_and_idle(int cpu) +asmlinkage __visible void cpu_bringup_and_idle(void) { -#ifdef CONFIG_XEN_PVH - if (xen_feature(XENFEAT_auto_translated_physmap) && - xen_feature(XENFEAT_supervisor_mode_kernel)) - xen_pvh_secondary_vcpu_init(cpu); -#endif cpu_bringup(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } @@ -404,61 +394,47 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) gdt = get_cpu_gdt_table(cpu); #ifdef CONFIG_X86_32 - /* Note: PVH is not yet supported on x86_32. 
*/ ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #endif memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; - xen_copy_trap_info(ctxt->trap_ctxt); + xen_copy_trap_info(ctxt->trap_ctxt); - ctxt->ldt_ents = 0; + ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt & ~PAGE_MASK); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); - gdt_mfn = arbitrary_virt_to_mfn(gdt); - make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = gdt_mfn; - ctxt->gdt_ents = GDT_ENTRIES; + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.sp0; #ifdef CONFIG_X86_32 - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_cs = __KERNEL_CS; #else - ctxt->gs_base_kernel = per_cpu_offset(cpu); -#endif - ctxt->event_callback_eip = - (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_eip = - (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - } -#ifdef CONFIG_XEN_PVH - else { - /* - * The vcpu comes on kernel page tables which have the NX pte - * bit set. This means before DS/SS is touched, NX in - * EFER must be set. Hence the following assembly glue code. 
- */ - ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init; - ctxt->user_regs.rdi = cpu; - ctxt->user_regs.rsi = true; /* entry == true */ - } + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif + ctxt->event_callback_eip = + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index c5c16dc4f694..9beef333584a 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -21,12 +21,4 @@ static inline int xen_smp_intr_init(unsigned int cpu) static inline void xen_smp_intr_free(unsigned int cpu) {} #endif /* CONFIG_SMP */ -#ifdef CONFIG_XEN_PVH -extern void xen_pvh_early_cpu_init(int cpu, bool entry); -#else -static inline void xen_pvh_early_cpu_init(int cpu, bool entry) -{ -} -#endif - #endif diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7f8d8abf4c1a..37794e42b67d 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -16,25 +16,6 @@ #include #include -#ifdef CONFIG_XEN_PVH -#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" -/* Note the lack of 'hvm_callback_vector'. Older hypervisor will - * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in - * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. - */ -#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ - (1 << XENFEAT_auto_translated_physmap) | \ - (1 << XENFEAT_supervisor_mode_kernel) | \ - (1 << XENFEAT_hvm_callback_vector)) -/* The XENFEAT_writable_page_tables is not stricly necessary as we set that - * up regardless whether this CONFIG option is enabled or not, but it - * clarifies what the right flags need to be. - */ -#else -#define PVH_FEATURES_STR "" -#define PVH_FEATURES (0) -#endif - __INIT ENTRY(startup_xen) cld @@ -54,41 +35,6 @@ ENTRY(startup_xen) __FINIT -#ifdef CONFIG_XEN_PVH -/* - * xen_pvh_early_cpu_init() - early PVH VCPU initialization - * @cpu: this cpu number (%rdi) - * @entry: true if this is a secondary vcpu coming up on this entry - * point, false if this is the boot CPU being initialized for - * the first time (%rsi) - * - * Note: This is called as a function on the boot CPU, and is the entry point - * on the secondary CPU. - */ -ENTRY(xen_pvh_early_cpu_init) - mov %rsi, %r11 - - /* Gather features to see if NX implemented. 
*/ - mov $0x80000001, %eax - cpuid - mov %edx, %esi - - mov $MSR_EFER, %ecx - rdmsr - bts $_EFER_SCE, %eax - - bt $20, %esi - jnc 1f /* No NX, skip setting it */ - bts $_EFER_NX, %eax -1: wrmsr -#ifdef CONFIG_SMP - cmp $0, %r11b - jne cpu_bringup_and_idle -#endif - ret - -#endif /* CONFIG_XEN_PVH */ - .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) @@ -114,10 +60,10 @@ ENTRY(hypercall_page) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) - ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | - (1 << XENFEAT_writable_page_tables) | - (1 << XENFEAT_dom0)) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, + .ascii "!writable_page_tables|pae_pgdir_above_4gb") + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, + .long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index ac0a2b0f9e62..f6a41c41ebc7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -146,5 +146,4 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); -void xen_pvh_secondary_vcpu_init(int cpu); #endif /* XEN_OPS_H */ diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index fd8e872d2943..6a53577772c9 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1704,7 +1704,6 @@ void __init xen_init_IRQ(void) pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); eoi_gmfn.gmfn = virt_to_gfn(pirq_eoi_map); rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); - /* TODO: No PVH support for PIRQ EOI */ if (rc != 0) { free_page((unsigned long) pirq_eoi_map); pirq_eoi_map = NULL; diff --git a/include/xen/xen.h b/include/xen/xen.h index f0f0252cff9a..d0f96840f71f 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -29,17 +29,6 @@ extern enum xen_domain_type xen_domain_type; #define xen_initial_domain() (0) #endif /* CONFIG_XEN_DOM0 */ -#ifdef CONFIG_XEN_PVH -/* This functionality exists only for x86. The XEN_PVHVM support exists - * only in x86 world - hence on ARM it will be always disabled. - * N.B. ARM guests are neither PV nor HVM nor PVHVM. - * It's a bit like PVH but is different also (it's further towards the H - * end of the spectrum than even PVH). - */ -#include -#define xen_pvh_domain() (xen_pv_domain() && \ - xen_feature(XENFEAT_auto_translated_physmap)) -#else #define xen_pvh_domain() (0) -#endif + #endif /* _XEN_XEN_H */ -- cgit v1.2.3-59-g8ed1b From 8613d78ab09900010a5b32e78ff229f551d661d6 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 6 Feb 2017 10:56:15 -0500 Subject: xen/pvh: Initialize grant table for PVH guests Like PV guests, PVH does not have PCI devices and therefore cannot use MMIO space to store grants. Instead it balloons out memory and keeps grants there. 
Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross --- drivers/xen/grant-table.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index bb36b1e1dbcc..d6786b87e13b 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -1146,13 +1146,13 @@ EXPORT_SYMBOL_GPL(gnttab_init); static int __gnttab_init(void) { + if (!xen_domain()) + return -ENODEV; + /* Delay grant-table initialization in the PV on HVM case */ - if (xen_hvm_domain()) + if (xen_hvm_domain() && !xen_pvh_domain()) return 0; - if (!xen_pv_domain()) - return -ENODEV; - return gnttab_init(); } /* Starts after core_initcall so that xen_pvh_gnttab_setup can be called -- cgit v1.2.3-59-g8ed1b From 2a7197f02dddf1f9cee300bd12512375ed56524a Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 6 Feb 2017 10:58:05 -0500 Subject: xen/pvh: Enable CPU hotplug PVH guests don't (yet) receive ACPI hotplug interrupts and therefore need to monitor xenstore for CPU hotplug events. Signed-off-by: Boris Ostrovsky Reviewed-by: Juergen Gross --- drivers/xen/cpu_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 5676aefdf2bc..0bab60a37464 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -107,7 +107,7 @@ static int __init setup_vcpu_hotplug_event(void) .notifier_call = setup_cpu_watcher }; #ifdef CONFIG_X86 - if (!xen_pv_domain()) + if (!xen_pv_domain() && !xen_pvh_domain()) #else if (!xen_domain()) #endif -- cgit v1.2.3-59-g8ed1b From c0d197d55e8e8aeeea55f79bdf67e1c957bfa25d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 8 Feb 2017 03:33:36 -0800 Subject: xenbus: Neaten xenbus_va_dev_error This function's error path can be simplified, so do so. Remove the fail: label and the somewhat obfuscating, used-once "error_path" function. Signed-off-by: Joe Perches Signed-off-by: Boris Ostrovsky --- drivers/xen/xenbus/xenbus_client.c | 39 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 056da6ee1a35..915d77785193 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -259,53 +259,34 @@ int xenbus_frontend_closed(struct xenbus_device *dev) } EXPORT_SYMBOL_GPL(xenbus_frontend_closed); -/** - * Return the path to the error node for the given device, or NULL on failure. - * If the value returned is non-NULL, then it is the caller's to kfree.
- */ -static char *error_path(struct xenbus_device *dev) -{ - return kasprintf(GFP_KERNEL, "error/%s", dev->nodename); -} - - static void xenbus_va_dev_error(struct xenbus_device *dev, int err, const char *fmt, va_list ap) { unsigned int len; - char *printf_buffer = NULL; - char *path_buffer = NULL; + char *printf_buffer; + char *path_buffer; #define PRINTF_BUFFER_SIZE 4096 + printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL); - if (printf_buffer == NULL) - goto fail; + if (!printf_buffer) + return; len = sprintf(printf_buffer, "%i ", -err); - vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); + vsnprintf(printf_buffer + len, PRINTF_BUFFER_SIZE - len, fmt, ap); dev_err(&dev->dev, "%s\n", printf_buffer); - path_buffer = error_path(dev); - - if (path_buffer == NULL) { + path_buffer = kasprintf(GFP_KERNEL, "error/%s", dev->nodename); + if (!path_buffer || + xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer)) dev_err(&dev->dev, "failed to write error node for %s (%s)\n", - dev->nodename, printf_buffer); - goto fail; - } + dev->nodename, printf_buffer); - if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) { - dev_err(&dev->dev, "failed to write error node for %s (%s)\n", - dev->nodename, printf_buffer); - goto fail; - } - -fail: kfree(printf_buffer); kfree(path_buffer); } - /** * xenbus_dev_error * @dev: xenbus device -- cgit v1.2.3-59-g8ed1b From 332f791dc98d98116f4473b726f67c9321b0f31e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Feb 2017 14:39:56 +0100 Subject: xen: clean up xenbus internal headers The xenbus driver has an awful mixture of internally and globally visible headers: some of the internally used only stuff is defined in the global header include/xen/xenbus.h while some stuff defined in internal headers is used by other drivers, too. Clean this up by moving the externally used symbols to include/xen/xenbus.h and the symbols used internally only to a new header drivers/xen/xenbus/xenbus.h replacing xenbus_comms.h and xenbus_probe.h Signed-off-by: Juergen Gross Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/xen/xenbus/xenbus.h | 97 ++++++++++++++++++++++++++++++ drivers/xen/xenbus/xenbus_client.c | 2 +- drivers/xen/xenbus/xenbus_comms.c | 2 +- drivers/xen/xenbus/xenbus_comms.h | 51 ---------------- drivers/xen/xenbus/xenbus_dev_backend.c | 2 +- drivers/xen/xenbus/xenbus_dev_frontend.c | 4 +- drivers/xen/xenbus/xenbus_probe.c | 3 +- drivers/xen/xenbus/xenbus_probe.h | 88 --------------------------- drivers/xen/xenbus/xenbus_probe_backend.c | 3 +- drivers/xen/xenbus/xenbus_probe_frontend.c | 3 +- drivers/xen/xenbus/xenbus_xs.c | 3 +- drivers/xen/xenfs/super.c | 2 +- drivers/xen/xenfs/xenstored.c | 2 +- include/xen/xenbus.h | 12 ++-- 14 files changed, 113 insertions(+), 161 deletions(-) create mode 100644 drivers/xen/xenbus/xenbus.h delete mode 100644 drivers/xen/xenbus/xenbus_comms.h delete mode 100644 drivers/xen/xenbus/xenbus_probe.h (limited to 'drivers') diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h new file mode 100644 index 000000000000..a6b007dfdaa8 --- /dev/null +++ b/drivers/xen/xenbus/xenbus.h @@ -0,0 +1,97 @@ +/* + * Private include for xenbus communications. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 XenSource Ltd. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _XENBUS_XENBUS_H +#define _XENBUS_XENBUS_H + +#define XEN_BUS_ID_SIZE 20 + +struct xen_bus_type { + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); + int (*probe)(struct xen_bus_type *bus, const char *type, + const char *dir); + void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, + unsigned int len); + struct bus_type bus; +}; + +enum xenstore_init { + XS_UNKNOWN, + XS_PV, + XS_HVM, + XS_LOCAL, +}; + +extern enum xenstore_init xen_store_domain_type; +extern const struct attribute_group *xenbus_dev_groups[]; + +int xs_init(void); +int xb_init_comms(void); +void xb_deinit_comms(void); +int xb_write(const void *data, unsigned int len); +int xb_read(void *data, unsigned int len); +int xb_data_to_read(void); +int xb_wait_for_data_to_read(void); + +int xenbus_match(struct device *_dev, struct device_driver *_drv); +int xenbus_dev_probe(struct device *_dev); +int xenbus_dev_remove(struct device *_dev); +int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus, + struct module *owner, + const char *mod_name); +int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename); +int xenbus_probe_devices(struct xen_bus_type *bus); + +void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); + +void xenbus_dev_shutdown(struct device *_dev); + +int xenbus_dev_suspend(struct device *dev); +int xenbus_dev_resume(struct device *dev); +int xenbus_dev_cancel(struct device *dev); + +void xenbus_otherend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len, + int ignore_on_shutdown); + +int xenbus_read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + +void xenbus_ring_ops_init(void); + +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg); + +#endif diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 915d77785193..29f82338ab75 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -47,7 +47,7 @@ #include #include -#include "xenbus_probe.h" 
+#include "xenbus.h" #define XENBUS_PAGES(_grants) (DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE)) diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index ecdecce80a6c..c21ec02643e1 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -40,7 +40,7 @@ #include #include #include -#include "xenbus_comms.h" +#include "xenbus.h" static int xenbus_irq; diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h deleted file mode 100644 index 867a2e425208..000000000000 --- a/drivers/xen/xenbus/xenbus_comms.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Private include for xenbus communications. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef _XENBUS_COMMS_H -#define _XENBUS_COMMS_H - -#include - -int xs_init(void); -int xb_init_comms(void); -void xb_deinit_comms(void); - -/* Low level routines. */ -int xb_write(const void *data, unsigned len); -int xb_read(void *data, unsigned len); -int xb_data_to_read(void); -int xb_wait_for_data_to_read(void); -extern struct xenstore_domain_interface *xen_store_interface; -extern int xen_store_evtchn; -extern enum xenstore_init xen_store_domain_type; - -extern const struct file_operations xen_xenbus_fops; - -#endif /* _XENBUS_COMMS_H */ diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c index 4a41ac9af966..1126701e212e 100644 --- a/drivers/xen/xenbus/xenbus_dev_backend.c +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -16,7 +16,7 @@ #include #include -#include "xenbus_comms.h" +#include "xenbus.h" static int xenbus_backend_open(struct inode *inode, struct file *filp) { diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 79130b310247..e2bc9b301494 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -57,12 +57,12 @@ #include #include -#include "xenbus_comms.h" - #include #include #include +#include "xenbus.h" + /* * An element of a list of outstanding transactions, for which we're * still waiting a reply. 
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 4bdf654041e9..6baffbb6acf9 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -62,8 +62,7 @@ #include -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" int xen_store_evtchn; diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h deleted file mode 100644 index c9ec7ca1f7ab..000000000000 --- a/drivers/xen/xenbus/xenbus_probe.h +++ /dev/null @@ -1,88 +0,0 @@ -/****************************************************************************** - * xenbus_probe.h - * - * Talks to Xen Store to figure out what devices we have. - * - * Copyright (C) 2005 Rusty Russell, IBM Corporation - * Copyright (C) 2005 XenSource Ltd. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#ifndef _XENBUS_PROBE_H -#define _XENBUS_PROBE_H - -#define XEN_BUS_ID_SIZE 20 - -struct xen_bus_type { - char *root; - unsigned int levels; - int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); - int (*probe)(struct xen_bus_type *bus, const char *type, - const char *dir); - void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, - unsigned int len); - struct bus_type bus; -}; - -enum xenstore_init { - XS_UNKNOWN, - XS_PV, - XS_HVM, - XS_LOCAL, -}; - -extern const struct attribute_group *xenbus_dev_groups[]; - -extern int xenbus_match(struct device *_dev, struct device_driver *_drv); -extern int xenbus_dev_probe(struct device *_dev); -extern int xenbus_dev_remove(struct device *_dev); -extern int xenbus_register_driver_common(struct xenbus_driver *drv, - struct xen_bus_type *bus, - struct module *owner, - const char *mod_name); -extern int xenbus_probe_node(struct xen_bus_type *bus, - const char *type, - const char *nodename); -extern int xenbus_probe_devices(struct xen_bus_type *bus); - -extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); - -extern void xenbus_dev_shutdown(struct device *_dev); - -extern int xenbus_dev_suspend(struct device *dev); -extern int xenbus_dev_resume(struct device *dev); -extern int xenbus_dev_cancel(struct device *dev); - -extern void xenbus_otherend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len, - int ignore_on_shutdown); - -extern int xenbus_read_otherend_details(struct xenbus_device *xendev, - char *id_node, char *path_node); - -void xenbus_ring_ops_init(void); - -#endif diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 37929df829a3..f46b4dc72c76 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -53,8 +53,7 @@ #include #include -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" /* backend/// => -- */ static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index 6d40a972ffb2..d7b77a62e6e7 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -27,8 +27,7 @@ #include -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 6afb993c5809..4c49d8709765 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -48,8 +48,7 @@ #include #include #include -#include "xenbus_comms.h" -#include "xenbus_probe.h" +#include "xenbus.h" struct xs_stored_msg { struct list_head list; diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 8559a71f36b1..328c3987b112 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -16,10 +16,10 @@ #include #include +#include #include "xenfs.h" #include "../privcmd.h" -#include "../xenbus/xenbus_comms.h" #include diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c index fef20dbc6a5c..82fd2a396d96 100644 --- a/drivers/xen/xenfs/xenstored.c +++ b/drivers/xen/xenfs/xenstored.c @@ -4,9 +4,9 @@ #include #include +#include #include "xenfs.h" -#include "../xenbus/xenbus_comms.h" static ssize_t xsd_read(struct file *file, char __user *buf, size_t size, loff_t *off) diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 271ba62503c7..98f73a20725c 100644 --- 
a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -175,16 +176,9 @@ void xs_suspend(void); void xs_resume(void); void xs_suspend_cancel(void); -/* Used by xenbus_dev to borrow kernel's store connection. */ -void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg); - struct work_struct; -/* Prepare for domain suspend: then resume or cancel the suspend. */ -void xenbus_suspend(void); -void xenbus_resume(void); void xenbus_probe(struct work_struct *); -void xenbus_suspend_cancel(void); #define XENBUS_IS_ERR_READ(str) ({ \ if (!IS_ERR(str) && strlen(str) == 0) { \ @@ -235,4 +229,8 @@ const char *xenbus_strstate(enum xenbus_state state); int xenbus_dev_is_online(struct xenbus_device *dev); int xenbus_frontend_closed(struct xenbus_device *dev); +extern const struct file_operations xen_xenbus_fops; +extern struct xenstore_domain_interface *xen_store_interface; +extern int xen_store_evtchn; + #endif /* _XEN_XENBUS_H */ -- cgit v1.2.3-59-g8ed1b From 5584ea250ae44f929feb4c7bd3877d1c5edbf813 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Feb 2017 14:39:57 +0100 Subject: xen: modify xenstore watch event interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Today a Xenstore watch event is delivered via a callback function declared as: void (*callback)(struct xenbus_watch *, const char **vec, unsigned int len); As all watch events only ever come with two parameters (path and token) changing the prototype to: void (*callback)(struct xenbus_watch *, const char *path, const char *token); is the natural thing to do. Apply this change and adapt all users. Cc: konrad.wilk@oracle.com Cc: roger.pau@citrix.com Cc: wei.liu2@citrix.com Cc: paul.durrant@citrix.com Cc: netdev@vger.kernel.org Signed-off-by: Juergen Gross Reviewed-by: Paul Durrant Reviewed-by: Wei Liu Reviewed-by: Roger Pau Monné Reviewed-by: Boris Ostrovsky Signed-off-by: Boris Ostrovsky --- drivers/block/xen-blkback/xenbus.c | 6 +++--- drivers/net/xen-netback/xenbus.c | 8 ++++---- drivers/xen/cpu_hotplug.c | 5 ++--- drivers/xen/manage.c | 6 +++--- drivers/xen/xen-balloon.c | 2 +- drivers/xen/xen-pciback/xenbus.c | 2 +- drivers/xen/xenbus/xenbus.h | 6 +++--- drivers/xen/xenbus/xenbus_client.c | 4 ++-- drivers/xen/xenbus/xenbus_dev_frontend.c | 21 ++++++++------------- drivers/xen/xenbus/xenbus_probe.c | 11 ++++------- drivers/xen/xenbus/xenbus_probe_backend.c | 8 ++++---- drivers/xen/xenbus/xenbus_probe_frontend.c | 14 +++++++------- drivers/xen/xenbus/xenbus_xs.c | 29 ++++++++++++++--------------- include/xen/xenbus.h | 6 +++--- 14 files changed, 59 insertions(+), 69 deletions(-) (limited to 'drivers') diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 415e79b69d34..8fe61b5dc5a6 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -38,8 +38,8 @@ struct backend_info { static struct kmem_cache *xen_blkif_cachep; static void connect(struct backend_info *); static int connect_ring(struct backend_info *); -static void backend_changed(struct xenbus_watch *, const char **, - unsigned int); +static void backend_changed(struct xenbus_watch *, const char *, + const char *); static void xen_blkif_free(struct xen_blkif *blkif); static void xen_vbd_free(struct xen_vbd *vbd); @@ -661,7 +661,7 @@ fail: * ready, connect. 
*/ static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { int err; unsigned major; diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 85b742e1c42f..bb854f92f5a5 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -734,7 +734,7 @@ static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) } static void xen_net_rate_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { struct xenvif *vif = container_of(watch, struct xenvif, credit_watch); struct xenbus_device *dev = xenvif_to_xenbus_device(vif); @@ -791,7 +791,7 @@ static void xen_unregister_credit_watch(struct xenvif *vif) } static void xen_mcast_ctrl_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { struct xenvif *vif = container_of(watch, struct xenvif, mcast_ctrl_watch); @@ -866,8 +866,8 @@ static void unregister_hotplug_status_watch(struct backend_info *be) } static void hotplug_status_changed(struct xenbus_watch *watch, - const char **vec, - unsigned int vec_size) + const char *path, + const char *token) { struct backend_info *be = container_of(watch, struct backend_info, diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 0bab60a37464..0003912a8111 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -68,13 +68,12 @@ static void vcpu_hotplug(unsigned int cpu) } static void handle_vcpu_hotplug_event(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { unsigned int cpu; char *cpustr; - const char *node = vec[XS_WATCH_PATH]; - cpustr = strstr(node, "cpu/"); + cpustr = strstr(path, "cpu/"); if (cpustr != NULL) { sscanf(cpustr, "cpu/%u", &cpu); vcpu_hotplug(cpu); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 357a8db859c9..c1ec8ee80924 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -218,7 +218,7 @@ static struct shutdown_handler shutdown_handlers[] = { }; static void shutdown_handler(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { char *str; struct xenbus_transaction xbt; @@ -266,8 +266,8 @@ static void shutdown_handler(struct xenbus_watch *watch, } #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) +static void sysrq_handler(struct xenbus_watch *watch, const char *path, + const char *token) { char sysrq_key = '\0'; struct xenbus_transaction xbt; diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c index 79865b8901ba..e7715cb62eef 100644 --- a/drivers/xen/xen-balloon.c +++ b/drivers/xen/xen-balloon.c @@ -55,7 +55,7 @@ static int register_balloon(struct device *dev); /* React to a change in the target key */ static void watch_target(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { unsigned long long new_target; int err; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 3f0aee0a068b..3814b44bf1f7 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -652,7 +652,7 @@ out: } static void xen_pcibk_be_watch(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { struct xen_pcibk_device *pdev = container_of(watch, struct 
xen_pcibk_device, be_watch); diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index a6b007dfdaa8..51995276f549 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -40,8 +40,8 @@ struct xen_bus_type { int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir); - void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, - unsigned int len); + void (*otherend_changed)(struct xenbus_watch *watch, const char *path, + const char *token); struct bus_type bus; }; @@ -84,7 +84,7 @@ int xenbus_dev_resume(struct device *dev); int xenbus_dev_cancel(struct device *dev); void xenbus_otherend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len, + const char *path, const char *token, int ignore_on_shutdown); int xenbus_read_otherend_details(struct xenbus_device *xendev, diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 29f82338ab75..82a8866758ee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL_GPL(xenbus_strstate); int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, - const char **, unsigned int)) + const char *, const char *)) { int err; @@ -153,7 +153,7 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path); int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, - const char **, unsigned int), + const char *, const char *), const char *pathfmt, ...) { int err; diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index e2bc9b301494..e4b984777507 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -258,26 +258,23 @@ out_fail: } static void watch_fired(struct xenbus_watch *watch, - const char **vec, - unsigned int len) + const char *path, + const char *token) { struct watch_adapter *adap; struct xsd_sockmsg hdr; - const char *path, *token; - int path_len, tok_len, body_len, data_len = 0; + const char *token_caller; + int path_len, tok_len, body_len; int ret; LIST_HEAD(staging_q); adap = container_of(watch, struct watch_adapter, watch); - path = vec[XS_WATCH_PATH]; - token = adap->token; + token_caller = adap->token; path_len = strlen(path) + 1; - tok_len = strlen(token) + 1; - if (len > 2) - data_len = vec[len] - vec[2] + 1; - body_len = path_len + tok_len + data_len; + tok_len = strlen(token_caller) + 1; + body_len = path_len + tok_len; hdr.type = XS_WATCH_EVENT; hdr.len = body_len; @@ -288,9 +285,7 @@ static void watch_fired(struct xenbus_watch *watch, if (!ret) ret = queue_reply(&staging_q, path, path_len); if (!ret) - ret = queue_reply(&staging_q, token, tok_len); - if (!ret && len > 2) - ret = queue_reply(&staging_q, vec[2], data_len); + ret = queue_reply(&staging_q, token_caller, tok_len); if (!ret) { /* success: pass reply list onto watcher */ diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 6baffbb6acf9..74888cacd0b0 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -169,7 +169,7 @@ int xenbus_read_otherend_details(struct xenbus_device *xendev, EXPORT_SYMBOL_GPL(xenbus_read_otherend_details); void xenbus_otherend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len, + const char *path, const char 
*token, int ignore_on_shutdown) { struct xenbus_device *dev = @@ -180,18 +180,15 @@ void xenbus_otherend_changed(struct xenbus_watch *watch, /* Protect us against watches firing on old details when the otherend details change, say immediately after a resume. */ if (!dev->otherend || - strncmp(dev->otherend, vec[XS_WATCH_PATH], - strlen(dev->otherend))) { - dev_dbg(&dev->dev, "Ignoring watch at %s\n", - vec[XS_WATCH_PATH]); + strncmp(dev->otherend, path, strlen(dev->otherend))) { + dev_dbg(&dev->dev, "Ignoring watch at %s\n", path); return; } state = xenbus_read_driver_state(dev->otherend); dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n", - state, xenbus_strstate(state), dev->otherend_watch.node, - vec[XS_WATCH_PATH]); + state, xenbus_strstate(state), dev->otherend_watch.node, path); /* * Ignore xenbus transitions during shutdown. This prevents us doing diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index f46b4dc72c76..b0bed4faf44c 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -181,9 +181,9 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, } static void frontend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { - xenbus_otherend_changed(watch, vec, len, 0); + xenbus_otherend_changed(watch, path, token, 0); } static struct xen_bus_type xenbus_backend = { @@ -204,11 +204,11 @@ static struct xen_bus_type xenbus_backend = { }; static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { DPRINTK(""); - xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); + xenbus_dev_changed(path, &xenbus_backend); } static struct xenbus_watch be_watch = { diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index d7b77a62e6e7..19e45ce21f89 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -86,9 +86,9 @@ static int xenbus_uevent_frontend(struct device *_dev, static void backend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { - xenbus_otherend_changed(watch, vec, len, 1); + xenbus_otherend_changed(watch, path, token, 1); } static void xenbus_frontend_delayed_resume(struct work_struct *w) @@ -153,11 +153,11 @@ static struct xen_bus_type xenbus_frontend = { }; static void frontend_changed(struct xenbus_watch *watch, - const char **vec, unsigned int len) + const char *path, const char *token) { DPRINTK(""); - xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); + xenbus_dev_changed(path, &xenbus_frontend); } @@ -332,13 +332,13 @@ static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); static int backend_state; static void xenbus_reset_backend_state_changed(struct xenbus_watch *w, - const char **v, unsigned int l) + const char *path, const char *token) { - if (xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", + if (xenbus_scanf(XBT_NIL, path, "", "%i", &backend_state) != 1) backend_state = XenbusStateUnknown; printk(KERN_DEBUG "XENBUS: backend %s %s\n", - v[XS_WATCH_PATH], xenbus_strstate(backend_state)); + path, xenbus_strstate(backend_state)); wake_up(&backend_state_wq); } diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 4c49d8709765..ebc768f44abe 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ 
-64,8 +64,8 @@ struct xs_stored_msg { /* Queued watch events. */ struct { struct xenbus_watch *handle; - char **vec; - unsigned int vec_size; + const char *path; + const char *token; } watch; } u; }; @@ -765,7 +765,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) if (msg->u.watch.handle != watch) continue; list_del(&msg->list); - kfree(msg->u.watch.vec); + kfree(msg->u.watch.path); kfree(msg); } spin_unlock(&watch_events_lock); @@ -833,11 +833,10 @@ static int xenwatch_thread(void *unused) if (ent != &watch_events) { msg = list_entry(ent, struct xs_stored_msg, list); - msg->u.watch.handle->callback( - msg->u.watch.handle, - (const char **)msg->u.watch.vec, - msg->u.watch.vec_size); - kfree(msg->u.watch.vec); + msg->u.watch.handle->callback(msg->u.watch.handle, + msg->u.watch.path, + msg->u.watch.token); + kfree(msg->u.watch.path); kfree(msg); } @@ -903,24 +902,24 @@ static int process_msg(void) body[msg->hdr.len] = '\0'; if (msg->hdr.type == XS_WATCH_EVENT) { - msg->u.watch.vec = split(body, msg->hdr.len, - &msg->u.watch.vec_size); - if (IS_ERR(msg->u.watch.vec)) { - err = PTR_ERR(msg->u.watch.vec); + if (count_strings(body, msg->hdr.len) != 2) { + err = -EINVAL; kfree(msg); + kfree(body); goto out; } + msg->u.watch.path = (const char *)body; + msg->u.watch.token = (const char *)strchr(body, '\0') + 1; spin_lock(&watches_lock); - msg->u.watch.handle = find_watch( - msg->u.watch.vec[XS_WATCH_TOKEN]); + msg->u.watch.handle = find_watch(msg->u.watch.token); if (msg->u.watch.handle != NULL) { spin_lock(&watch_events_lock); list_add_tail(&msg->list, &watch_events); wake_up(&watch_events_waitq); spin_unlock(&watch_events_lock); } else { - kfree(msg->u.watch.vec); + kfree(body); kfree(msg); } spin_unlock(&watches_lock); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 98f73a20725c..869c816d5f8c 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -61,7 +61,7 @@ struct xenbus_watch /* Callback (executed in a process context with no locks held). */ void (*callback)(struct xenbus_watch *, - const char **vec, unsigned int len); + const char *path, const char *token); }; @@ -193,11 +193,11 @@ void xenbus_probe(struct work_struct *); int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, - const char **, unsigned int)); + const char *, const char *)); __printf(4, 5) int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, void (*callback)(struct xenbus_watch *, - const char **, unsigned int), + const char *, const char *), const char *pathfmt, ...); int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state); -- cgit v1.2.3-59-g8ed1b From fd8aa9095a95c02dcc35540a263267c29b8fda9d Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Thu, 9 Feb 2017 14:39:58 +0100 Subject: xen: optimize xenbus driver for multiple concurrent xenstore accesses Handling of multiple concurrent Xenstore accesses through xenbus driver either from the kernel or user land is rather lame today: xenbus is capable to have one access active only at one point of time. Rewrite xenbus to handle multiple requests concurrently by making use of the request id of the Xenstore protocol. This requires to: - Instead of blocking inside xb_read() when trying to read data from the xenstore ring buffer do so only in the main loop of xenbus_thread(). 
- Instead of doing writes to the xenstore ring buffer in the context of the caller, just queue the request and do the write in the dedicated xenbus thread.
- Instead of forwarding the request id specified by the caller of xenbus to xenstore, use a xenbus-internal unique request id. This allows multiple outstanding requests.
- Modify the locking scheme so that multiple requests can be active in parallel.
- Instead of waiting for the reply to a user's xenstore request after writing the request to the xenstore ring buffer, return directly to the caller and do the waiting in the read path.

Additionally, signal handling was optimized by avoiding waking up the xenbus thread or sending an event to Xenstore when the addressed entity is known to be running already.

As a result, communication with Xenstore is sped up by a factor of up to 5: depending on the request type (read or write) and the amount of data transferred, the gain was at least 20% (small reads) and went up to a factor of 5 for large writes.

Finally, some more rough edges of xenbus have been smoothed:
- Handling of memory shortage when reading from the xenstore ring buffer was not optimal: the driver busy-looped and issued a warning in each iteration.
- When xenstore runs not in dom0 but in a stub domain, we ended up with two xenbus threads, because the xenbus initialization in dom0 (which expects a local xenstored) is redone later when connecting to the xenstore domain. Up to now this was not a problem, as locking prevented the two threads from interfering with each other, but it was a waste of kernel resources.
- An out-of-memory situation while writing to or reading from the xenstore ring buffer will no longer lead to a possible loss of synchronization with xenstore.
- The user read and write paths are now interruptible by signals.
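A minimal user-space sketch of the request lifecycle described above (not part of the patch itself): a caller queues a request, the dedicated writer moves it to a reply list, and the reply handler completes it by request id. The struct and list names are simplified stand-ins for the kernel's xb_req_data, xb_write_list and xs_reply_list; all locking, ring-buffer handling and real waiting are omitted.

/* Hedged model of the queued -> wait_reply -> got_reply lifecycle of a
 * xenstore request; names are simplified stand-ins for the kernel's ones. */
#include <stdio.h>

enum req_state { REQ_QUEUED, REQ_WAIT_REPLY, REQ_GOT_REPLY, REQ_ABORTED };

struct request {
	unsigned int req_id;     /* unique id chosen by the "driver", not the caller */
	enum req_state state;
	char body[64];           /* reply payload, filled in by the reply handler */
	struct request *next;    /* simple singly linked queue */
};

static struct request *write_list;   /* stands in for xb_write_list */
static struct request *reply_list;   /* stands in for xs_reply_list */
static unsigned int next_req_id;

/* Caller side: queue a request and return immediately (no waiting here). */
static void submit(struct request *req)
{
	req->req_id = next_req_id++;
	req->state = REQ_QUEUED;
	req->next = write_list;
	write_list = req;
	printf("queued request %u\n", req->req_id);
}

/* Writer side: "write" every queued request and move it to the reply list,
 * where it waits for the matching response. */
static void process_writes(void)
{
	while (write_list) {
		struct request *req = write_list;

		write_list = req->next;
		req->state = REQ_WAIT_REPLY;
		req->next = reply_list;
		reply_list = req;
		printf("wrote request %u, now waiting for reply\n", req->req_id);
	}
}

/* Reader side: match an incoming reply to the request with the same id. */
static void deliver_reply(unsigned int req_id, const char *body)
{
	struct request *req;

	for (req = reply_list; req; req = req->next) {
		if (req->req_id == req_id && req->state == REQ_WAIT_REPLY) {
			snprintf(req->body, sizeof(req->body), "%s", body);
			req->state = REQ_GOT_REPLY;
			printf("request %u completed: %s\n", req_id, req->body);
			return;
		}
	}
	printf("no waiter for reply %u, dropping it\n", req_id);
}

int main(void)
{
	struct request a = { 0 }, b = { 0 };

	submit(&a);
	submit(&b);
	process_writes();
	deliver_reply(a.req_id, "OK");
	deliver_reply(b.req_id, "42");
	return 0;
}

The point of the driver-chosen request id is that replies arriving in any order can be matched to the right outstanding request, which is what allows several xenstore accesses to be in flight at once.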
Signed-off-by: Juergen Gross Signed-off-by: Boris Ostrovsky --- drivers/xen/xenbus/xenbus.h | 48 ++- drivers/xen/xenbus/xenbus_comms.c | 307 ++++++++++++++++-- drivers/xen/xenbus/xenbus_dev_frontend.c | 188 +++++++---- drivers/xen/xenbus/xenbus_xs.c | 520 ++++++++++++++----------------- 4 files changed, 672 insertions(+), 391 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index 51995276f549..149c5e7efc89 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -32,6 +32,10 @@ #ifndef _XENBUS_XENBUS_H #define _XENBUS_XENBUS_H +#include +#include +#include + #define XEN_BUS_ID_SIZE 20 struct xen_bus_type { @@ -52,16 +56,49 @@ enum xenstore_init { XS_LOCAL, }; +struct xs_watch_event { + struct list_head list; + unsigned int len; + struct xenbus_watch *handle; + const char *path; + const char *token; + char body[]; +}; + +enum xb_req_state { + xb_req_state_queued, + xb_req_state_wait_reply, + xb_req_state_got_reply, + xb_req_state_aborted +}; + +struct xb_req_data { + struct list_head list; + wait_queue_head_t wq; + struct xsd_sockmsg msg; + enum xsd_sockmsg_type type; + char *body; + const struct kvec *vec; + int num_vecs; + int err; + enum xb_req_state state; + void (*cb)(struct xb_req_data *); + void *par; +}; + extern enum xenstore_init xen_store_domain_type; extern const struct attribute_group *xenbus_dev_groups[]; +extern struct mutex xs_response_mutex; +extern struct list_head xs_reply_list; +extern struct list_head xb_write_list; +extern wait_queue_head_t xb_waitq; +extern struct mutex xb_write_mutex; int xs_init(void); int xb_init_comms(void); void xb_deinit_comms(void); -int xb_write(const void *data, unsigned int len); -int xb_read(void *data, unsigned int len); -int xb_data_to_read(void); -int xb_wait_for_data_to_read(void); +int xs_watch_msg(struct xs_watch_event *event); +void xs_request_exit(struct xb_req_data *req); int xenbus_match(struct device *_dev, struct device_driver *_drv); int xenbus_dev_probe(struct device *_dev); @@ -92,6 +129,7 @@ int xenbus_read_otherend_details(struct xenbus_device *xendev, void xenbus_ring_ops_init(void); -void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg); +int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par); +void xenbus_dev_queue_reply(struct xb_req_data *req); #endif diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index c21ec02643e1..856ada5d39c9 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -42,11 +43,22 @@ #include #include "xenbus.h" +/* A list of replies. Currently only one will ever be outstanding. */ +LIST_HEAD(xs_reply_list); + +/* A list of write requests. */ +LIST_HEAD(xb_write_list); +DECLARE_WAIT_QUEUE_HEAD(xb_waitq); +DEFINE_MUTEX(xb_write_mutex); + +/* Protect xenbus reader thread against save/restore. 
*/ +DEFINE_MUTEX(xs_response_mutex); + static int xenbus_irq; +static struct task_struct *xenbus_task; static DECLARE_WORK(probe_work, xenbus_probe); -static DECLARE_WAIT_QUEUE_HEAD(xb_waitq); static irqreturn_t wake_waiting(int irq, void *unused) { @@ -84,30 +96,31 @@ static const void *get_input_chunk(XENSTORE_RING_IDX cons, return buf + MASK_XENSTORE_IDX(cons); } +static int xb_data_to_write(void) +{ + struct xenstore_domain_interface *intf = xen_store_interface; + + return (intf->req_prod - intf->req_cons) != XENSTORE_RING_SIZE && + !list_empty(&xb_write_list); +} + /** * xb_write - low level write * @data: buffer to send * @len: length of buffer * - * Returns 0 on success, error otherwise. + * Returns number of bytes written or -err. */ -int xb_write(const void *data, unsigned len) +static int xb_write(const void *data, unsigned int len) { struct xenstore_domain_interface *intf = xen_store_interface; XENSTORE_RING_IDX cons, prod; - int rc; + unsigned int bytes = 0; while (len != 0) { void *dst; unsigned int avail; - rc = wait_event_interruptible( - xb_waitq, - (intf->req_prod - intf->req_cons) != - XENSTORE_RING_SIZE); - if (rc < 0) - return rc; - /* Read indexes, then verify. */ cons = intf->req_cons; prod = intf->req_prod; @@ -115,6 +128,11 @@ int xb_write(const void *data, unsigned len) intf->req_cons = intf->req_prod = 0; return -EIO; } + if (!xb_data_to_write()) + return bytes; + + /* Must write data /after/ reading the consumer index. */ + virt_mb(); dst = get_output_chunk(cons, prod, intf->req, &avail); if (avail == 0) @@ -122,52 +140,45 @@ int xb_write(const void *data, unsigned len) if (avail > len) avail = len; - /* Must write data /after/ reading the consumer index. */ - virt_mb(); - memcpy(dst, data, avail); data += avail; len -= avail; + bytes += avail; /* Other side must not see new producer until data is there. */ virt_wmb(); intf->req_prod += avail; /* Implies mb(): other side will see the updated producer. */ - notify_remote_via_evtchn(xen_store_evtchn); + if (prod <= intf->req_cons) + notify_remote_via_evtchn(xen_store_evtchn); } - return 0; + return bytes; } -int xb_data_to_read(void) +static int xb_data_to_read(void) { struct xenstore_domain_interface *intf = xen_store_interface; return (intf->rsp_cons != intf->rsp_prod); } -int xb_wait_for_data_to_read(void) -{ - return wait_event_interruptible(xb_waitq, xb_data_to_read()); -} - -int xb_read(void *data, unsigned len) +static int xb_read(void *data, unsigned int len) { struct xenstore_domain_interface *intf = xen_store_interface; XENSTORE_RING_IDX cons, prod; - int rc; + unsigned int bytes = 0; while (len != 0) { unsigned int avail; const char *src; - rc = xb_wait_for_data_to_read(); - if (rc < 0) - return rc; - /* Read indexes, then verify. */ cons = intf->rsp_cons; prod = intf->rsp_prod; + if (cons == prod) + return bytes; + if (!check_indexes(cons, prod)) { intf->rsp_cons = intf->rsp_prod = 0; return -EIO; @@ -185,17 +196,243 @@ int xb_read(void *data, unsigned len) memcpy(data, src, avail); data += avail; len -= avail; + bytes += avail; /* Other side must not see free space until we've copied out */ virt_mb(); intf->rsp_cons += avail; - pr_debug("Finished read of %i bytes (%i to go)\n", avail, len); - /* Implies mb(): other side will see the updated consumer. 
*/ - notify_remote_via_evtchn(xen_store_evtchn); + if (intf->rsp_prod - cons >= XENSTORE_RING_SIZE) + notify_remote_via_evtchn(xen_store_evtchn); + } + + return bytes; +} + +static int process_msg(void) +{ + static struct { + struct xsd_sockmsg msg; + char *body; + union { + void *alloc; + struct xs_watch_event *watch; + }; + bool in_msg; + bool in_hdr; + unsigned int read; + } state; + struct xb_req_data *req; + int err; + unsigned int len; + + if (!state.in_msg) { + state.in_msg = true; + state.in_hdr = true; + state.read = 0; + + /* + * We must disallow save/restore while reading a message. + * A partial read across s/r leaves us out of sync with + * xenstored. + * xs_response_mutex is locked as long as we are processing one + * message. state.in_msg will be true as long as we are holding + * the lock here. + */ + mutex_lock(&xs_response_mutex); + + if (!xb_data_to_read()) { + /* We raced with save/restore: pending data 'gone'. */ + mutex_unlock(&xs_response_mutex); + state.in_msg = false; + return 0; + } + } + + if (state.in_hdr) { + if (state.read != sizeof(state.msg)) { + err = xb_read((void *)&state.msg + state.read, + sizeof(state.msg) - state.read); + if (err < 0) + goto out; + state.read += err; + if (state.read != sizeof(state.msg)) + return 0; + if (state.msg.len > XENSTORE_PAYLOAD_MAX) { + err = -EINVAL; + goto out; + } + } + + len = state.msg.len + 1; + if (state.msg.type == XS_WATCH_EVENT) + len += sizeof(*state.watch); + + state.alloc = kmalloc(len, GFP_NOIO | __GFP_HIGH); + if (!state.alloc) + return -ENOMEM; + + if (state.msg.type == XS_WATCH_EVENT) + state.body = state.watch->body; + else + state.body = state.alloc; + state.in_hdr = false; + state.read = 0; + } + + err = xb_read(state.body + state.read, state.msg.len - state.read); + if (err < 0) + goto out; + + state.read += err; + if (state.read != state.msg.len) + return 0; + + state.body[state.msg.len] = '\0'; + + if (state.msg.type == XS_WATCH_EVENT) { + state.watch->len = state.msg.len; + err = xs_watch_msg(state.watch); + } else { + err = -ENOENT; + mutex_lock(&xb_write_mutex); + list_for_each_entry(req, &xs_reply_list, list) { + if (req->msg.req_id == state.msg.req_id) { + if (req->state == xb_req_state_wait_reply) { + req->msg.type = state.msg.type; + req->msg.len = state.msg.len; + req->body = state.body; + req->state = xb_req_state_got_reply; + list_del(&req->list); + req->cb(req); + } else { + list_del(&req->list); + kfree(req); + } + err = 0; + break; + } + } + mutex_unlock(&xb_write_mutex); + if (err) + goto out; } + mutex_unlock(&xs_response_mutex); + + state.in_msg = false; + state.alloc = NULL; + return err; + + out: + mutex_unlock(&xs_response_mutex); + state.in_msg = false; + kfree(state.alloc); + state.alloc = NULL; + return err; +} + +static int process_writes(void) +{ + static struct { + struct xb_req_data *req; + int idx; + unsigned int written; + } state; + void *base; + unsigned int len; + int err = 0; + + if (!xb_data_to_write()) + return 0; + + mutex_lock(&xb_write_mutex); + + if (!state.req) { + state.req = list_first_entry(&xb_write_list, + struct xb_req_data, list); + state.idx = -1; + state.written = 0; + } + + if (state.req->state == xb_req_state_aborted) + goto out_err; + + while (state.idx < state.req->num_vecs) { + if (state.idx < 0) { + base = &state.req->msg; + len = sizeof(state.req->msg); + } else { + base = state.req->vec[state.idx].iov_base; + len = state.req->vec[state.idx].iov_len; + } + err = xb_write(base + state.written, len - state.written); + if (err < 0) + goto out_err; + 
state.written += err; + if (state.written != len) + goto out; + + state.idx++; + state.written = 0; + } + + list_del(&state.req->list); + state.req->state = xb_req_state_wait_reply; + list_add_tail(&state.req->list, &xs_reply_list); + state.req = NULL; + + out: + mutex_unlock(&xb_write_mutex); + + return 0; + + out_err: + state.req->msg.type = XS_ERROR; + state.req->err = err; + list_del(&state.req->list); + if (state.req->state == xb_req_state_aborted) + kfree(state.req); + else { + state.req->state = xb_req_state_got_reply; + wake_up(&state.req->wq); + } + + mutex_unlock(&xb_write_mutex); + + state.req = NULL; + + return err; +} + +static int xb_thread_work(void) +{ + return xb_data_to_read() || xb_data_to_write(); +} + +static int xenbus_thread(void *unused) +{ + int err; + + while (!kthread_should_stop()) { + if (wait_event_interruptible(xb_waitq, xb_thread_work())) + continue; + + err = process_msg(); + if (err == -ENOMEM) + schedule(); + else if (err) + pr_warn_ratelimited("error %d while reading message\n", + err); + + err = process_writes(); + if (err) + pr_warn_ratelimited("error %d while writing message\n", + err); + } + + xenbus_task = NULL; return 0; } @@ -223,6 +460,7 @@ int xb_init_comms(void) rebind_evtchn_irq(xen_store_evtchn, xenbus_irq); } else { int err; + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, 0, "xenbus", &xb_waitq); if (err < 0) { @@ -231,6 +469,13 @@ int xb_init_comms(void) } xenbus_irq = err; + + if (!xenbus_task) { + xenbus_task = kthread_run(xenbus_thread, NULL, + "xenbus"); + if (IS_ERR(xenbus_task)) + return PTR_ERR(xenbus_task); + } } return 0; diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index e4b984777507..4d343eed08f5 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -113,6 +113,7 @@ struct xenbus_file_priv { struct list_head read_buffers; wait_queue_head_t read_waitq; + struct kref kref; }; /* Read out any raw xenbus messages queued up. */ @@ -297,6 +298,107 @@ static void watch_fired(struct xenbus_watch *watch, mutex_unlock(&adap->dev_data->reply_mutex); } +static void xenbus_file_free(struct kref *kref) +{ + struct xenbus_file_priv *u; + struct xenbus_transaction_holder *trans, *tmp; + struct watch_adapter *watch, *tmp_watch; + struct read_buffer *rb, *tmp_rb; + + u = container_of(kref, struct xenbus_file_priv, kref); + + /* + * No need for locking here because there are no other users, + * by definition. 
+ */ + + list_for_each_entry_safe(trans, tmp, &u->transactions, list) { + xenbus_transaction_end(trans->handle, 1); + list_del(&trans->list); + kfree(trans); + } + + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + } + + list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) { + list_del(&rb->list); + kfree(rb); + } + kfree(u); +} + +static struct xenbus_transaction_holder *xenbus_get_transaction( + struct xenbus_file_priv *u, uint32_t tx_id) +{ + struct xenbus_transaction_holder *trans; + + list_for_each_entry(trans, &u->transactions, list) + if (trans->handle.id == tx_id) + return trans; + + return NULL; +} + +void xenbus_dev_queue_reply(struct xb_req_data *req) +{ + struct xenbus_file_priv *u = req->par; + struct xenbus_transaction_holder *trans = NULL; + int rc; + LIST_HEAD(staging_q); + + xs_request_exit(req); + + mutex_lock(&u->msgbuffer_mutex); + + if (req->type == XS_TRANSACTION_START) { + trans = xenbus_get_transaction(u, 0); + if (WARN_ON(!trans)) + goto out; + if (req->msg.type == XS_ERROR) { + list_del(&trans->list); + kfree(trans); + } else { + rc = kstrtou32(req->body, 10, &trans->handle.id); + if (WARN_ON(rc)) + goto out; + } + } else if (req->msg.type == XS_TRANSACTION_END) { + trans = xenbus_get_transaction(u, req->msg.tx_id); + if (WARN_ON(!trans)) + goto out; + list_del(&trans->list); + kfree(trans); + } + + mutex_unlock(&u->msgbuffer_mutex); + + mutex_lock(&u->reply_mutex); + rc = queue_reply(&staging_q, &req->msg, sizeof(req->msg)); + if (!rc) + rc = queue_reply(&staging_q, req->body, req->msg.len); + if (!rc) { + list_splice_tail(&staging_q, &u->read_buffers); + wake_up(&u->read_waitq); + } else { + queue_cleanup(&staging_q); + } + mutex_unlock(&u->reply_mutex); + + kfree(req->body); + kfree(req); + + kref_put(&u->kref, xenbus_file_free); + + return; + + out: + mutex_unlock(&u->msgbuffer_mutex); +} + static int xenbus_command_reply(struct xenbus_file_priv *u, unsigned int msg_type, const char *reply) { @@ -317,6 +419,9 @@ static int xenbus_command_reply(struct xenbus_file_priv *u, wake_up(&u->read_waitq); mutex_unlock(&u->reply_mutex); + if (!rc) + kref_put(&u->kref, xenbus_file_free); + return rc; } @@ -324,57 +429,22 @@ static int xenbus_write_transaction(unsigned msg_type, struct xenbus_file_priv *u) { int rc; - void *reply; struct xenbus_transaction_holder *trans = NULL; - LIST_HEAD(staging_q); if (msg_type == XS_TRANSACTION_START) { - trans = kmalloc(sizeof(*trans), GFP_KERNEL); + trans = kzalloc(sizeof(*trans), GFP_KERNEL); if (!trans) { rc = -ENOMEM; goto out; } - } else if (u->u.msg.tx_id != 0) { - list_for_each_entry(trans, &u->transactions, list) - if (trans->handle.id == u->u.msg.tx_id) - break; - if (&trans->list == &u->transactions) - return xenbus_command_reply(u, XS_ERROR, "ENOENT"); - } - - reply = xenbus_dev_request_and_reply(&u->u.msg); - if (IS_ERR(reply)) { - if (msg_type == XS_TRANSACTION_START) - kfree(trans); - rc = PTR_ERR(reply); - goto out; - } + list_add(&trans->list, &u->transactions); + } else if (u->u.msg.tx_id != 0 && + !xenbus_get_transaction(u, u->u.msg.tx_id)) + return xenbus_command_reply(u, XS_ERROR, "ENOENT"); - if (msg_type == XS_TRANSACTION_START) { - if (u->u.msg.type == XS_ERROR) - kfree(trans); - else { - trans->handle.id = simple_strtoul(reply, NULL, 0); - list_add(&trans->list, &u->transactions); - } - } else if (u->u.msg.type == XS_TRANSACTION_END) { - list_del(&trans->list); + rc = 
xenbus_dev_request_and_reply(&u->u.msg, u); + if (rc) kfree(trans); - } - - mutex_lock(&u->reply_mutex); - rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg)); - if (!rc) - rc = queue_reply(&staging_q, reply, u->u.msg.len); - if (!rc) { - list_splice_tail(&staging_q, &u->read_buffers); - wake_up(&u->read_waitq); - } else { - queue_cleanup(&staging_q); - } - mutex_unlock(&u->reply_mutex); - - kfree(reply); out: return rc; @@ -506,6 +576,8 @@ static ssize_t xenbus_file_write(struct file *filp, * OK, now we have a complete message. Do something with it. */ + kref_get(&u->kref); + msg_type = u->u.msg.type; switch (msg_type) { @@ -520,8 +592,10 @@ static ssize_t xenbus_file_write(struct file *filp, ret = xenbus_write_transaction(msg_type, u); break; } - if (ret != 0) + if (ret != 0) { rc = ret; + kref_put(&u->kref, xenbus_file_free); + } /* Buffered message consumed */ u->len = 0; @@ -546,6 +620,8 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) if (u == NULL) return -ENOMEM; + kref_init(&u->kref); + INIT_LIST_HEAD(&u->transactions); INIT_LIST_HEAD(&u->watches); INIT_LIST_HEAD(&u->read_buffers); @@ -562,32 +638,8 @@ static int xenbus_file_open(struct inode *inode, struct file *filp) static int xenbus_file_release(struct inode *inode, struct file *filp) { struct xenbus_file_priv *u = filp->private_data; - struct xenbus_transaction_holder *trans, *tmp; - struct watch_adapter *watch, *tmp_watch; - struct read_buffer *rb, *tmp_rb; - - /* - * No need for locking here because there are no other users, - * by definition. - */ - list_for_each_entry_safe(trans, tmp, &u->transactions, list) { - xenbus_transaction_end(trans->handle, 1); - list_del(&trans->list); - kfree(trans); - } - - list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { - unregister_xenbus_watch(&watch->watch); - list_del(&watch->list); - free_watch_adapter(watch); - } - - list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) { - list_del(&rb->list); - kfree(rb); - } - kfree(u); + kref_put(&u->kref, xenbus_file_free); return 0; } diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index ebc768f44abe..e46080214955 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -50,61 +51,28 @@ #include #include "xenbus.h" -struct xs_stored_msg { - struct list_head list; - - struct xsd_sockmsg hdr; - - union { - /* Queued replies. */ - struct { - char *body; - } reply; - - /* Queued watch events. */ - struct { - struct xenbus_watch *handle; - const char *path; - const char *token; - } watch; - } u; -}; +/* + * Framework to protect suspend/resume handling against normal Xenstore + * message handling: + * During suspend/resume there must be no open transaction and no pending + * Xenstore request. + * New watch events happening in this time can be ignored by firing all watches + * after resume. + */ -struct xs_handle { - /* A list of replies. Currently only one will ever be outstanding. */ - struct list_head reply_list; - spinlock_t reply_lock; - wait_queue_head_t reply_waitq; - - /* - * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. - * response_mutex is never taken simultaneously with the other three. - * - * transaction_mutex must be held before incrementing - * transaction_count. The mutex is held when a suspend is in - * progress to prevent new transactions starting. 
- * - * When decrementing transaction_count to zero the wait queue - * should be woken up, the suspend code waits for count to - * reach zero. - */ - - /* One request at a time. */ - struct mutex request_mutex; - - /* Protect xenbus reader thread against save/restore. */ - struct mutex response_mutex; - - /* Protect transactions against save/restore. */ - struct mutex transaction_mutex; - atomic_t transaction_count; - wait_queue_head_t transaction_wq; - - /* Protect watch (de)register against save/restore. */ - struct rw_semaphore watch_mutex; -}; +/* Lock protecting enter/exit critical region. */ +static DEFINE_SPINLOCK(xs_state_lock); +/* Number of users in critical region (protected by xs_state_lock). */ +static unsigned int xs_state_users; +/* Suspend handler waiting or already active (protected by xs_state_lock)? */ +static int xs_suspend_active; +/* Unique Xenstore request id (protected by xs_state_lock). */ +static uint32_t xs_request_id; -static struct xs_handle xs_state; +/* Wait queue for all callers waiting for critical region to become usable. */ +static DECLARE_WAIT_QUEUE_HEAD(xs_state_enter_wq); +/* Wait queue for suspend handling waiting for critical region being empty. */ +static DECLARE_WAIT_QUEUE_HEAD(xs_state_exit_wq); /* List of registered watches, and a lock to protect it. */ static LIST_HEAD(watches); @@ -114,6 +82,9 @@ static DEFINE_SPINLOCK(watches_lock); static LIST_HEAD(watch_events); static DEFINE_SPINLOCK(watch_events_lock); +/* Protect watch (de)register against save/restore. */ +static DECLARE_RWSEM(xs_watch_rwsem); + /* * Details of the xenwatch callback kernel thread. The thread waits on the * watch_events_waitq for work to do (queued on watch_events list). When it @@ -124,6 +95,59 @@ static pid_t xenwatch_pid; static DEFINE_MUTEX(xenwatch_mutex); static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq); +static void xs_suspend_enter(void) +{ + spin_lock(&xs_state_lock); + xs_suspend_active++; + spin_unlock(&xs_state_lock); + wait_event(xs_state_exit_wq, xs_state_users == 0); +} + +static void xs_suspend_exit(void) +{ + spin_lock(&xs_state_lock); + xs_suspend_active--; + spin_unlock(&xs_state_lock); + wake_up_all(&xs_state_enter_wq); +} + +static uint32_t xs_request_enter(struct xb_req_data *req) +{ + uint32_t rq_id; + + req->type = req->msg.type; + + spin_lock(&xs_state_lock); + + while (!xs_state_users && xs_suspend_active) { + spin_unlock(&xs_state_lock); + wait_event(xs_state_enter_wq, xs_suspend_active == 0); + spin_lock(&xs_state_lock); + } + + if (req->type == XS_TRANSACTION_START) + xs_state_users++; + xs_state_users++; + rq_id = xs_request_id++; + + spin_unlock(&xs_state_lock); + + return rq_id; +} + +void xs_request_exit(struct xb_req_data *req) +{ + spin_lock(&xs_state_lock); + xs_state_users--; + if ((req->type == XS_TRANSACTION_START && req->msg.type == XS_ERROR) || + req->type == XS_TRANSACTION_END) + xs_state_users--; + spin_unlock(&xs_state_lock); + + if (xs_suspend_active && !xs_state_users) + wake_up(&xs_state_exit_wq); +} + static int get_error(const char *errorstring) { unsigned int i; @@ -161,21 +185,24 @@ static bool xenbus_ok(void) } return false; } -static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) + +static bool test_reply(struct xb_req_data *req) { - struct xs_stored_msg *msg; - char *body; + if (req->state == xb_req_state_got_reply || !xenbus_ok()) + return true; - spin_lock(&xs_state.reply_lock); + /* Make sure to reread req->state each time. 
*/ + barrier(); - while (list_empty(&xs_state.reply_list)) { - spin_unlock(&xs_state.reply_lock); - if (xenbus_ok()) - /* XXX FIXME: Avoid synchronous wait for response here. */ - wait_event_timeout(xs_state.reply_waitq, - !list_empty(&xs_state.reply_list), - msecs_to_jiffies(500)); - else { + return false; +} + +static void *read_reply(struct xb_req_data *req) +{ + while (req->state != xb_req_state_got_reply) { + wait_event(req->wq, test_reply(req)); + + if (!xenbus_ok()) /* * If we are in the process of being shut-down there is * no point of trying to contact XenBus - it is either @@ -183,76 +210,82 @@ static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) * has been killed or is unreachable. */ return ERR_PTR(-EIO); - } - spin_lock(&xs_state.reply_lock); + if (req->err) + return ERR_PTR(req->err); + } - msg = list_entry(xs_state.reply_list.next, - struct xs_stored_msg, list); - list_del(&msg->list); + return req->body; +} - spin_unlock(&xs_state.reply_lock); +static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg) +{ + bool notify; - *type = msg->hdr.type; - if (len) - *len = msg->hdr.len; - body = msg->u.reply.body; + req->msg = *msg; + req->err = 0; + req->state = xb_req_state_queued; + init_waitqueue_head(&req->wq); - kfree(msg); + req->msg.req_id = xs_request_enter(req); - return body; -} + mutex_lock(&xb_write_mutex); + list_add_tail(&req->list, &xb_write_list); + notify = list_is_singular(&xb_write_list); + mutex_unlock(&xb_write_mutex); -static void transaction_start(void) -{ - mutex_lock(&xs_state.transaction_mutex); - atomic_inc(&xs_state.transaction_count); - mutex_unlock(&xs_state.transaction_mutex); + if (notify) + wake_up(&xb_waitq); } -static void transaction_end(void) +static void *xs_wait_for_reply(struct xb_req_data *req, struct xsd_sockmsg *msg) { - if (atomic_dec_and_test(&xs_state.transaction_count)) - wake_up(&xs_state.transaction_wq); -} + void *ret; -static void transaction_suspend(void) -{ - mutex_lock(&xs_state.transaction_mutex); - wait_event(xs_state.transaction_wq, - atomic_read(&xs_state.transaction_count) == 0); + ret = read_reply(req); + + xs_request_exit(req); + + msg->type = req->msg.type; + msg->len = req->msg.len; + + mutex_lock(&xb_write_mutex); + if (req->state == xb_req_state_queued || + req->state == xb_req_state_wait_reply) + req->state = xb_req_state_aborted; + else + kfree(req); + mutex_unlock(&xb_write_mutex); + + return ret; } -static void transaction_resume(void) +static void xs_wake_up(struct xb_req_data *req) { - mutex_unlock(&xs_state.transaction_mutex); + wake_up(&req->wq); } -void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) +int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par) { - void *ret; - enum xsd_sockmsg_type type = msg->type; - int err; + struct xb_req_data *req; + struct kvec *vec; - if (type == XS_TRANSACTION_START) - transaction_start(); - - mutex_lock(&xs_state.request_mutex); + req = kmalloc(sizeof(*req) + sizeof(*vec), GFP_KERNEL); + if (!req) + return -ENOMEM; - err = xb_write(msg, sizeof(*msg) + msg->len); - if (err) { - msg->type = XS_ERROR; - ret = ERR_PTR(err); - } else - ret = read_reply(&msg->type, &msg->len); + vec = (struct kvec *)(req + 1); + vec->iov_len = msg->len; + vec->iov_base = msg + 1; - mutex_unlock(&xs_state.request_mutex); + req->vec = vec; + req->num_vecs = 1; + req->cb = xenbus_dev_queue_reply; + req->par = par; - if ((msg->type == XS_TRANSACTION_END) || - ((type == XS_TRANSACTION_START) && (msg->type == XS_ERROR))) - transaction_end(); + 
xs_send(req, msg); - return ret; + return 0; } EXPORT_SYMBOL(xenbus_dev_request_and_reply); @@ -263,37 +296,31 @@ static void *xs_talkv(struct xenbus_transaction t, unsigned int num_vecs, unsigned int *len) { + struct xb_req_data *req; struct xsd_sockmsg msg; void *ret = NULL; unsigned int i; int err; + req = kmalloc(sizeof(*req), GFP_NOIO | __GFP_HIGH); + if (!req) + return ERR_PTR(-ENOMEM); + + req->vec = iovec; + req->num_vecs = num_vecs; + req->cb = xs_wake_up; + msg.tx_id = t.id; - msg.req_id = 0; msg.type = type; msg.len = 0; for (i = 0; i < num_vecs; i++) msg.len += iovec[i].iov_len; - mutex_lock(&xs_state.request_mutex); - - err = xb_write(&msg, sizeof(msg)); - if (err) { - mutex_unlock(&xs_state.request_mutex); - return ERR_PTR(err); - } - - for (i = 0; i < num_vecs; i++) { - err = xb_write(iovec[i].iov_base, iovec[i].iov_len); - if (err) { - mutex_unlock(&xs_state.request_mutex); - return ERR_PTR(err); - } - } - - ret = read_reply(&msg.type, len); + xs_send(req, &msg); - mutex_unlock(&xs_state.request_mutex); + ret = xs_wait_for_reply(req, &msg); + if (len) + *len = msg.len; if (IS_ERR(ret)) return ret; @@ -500,13 +527,9 @@ int xenbus_transaction_start(struct xenbus_transaction *t) { char *id_str; - transaction_start(); - id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); - if (IS_ERR(id_str)) { - transaction_end(); + if (IS_ERR(id_str)) return PTR_ERR(id_str); - } t->id = simple_strtoul(id_str, NULL, 0); kfree(id_str); @@ -520,18 +543,13 @@ EXPORT_SYMBOL_GPL(xenbus_transaction_start); int xenbus_transaction_end(struct xenbus_transaction t, int abort) { char abortstr[2]; - int err; if (abort) strcpy(abortstr, "F"); else strcpy(abortstr, "T"); - err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); - - transaction_end(); - - return err; + return xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); } EXPORT_SYMBOL_GPL(xenbus_transaction_end); @@ -664,6 +682,30 @@ static struct xenbus_watch *find_watch(const char *token) return NULL; } + +int xs_watch_msg(struct xs_watch_event *event) +{ + if (count_strings(event->body, event->len) != 2) { + kfree(event); + return -EINVAL; + } + event->path = (const char *)event->body; + event->token = (const char *)strchr(event->body, '\0') + 1; + + spin_lock(&watches_lock); + event->handle = find_watch(event->token); + if (event->handle != NULL) { + spin_lock(&watch_events_lock); + list_add_tail(&event->list, &watch_events); + wake_up(&watch_events_waitq); + spin_unlock(&watch_events_lock); + } else + kfree(event); + spin_unlock(&watches_lock); + + return 0; +} + /* * Certain older XenBus toolstack cannot handle reading values that are * not populated. 
Some Xen 3.4 installation are incapable of doing this @@ -712,7 +754,7 @@ int register_xenbus_watch(struct xenbus_watch *watch) sprintf(token, "%lX", (long)watch); - down_read(&xs_state.watch_mutex); + down_read(&xs_watch_rwsem); spin_lock(&watches_lock); BUG_ON(find_watch(token)); @@ -727,7 +769,7 @@ int register_xenbus_watch(struct xenbus_watch *watch) spin_unlock(&watches_lock); } - up_read(&xs_state.watch_mutex); + up_read(&xs_watch_rwsem); return err; } @@ -735,13 +777,13 @@ EXPORT_SYMBOL_GPL(register_xenbus_watch); void unregister_xenbus_watch(struct xenbus_watch *watch) { - struct xs_stored_msg *msg, *tmp; + struct xs_watch_event *event, *tmp; char token[sizeof(watch) * 2 + 1]; int err; sprintf(token, "%lX", (long)watch); - down_read(&xs_state.watch_mutex); + down_read(&xs_watch_rwsem); spin_lock(&watches_lock); BUG_ON(!find_watch(token)); @@ -752,7 +794,7 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) if (err) pr_warn("Failed to release watch %s: %i\n", watch->node, err); - up_read(&xs_state.watch_mutex); + up_read(&xs_watch_rwsem); /* Make sure there are no callbacks running currently (unless its us) */ @@ -761,12 +803,11 @@ void unregister_xenbus_watch(struct xenbus_watch *watch) /* Cancel pending watch events. */ spin_lock(&watch_events_lock); - list_for_each_entry_safe(msg, tmp, &watch_events, list) { - if (msg->u.watch.handle != watch) + list_for_each_entry_safe(event, tmp, &watch_events, list) { + if (event->handle != watch) continue; - list_del(&msg->list); - kfree(msg->u.watch.path); - kfree(msg); + list_del(&event->list); + kfree(event); } spin_unlock(&watch_events_lock); @@ -777,10 +818,10 @@ EXPORT_SYMBOL_GPL(unregister_xenbus_watch); void xs_suspend(void) { - transaction_suspend(); - down_write(&xs_state.watch_mutex); - mutex_lock(&xs_state.request_mutex); - mutex_lock(&xs_state.response_mutex); + xs_suspend_enter(); + + down_write(&xs_watch_rwsem); + mutex_lock(&xs_response_mutex); } void xs_resume(void) @@ -790,31 +831,31 @@ void xs_resume(void) xb_init_comms(); - mutex_unlock(&xs_state.response_mutex); - mutex_unlock(&xs_state.request_mutex); - transaction_resume(); + mutex_unlock(&xs_response_mutex); + + xs_suspend_exit(); - /* No need for watches_lock: the watch_mutex is sufficient. */ + /* No need for watches_lock: the xs_watch_rwsem is sufficient. 
*/ list_for_each_entry(watch, &watches, list) { sprintf(token, "%lX", (long)watch); xs_watch(watch->node, token); } - up_write(&xs_state.watch_mutex); + up_write(&xs_watch_rwsem); } void xs_suspend_cancel(void) { - mutex_unlock(&xs_state.response_mutex); - mutex_unlock(&xs_state.request_mutex); - up_write(&xs_state.watch_mutex); - mutex_unlock(&xs_state.transaction_mutex); + mutex_unlock(&xs_response_mutex); + up_write(&xs_watch_rwsem); + + xs_suspend_exit(); } static int xenwatch_thread(void *unused) { struct list_head *ent; - struct xs_stored_msg *msg; + struct xs_watch_event *event; for (;;) { wait_event_interruptible(watch_events_waitq, @@ -832,12 +873,10 @@ static int xenwatch_thread(void *unused) spin_unlock(&watch_events_lock); if (ent != &watch_events) { - msg = list_entry(ent, struct xs_stored_msg, list); - msg->u.watch.handle->callback(msg->u.watch.handle, - msg->u.watch.path, - msg->u.watch.token); - kfree(msg->u.watch.path); - kfree(msg); + event = list_entry(ent, struct xs_watch_event, list); + event->handle->callback(event->handle, event->path, + event->token); + kfree(event); } mutex_unlock(&xenwatch_mutex); @@ -846,126 +885,37 @@ static int xenwatch_thread(void *unused) return 0; } -static int process_msg(void) +/* + * Wake up all threads waiting for a xenstore reply. In case of shutdown all + * pending replies will be marked as "aborted" in order to let the waiters + * return in spite of xenstore possibly no longer being able to reply. This + * will avoid blocking shutdown by a thread waiting for xenstore but being + * necessary for shutdown processing to proceed. + */ +static int xs_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) { - struct xs_stored_msg *msg; - char *body; - int err; - - /* - * We must disallow save/restore while reading a xenstore message. - * A partial read across s/r leaves us out of sync with xenstored. - */ - for (;;) { - err = xb_wait_for_data_to_read(); - if (err) - return err; - mutex_lock(&xs_state.response_mutex); - if (xb_data_to_read()) - break; - /* We raced with save/restore: pending data 'disappeared'. 
*/ - mutex_unlock(&xs_state.response_mutex); - } + struct xb_req_data *req; - - msg = kmalloc(sizeof(*msg), GFP_NOIO | __GFP_HIGH); - if (msg == NULL) { - err = -ENOMEM; - goto out; - } - - err = xb_read(&msg->hdr, sizeof(msg->hdr)); - if (err) { - kfree(msg); - goto out; - } - - if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) { - kfree(msg); - err = -EINVAL; - goto out; - } - - body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH); - if (body == NULL) { - kfree(msg); - err = -ENOMEM; - goto out; - } - - err = xb_read(body, msg->hdr.len); - if (err) { - kfree(body); - kfree(msg); - goto out; - } - body[msg->hdr.len] = '\0'; - - if (msg->hdr.type == XS_WATCH_EVENT) { - if (count_strings(body, msg->hdr.len) != 2) { - err = -EINVAL; - kfree(msg); - kfree(body); - goto out; - } - msg->u.watch.path = (const char *)body; - msg->u.watch.token = (const char *)strchr(body, '\0') + 1; - - spin_lock(&watches_lock); - msg->u.watch.handle = find_watch(msg->u.watch.token); - if (msg->u.watch.handle != NULL) { - spin_lock(&watch_events_lock); - list_add_tail(&msg->list, &watch_events); - wake_up(&watch_events_waitq); - spin_unlock(&watch_events_lock); - } else { - kfree(body); - kfree(msg); - } - spin_unlock(&watches_lock); - } else { - msg->u.reply.body = body; - spin_lock(&xs_state.reply_lock); - list_add_tail(&msg->list, &xs_state.reply_list); - spin_unlock(&xs_state.reply_lock); - wake_up(&xs_state.reply_waitq); - } - - out: - mutex_unlock(&xs_state.response_mutex); - return err; + mutex_lock(&xb_write_mutex); + list_for_each_entry(req, &xs_reply_list, list) + wake_up(&req->wq); + list_for_each_entry(req, &xb_write_list, list) + wake_up(&req->wq); + mutex_unlock(&xb_write_mutex); + return NOTIFY_DONE; } -static int xenbus_thread(void *unused) -{ - int err; - - for (;;) { - err = process_msg(); - if (err) - pr_warn("error %d while reading message\n", err); - if (kthread_should_stop()) - break; - } - - return 0; -} +static struct notifier_block xs_reboot_nb = { + .notifier_call = xs_reboot_notify, +}; int xs_init(void) { int err; struct task_struct *task; - INIT_LIST_HEAD(&xs_state.reply_list); - spin_lock_init(&xs_state.reply_lock); - init_waitqueue_head(&xs_state.reply_waitq); - - mutex_init(&xs_state.request_mutex); - mutex_init(&xs_state.response_mutex); - mutex_init(&xs_state.transaction_mutex); - init_rwsem(&xs_state.watch_mutex); - atomic_set(&xs_state.transaction_count, 0); - init_waitqueue_head(&xs_state.transaction_wq); + register_reboot_notifier(&xs_reboot_nb); /* Initialize the shared memory rings to talk to xenstored */ err = xb_init_comms(); @@ -977,10 +927,6 @@ int xs_init(void) return PTR_ERR(task); xenwatch_pid = task->pid; - task = kthread_run(xenbus_thread, NULL, "xenbus"); - if (IS_ERR(task)) - return PTR_ERR(task); - /* shutdown watches for kexec boot */ xs_reset_watches(); -- cgit v1.2.3-59-g8ed1b From dc9eab6fd94dd26340749321bba2c58634761516 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Mon, 13 Feb 2017 17:03:22 +0000 Subject: xen/privcmd: return -ENOTTY for unimplemented IOCTLs The code sets the default return code to -ENOSYS but then overrides this to -EINVAL in the switch() statement's default case, which is clearly silly. This patch removes the override and sets the default return code to -ENOTTY, which is the conventional return for an unimplemented ioctl. 
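For reference, the ENOTTY convention this change relies on can be observed from user space. The small sketch below (unrelated to privcmd) issues a block-device ioctl against /dev/null, which does not implement any ioctl of its own, and reports the resulting error, which is expected to be ENOTTY ("Inappropriate ioctl for device").

/* Hedged illustration of the ioctl convention: an ioctl a device does not
 * implement is rejected with ENOTTY, not EINVAL or ENOSYS. */
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(void)
{
	unsigned long long size = 0;
	int fd = open("/dev/null", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/null");
		return 1;
	}

	/* /dev/null is a character device with no ioctl handler, so a
	 * block-device ioctl such as BLKGETSIZE64 is expected to fail
	 * with ENOTTY. */
	if (ioctl(fd, BLKGETSIZE64, &size) < 0)
		printf("BLKGETSIZE64 failed: %s (errno=%d)\n",
		       strerror(errno), errno);

	close(fd);
	return 0;
}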
Signed-off-by: Paul Durrant Signed-off-by: Boris Ostrovsky --- drivers/xen/privcmd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 6e3306f4a525..5e5c7aef0c9f 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -551,7 +551,7 @@ out_unlock: static long privcmd_ioctl(struct file *file, unsigned int cmd, unsigned long data) { - int ret = -ENOSYS; + int ret = -ENOTTY; void __user *udata = (void __user *) data; switch (cmd) { @@ -572,7 +572,6 @@ static long privcmd_ioctl(struct file *file, break; default: - ret = -EINVAL; break; } -- cgit v1.2.3-59-g8ed1b From ab520be8cd5d56867fc95cfbc34b90880faf1f9d Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Mon, 13 Feb 2017 17:03:23 +0000 Subject: xen/privcmd: Add IOCTL_PRIVCMD_DM_OP Recently a new dm_op[1] hypercall was added to Xen to provide a mechanism for restricting device emulators (such as QEMU) to a limited set of hypervisor operations, and being able to audit those operations in the kernel of the domain in which they run. This patch adds IOCTL_PRIVCMD_DM_OP as gateway for __HYPERVISOR_dm_op. NOTE: There is no requirement for user-space code to bounce data through locked memory buffers (as with IOCTL_PRIVCMD_HYPERCALL) since privcmd has enough information to lock the original buffers directly. [1] http://xenbits.xen.org/gitweb/?p=xen.git;a=commit;h=524a98c2 Signed-off-by: Paul Durrant Acked-by: Stefano Stabellini Signed-off-by: Boris Ostrovsky --- arch/arm/xen/enlighten.c | 1 + arch/arm/xen/hypercall.S | 1 + arch/arm64/xen/hypercall.S | 1 + arch/x86/include/asm/xen/hypercall.h | 7 ++ drivers/xen/privcmd.c | 139 +++++++++++++++++++++++++++++++++++ include/uapi/xen/privcmd.h | 13 ++++ include/xen/arm/hypercall.h | 1 + include/xen/interface/hvm/dm_op.h | 32 ++++++++ include/xen/interface/xen.h | 1 + 9 files changed, 196 insertions(+) create mode 100644 include/xen/interface/hvm/dm_op.h (limited to 'drivers') diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 11d9f2898b16..81e3217b12d3 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -457,4 +457,5 @@ EXPORT_SYMBOL_GPL(HYPERVISOR_tmem_op); EXPORT_SYMBOL_GPL(HYPERVISOR_platform_op); EXPORT_SYMBOL_GPL(HYPERVISOR_multicall); EXPORT_SYMBOL_GPL(HYPERVISOR_vm_assist); +EXPORT_SYMBOL_GPL(HYPERVISOR_dm_op); EXPORT_SYMBOL_GPL(privcmd_call); diff --git a/arch/arm/xen/hypercall.S b/arch/arm/xen/hypercall.S index a648dfc3be30..b0b80c0f09f3 100644 --- a/arch/arm/xen/hypercall.S +++ b/arch/arm/xen/hypercall.S @@ -92,6 +92,7 @@ HYPERCALL1(tmem_op); HYPERCALL1(platform_op_raw); HYPERCALL2(multicall); HYPERCALL2(vm_assist); +HYPERCALL3(dm_op); ENTRY(privcmd_call) stmdb sp!, {r4} diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 947830a459d2..401ceb71540c 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -84,6 +84,7 @@ HYPERCALL1(tmem_op); HYPERCALL1(platform_op_raw); HYPERCALL2(multicall); HYPERCALL2(vm_assist); +HYPERCALL3(dm_op); ENTRY(privcmd_call) mov x16, x0 diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index a12a047184ee..f6d20f6cca12 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -472,6 +472,13 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg) return _hypercall2(int, xenpmu_op, op, arg); } +static inline int +HYPERVISOR_dm_op( + domid_t dom, unsigned int nr_bufs, void *bufs) +{ + return _hypercall3(int, dm_op, dom, 
nr_bufs, bufs); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 5e5c7aef0c9f..1a6f1860e008 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +45,17 @@ MODULE_LICENSE("GPL"); #define PRIV_VMA_LOCKED ((void *)1) +static unsigned int privcmd_dm_op_max_num = 16; +module_param_named(dm_op_max_nr_bufs, privcmd_dm_op_max_num, uint, 0644); +MODULE_PARM_DESC(dm_op_max_nr_bufs, + "Maximum number of buffers per dm_op hypercall"); + +static unsigned int privcmd_dm_op_buf_max_size = 4096; +module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, + 0644); +MODULE_PARM_DESC(dm_op_buf_max_size, + "Maximum size of a dm_op hypercall buffer"); + static int privcmd_vma_range_is_mapped( struct vm_area_struct *vma, unsigned long addr, @@ -548,6 +561,128 @@ out_unlock: goto out; } +static int lock_pages( + struct privcmd_dm_op_buf kbufs[], unsigned int num, + struct page *pages[], unsigned int nr_pages) +{ + unsigned int i; + + for (i = 0; i < num; i++) { + unsigned int requested; + int pinned; + + requested = DIV_ROUND_UP( + offset_in_page(kbufs[i].uptr) + kbufs[i].size, + PAGE_SIZE); + if (requested > nr_pages) + return -ENOSPC; + + pinned = get_user_pages_fast( + (unsigned long) kbufs[i].uptr, + requested, FOLL_WRITE, pages); + if (pinned < 0) + return pinned; + + nr_pages -= pinned; + pages += pinned; + } + + return 0; +} + +static void unlock_pages(struct page *pages[], unsigned int nr_pages) +{ + unsigned int i; + + if (!pages) + return; + + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + put_page(pages[i]); + } +} + +static long privcmd_ioctl_dm_op(void __user *udata) +{ + struct privcmd_dm_op kdata; + struct privcmd_dm_op_buf *kbufs; + unsigned int nr_pages = 0; + struct page **pages = NULL; + struct xen_dm_op_buf *xbufs = NULL; + unsigned int i; + long rc; + + if (copy_from_user(&kdata, udata, sizeof(kdata))) + return -EFAULT; + + if (kdata.num == 0) + return 0; + + if (kdata.num > privcmd_dm_op_max_num) + return -E2BIG; + + kbufs = kcalloc(kdata.num, sizeof(*kbufs), GFP_KERNEL); + if (!kbufs) + return -ENOMEM; + + if (copy_from_user(kbufs, kdata.ubufs, + sizeof(*kbufs) * kdata.num)) { + rc = -EFAULT; + goto out; + } + + for (i = 0; i < kdata.num; i++) { + if (kbufs[i].size > privcmd_dm_op_buf_max_size) { + rc = -E2BIG; + goto out; + } + + if (!access_ok(VERIFY_WRITE, kbufs[i].uptr, + kbufs[i].size)) { + rc = -EFAULT; + goto out; + } + + nr_pages += DIV_ROUND_UP( + offset_in_page(kbufs[i].uptr) + kbufs[i].size, + PAGE_SIZE); + } + + pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + rc = -ENOMEM; + goto out; + } + + xbufs = kcalloc(kdata.num, sizeof(*xbufs), GFP_KERNEL); + if (!xbufs) { + rc = -ENOMEM; + goto out; + } + + rc = lock_pages(kbufs, kdata.num, pages, nr_pages); + if (rc) + goto out; + + for (i = 0; i < kdata.num; i++) { + set_xen_guest_handle(xbufs[i].h, kbufs[i].uptr); + xbufs[i].size = kbufs[i].size; + } + + xen_preemptible_hcall_begin(); + rc = HYPERVISOR_dm_op(kdata.dom, kdata.num, xbufs); + xen_preemptible_hcall_end(); + +out: + unlock_pages(pages, nr_pages); + kfree(xbufs); + kfree(pages); + kfree(kbufs); + + return rc; +} + static long privcmd_ioctl(struct file *file, unsigned int cmd, unsigned long data) { @@ -571,6 +706,10 @@ static long privcmd_ioctl(struct file 
*file, ret = privcmd_ioctl_mmap_batch(udata, 2); break; + case IOCTL_PRIVCMD_DM_OP: + ret = privcmd_ioctl_dm_op(udata); + break; + default: break; } diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h index 7ddeeda93809..f8c5d75b99e1 100644 --- a/include/uapi/xen/privcmd.h +++ b/include/uapi/xen/privcmd.h @@ -77,6 +77,17 @@ struct privcmd_mmapbatch_v2 { int __user *err; /* array of error codes */ }; +struct privcmd_dm_op_buf { + void __user *uptr; + size_t size; +}; + +struct privcmd_dm_op { + domid_t dom; + __u16 num; + const struct privcmd_dm_op_buf __user *ubufs; +}; + /* * @cmd: IOCTL_PRIVCMD_HYPERCALL * @arg: &privcmd_hypercall_t @@ -98,5 +109,7 @@ struct privcmd_mmapbatch_v2 { _IOC(_IOC_NONE, 'P', 3, sizeof(struct privcmd_mmapbatch)) #define IOCTL_PRIVCMD_MMAPBATCH_V2 \ _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2)) +#define IOCTL_PRIVCMD_DM_OP \ + _IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_dm_op)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ diff --git a/include/xen/arm/hypercall.h b/include/xen/arm/hypercall.h index 9d874db13c0e..73db4b2eeb89 100644 --- a/include/xen/arm/hypercall.h +++ b/include/xen/arm/hypercall.h @@ -53,6 +53,7 @@ int HYPERVISOR_physdev_op(int cmd, void *arg); int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args); int HYPERVISOR_tmem_op(void *arg); int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type); +int HYPERVISOR_dm_op(domid_t domid, unsigned int nr_bufs, void *bufs); int HYPERVISOR_platform_op_raw(void *arg); static inline int HYPERVISOR_platform_op(struct xen_platform_op *op) { diff --git a/include/xen/interface/hvm/dm_op.h b/include/xen/interface/hvm/dm_op.h new file mode 100644 index 000000000000..ee9e480bc559 --- /dev/null +++ b/include/xen/interface/hvm/dm_op.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2016, Citrix Systems Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef __XEN_PUBLIC_HVM_DM_OP_H__ +#define __XEN_PUBLIC_HVM_DM_OP_H__ + +struct xen_dm_op_buf { + GUEST_HANDLE(void) h; + xen_ulong_t size; +}; +DEFINE_GUEST_HANDLE_STRUCT(xen_dm_op_buf); + +#endif /* __XEN_PUBLIC_HVM_DM_OP_H__ */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 1b0d189cd3d3..4f4830ef8f93 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -81,6 +81,7 @@ #define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ #define __HYPERVISOR_xenpmu_op 40 +#define __HYPERVISOR_dm_op 41 /* Architecture-specific hypercall definitions. */ #define __HYPERVISOR_arch_0 48 -- cgit v1.2.3-59-g8ed1b From 4610d240d691768203fdd210a5da0a2e02eddb76 Mon Sep 17 00:00:00 2001 From: Paul Durrant Date: Mon, 13 Feb 2017 17:03:24 +0000 Subject: xen/privcmd: add IOCTL_PRIVCMD_RESTRICT The purpose if this ioctl is to allow a user of privcmd to restrict its operation such that it will no longer service arbitrary hypercalls via IOCTL_PRIVCMD_HYPERCALL, and will check for a matching domid when servicing IOCTL_PRIVCMD_DM_OP or IOCTL_PRIVCMD_MMAP*. The aim of this is to limit the attack surface for a compromised device model. Signed-off-by: Paul Durrant Signed-off-by: Boris Ostrovsky --- drivers/xen/privcmd.c | 88 +++++++++++++++++++++++++++++++++++++++++----- include/uapi/xen/privcmd.h | 2 ++ 2 files changed, 81 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 1a6f1860e008..2077a3ac7c0c 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -56,16 +56,25 @@ module_param_named(dm_op_buf_max_size, privcmd_dm_op_buf_max_size, uint, MODULE_PARM_DESC(dm_op_buf_max_size, "Maximum size of a dm_op hypercall buffer"); +struct privcmd_data { + domid_t domid; +}; + static int privcmd_vma_range_is_mapped( struct vm_area_struct *vma, unsigned long addr, unsigned long nr_pages); -static long privcmd_ioctl_hypercall(void __user *udata) +static long privcmd_ioctl_hypercall(struct file *file, void __user *udata) { + struct privcmd_data *data = file->private_data; struct privcmd_hypercall hypercall; long ret; + /* Disallow arbitrary hypercalls if restricted */ + if (data->domid != DOMID_INVALID) + return -EPERM; + if (copy_from_user(&hypercall, udata, sizeof(hypercall))) return -EFAULT; @@ -242,8 +251,9 @@ static int mmap_gfn_range(void *data, void *state) return 0; } -static long privcmd_ioctl_mmap(void __user *udata) +static long privcmd_ioctl_mmap(struct file *file, void __user *udata) { + struct privcmd_data *data = file->private_data; struct privcmd_mmap mmapcmd; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; @@ -258,6 +268,10 @@ static long privcmd_ioctl_mmap(void __user *udata) if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) return -EFAULT; + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != mmapcmd.dom) + return -EPERM; + rc = gather_array(&pagelist, mmapcmd.num, sizeof(struct privcmd_mmap_entry), mmapcmd.entry); @@ -429,8 +443,10 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) static const struct vm_operations_struct privcmd_vm_ops; -static long privcmd_ioctl_mmap_batch(void __user *udata, int version) +static long privcmd_ioctl_mmap_batch( + struct file *file, void __user *udata, int version) { + struct privcmd_data *data = file->private_data; int ret; struct privcmd_mmapbatch_v2 m; struct mm_struct *mm = current->mm; @@ -459,6 
+475,10 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version) return -EINVAL; } + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != m.dom) + return -EPERM; + nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE); if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) return -EINVAL; @@ -603,8 +623,9 @@ static void unlock_pages(struct page *pages[], unsigned int nr_pages) } } -static long privcmd_ioctl_dm_op(void __user *udata) +static long privcmd_ioctl_dm_op(struct file *file, void __user *udata) { + struct privcmd_data *data = file->private_data; struct privcmd_dm_op kdata; struct privcmd_dm_op_buf *kbufs; unsigned int nr_pages = 0; @@ -616,6 +637,10 @@ static long privcmd_ioctl_dm_op(void __user *udata) if (copy_from_user(&kdata, udata, sizeof(kdata))) return -EFAULT; + /* If restriction is in place, check the domid matches */ + if (data->domid != DOMID_INVALID && data->domid != kdata.dom) + return -EPERM; + if (kdata.num == 0) return 0; @@ -683,6 +708,23 @@ out: return rc; } +static long privcmd_ioctl_restrict(struct file *file, void __user *udata) +{ + struct privcmd_data *data = file->private_data; + domid_t dom; + + if (copy_from_user(&dom, udata, sizeof(dom))) + return -EFAULT; + + /* Set restriction to the specified domain, or check it matches */ + if (data->domid == DOMID_INVALID) + data->domid = dom; + else if (data->domid != dom) + return -EINVAL; + + return 0; +} + static long privcmd_ioctl(struct file *file, unsigned int cmd, unsigned long data) { @@ -691,23 +733,27 @@ static long privcmd_ioctl(struct file *file, switch (cmd) { case IOCTL_PRIVCMD_HYPERCALL: - ret = privcmd_ioctl_hypercall(udata); + ret = privcmd_ioctl_hypercall(file, udata); break; case IOCTL_PRIVCMD_MMAP: - ret = privcmd_ioctl_mmap(udata); + ret = privcmd_ioctl_mmap(file, udata); break; case IOCTL_PRIVCMD_MMAPBATCH: - ret = privcmd_ioctl_mmap_batch(udata, 1); + ret = privcmd_ioctl_mmap_batch(file, udata, 1); break; case IOCTL_PRIVCMD_MMAPBATCH_V2: - ret = privcmd_ioctl_mmap_batch(udata, 2); + ret = privcmd_ioctl_mmap_batch(file, udata, 2); break; case IOCTL_PRIVCMD_DM_OP: - ret = privcmd_ioctl_dm_op(udata); + ret = privcmd_ioctl_dm_op(file, udata); + break; + + case IOCTL_PRIVCMD_RESTRICT: + ret = privcmd_ioctl_restrict(file, udata); break; default: @@ -717,6 +763,28 @@ static long privcmd_ioctl(struct file *file, return ret; } +static int privcmd_open(struct inode *ino, struct file *file) +{ + struct privcmd_data *data = kzalloc(sizeof(*data), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + /* DOMID_INVALID implies no restriction */ + data->domid = DOMID_INVALID; + + file->private_data = data; + return 0; +} + +static int privcmd_release(struct inode *ino, struct file *file) +{ + struct privcmd_data *data = file->private_data; + + kfree(data); + return 0; +} + static void privcmd_close(struct vm_area_struct *vma) { struct page **pages = vma->vm_private_data; @@ -785,6 +853,8 @@ static int privcmd_vma_range_is_mapped( const struct file_operations xen_privcmd_fops = { .owner = THIS_MODULE, .unlocked_ioctl = privcmd_ioctl, + .open = privcmd_open, + .release = privcmd_release, .mmap = privcmd_mmap, }; EXPORT_SYMBOL_GPL(xen_privcmd_fops); diff --git a/include/uapi/xen/privcmd.h b/include/uapi/xen/privcmd.h index f8c5d75b99e1..63ee95c9dabb 100644 --- a/include/uapi/xen/privcmd.h +++ b/include/uapi/xen/privcmd.h @@ -111,5 +111,7 @@ struct privcmd_dm_op { _IOC(_IOC_NONE, 'P', 4, sizeof(struct privcmd_mmapbatch_v2)) #define 
IOCTL_PRIVCMD_DM_OP \ _IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_dm_op)) +#define IOCTL_PRIVCMD_RESTRICT \ + _IOC(_IOC_NONE, 'P', 6, sizeof(domid_t)) #endif /* __LINUX_PUBLIC_PRIVCMD_H__ */ -- cgit v1.2.3-59-g8ed1b
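To illustrate how a device model might use the two new privcmd ioctls together, here is a hedged user-space sketch. It assumes a privileged Xen domain with /dev/xen/privcmd available; the structure layouts and ioctl numbers are re-declared locally to mirror include/uapi/xen/privcmd.h as extended by the patches above, and the dm_op issued carries no buffers, which the driver treats as a successful no-op, so only the domid restriction logic is exercised.

/* Hedged sketch: restrict a privcmd fd to one domid, then show that dm_op
 * calls for other domains are rejected with EPERM. The definitions below
 * mirror include/uapi/xen/privcmd.h as modified by the patches above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

typedef uint16_t domid_t;

struct privcmd_dm_op_buf {
	void *uptr;
	size_t size;
};

struct privcmd_dm_op {
	domid_t dom;
	uint16_t num;
	const struct privcmd_dm_op_buf *ubufs;
};

#define IOCTL_PRIVCMD_DM_OP \
	_IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_dm_op))
#define IOCTL_PRIVCMD_RESTRICT \
	_IOC(_IOC_NONE, 'P', 6, sizeof(domid_t))

int main(int argc, char *argv[])
{
	domid_t domid = (argc > 1) ? (domid_t)atoi(argv[1]) : 1;
	struct privcmd_dm_op op = { .dom = domid, .num = 0, .ubufs = NULL };
	int fd = open("/dev/xen/privcmd", O_RDWR);

	if (fd < 0) {
		perror("open /dev/xen/privcmd");
		return 1;
	}

	/* From now on this fd may only act on 'domid'. */
	if (ioctl(fd, IOCTL_PRIVCMD_RESTRICT, &domid) < 0) {
		perror("IOCTL_PRIVCMD_RESTRICT");
		return 1;
	}

	/* A dm_op with no buffers is a no-op in the driver, but it still
	 * passes through the restriction check, so it only succeeds for
	 * the restricted domid. */
	if (ioctl(fd, IOCTL_PRIVCMD_DM_OP, &op) == 0)
		printf("dm_op for domid %u allowed\n", domid);

	op.dom = domid + 1;
	if (ioctl(fd, IOCTL_PRIVCMD_DM_OP, &op) < 0)
		printf("dm_op for domid %u rejected: %s\n",
		       op.dom, strerror(errno));	/* expect EPERM */

	close(fd);
	return 0;
}

Once IOCTL_PRIVCMD_RESTRICT has been issued, the same file descriptor also refuses IOCTL_PRIVCMD_HYPERCALL with EPERM, so a compromised device model cannot fall back to issuing arbitrary hypercalls.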