aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/entry
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--arch/x86/entry/calling.h15
-rw-r--r--arch/x86/entry/entry_32.S8
-rw-r--r--arch/x86/entry/entry_64.S8
-rw-r--r--arch/x86/entry/entry_64_compat.S20
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl1
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl1
-rw-r--r--arch/x86/entry/vdso/Makefile1
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c151
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S3
-rw-r--r--arch/x86/entry/vdso/vdso2c.c3
-rw-r--r--arch/x86/entry/vdso/vma.c14
11 files changed, 121 insertions, 104 deletions
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3c71dd947c7b..e32206e09868 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -1,3 +1,5 @@
+#include <linux/jump_label.h>
+
/*
x86 function call convention, 64-bit:
@@ -232,3 +234,16 @@ For 32-bit we have the following conventions - kernel is built with
#endif /* CONFIG_X86_64 */
+/*
+ * This does 'call enter_from_user_mode' unless we can avoid it based on
+ * kernel config or using the static jump infrastructure.
+ */
+.macro CALL_enter_from_user_mode
+#ifdef CONFIG_CONTEXT_TRACKING
+#ifdef HAVE_JUMP_LABEL
+ STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
+#endif
+ call enter_from_user_mode
+.Lafter_call_\@:
+#endif
+.endm
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index f3b6d54e0042..77d8c5112900 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -329,7 +329,8 @@ sysenter_past_esp:
* Return back to the vDSO, which will pop ecx and edx.
* Don't bother with DS and ES (they already contain __USER_DS).
*/
- ENABLE_INTERRUPTS_SYSEXIT
+ sti
+ sysexit
.pushsection .fixup, "ax"
2: movl $0, PT_FS(%esp)
@@ -552,11 +553,6 @@ ENTRY(native_iret)
iret
_ASM_EXTABLE(native_iret, iret_exc)
END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
- sti
- sysexit
-END(native_irq_enable_sysexit)
#endif
ENTRY(overflow)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a55697d19824..9d34d3cfceb6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -520,9 +520,7 @@ END(irq_entries_start)
*/
TRACE_IRQS_OFF
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
+ CALL_enter_from_user_mode
1:
/*
@@ -1066,9 +1064,7 @@ ENTRY(error_entry)
* (which can take locks).
*/
TRACE_IRQS_OFF
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
+ CALL_enter_from_user_mode
ret
.Lerror_entry_done:
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 6a1ae3751e82..ff1c6d61f332 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -18,13 +18,6 @@
.section .entry.text, "ax"
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret32)
- swapgs
- sysretl
-ENDPROC(native_usergs_sysret32)
-#endif
-
/*
* 32-bit SYSENTER instruction entry.
*
@@ -103,15 +96,15 @@ ENTRY(entry_SYSENTER_compat)
* This needs to happen before enabling interrupts so that
* we don't get preempted with NT set.
*
- * NB.: sysenter_fix_flags is a label with the code under it moved
+ * NB.: .Lsysenter_fix_flags is a label with the code under it moved
* out-of-line as an optimization: NT is unlikely to be set in the
* majority of the cases and instead of polluting the I$ unnecessarily,
* we're keeping that code behind a branch which will predict as
* not-taken and therefore its instructions won't be fetched.
*/
testl $X86_EFLAGS_NT, EFLAGS(%rsp)
- jnz sysenter_fix_flags
-sysenter_flags_fixed:
+ jnz .Lsysenter_fix_flags
+.Lsysenter_flags_fixed:
/*
* User mode is traced as though IRQs are on, and SYSENTER
@@ -126,10 +119,10 @@ sysenter_flags_fixed:
"jmp .Lsyscall_32_done", X86_FEATURE_XENPV
jmp sysret32_from_system_call
-sysenter_fix_flags:
+.Lsysenter_fix_flags:
pushq $X86_EFLAGS_FIXED
popfq
- jmp sysenter_flags_fixed
+ jmp .Lsysenter_flags_fixed
ENDPROC(entry_SYSENTER_compat)
/*
@@ -238,7 +231,8 @@ sysret32_from_system_call:
xorq %r9, %r9
xorq %r10, %r10
movq RSP-ORIG_RAX(%rsp), %rsp
- USERGS_SYSRET32
+ swapgs
+ sysretl
END(entry_SYSCALL_compat)
/*
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index f17705e1332c..cb713df81180 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -383,3 +383,4 @@
374 i386 userfaultfd sys_userfaultfd
375 i386 membarrier sys_membarrier
376 i386 mlock2 sys_mlock2
+377 i386 copy_file_range sys_copy_file_range
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 314a90bfc09c..dc1040a50bdc 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -332,6 +332,7 @@
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
325 common mlock2 sys_mlock2
+326 common copy_file_range sys_copy_file_range
#
# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 265c0ed68118..c854541d93ff 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -4,6 +4,7 @@
KBUILD_CFLAGS += $(DISABLE_LTO)
KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
VDSO64-$(CONFIG_X86_64) := y
VDSOX32-$(CONFIG_X86_X32_ABI) := y
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index ca94fa649251..1a50e09c945b 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -17,8 +17,10 @@
#include <asm/vvar.h>
#include <asm/unistd.h>
#include <asm/msr.h>
+#include <asm/pvclock.h>
#include <linux/math64.h>
#include <linux/time.h>
+#include <linux/kernel.h>
#define gtod (&VVAR(vsyscall_gtod_data))
@@ -36,12 +38,12 @@ static notrace cycle_t vread_hpet(void)
}
#endif
-#ifndef BUILD_VDSO32
+#ifdef CONFIG_PARAVIRT_CLOCK
+extern u8 pvclock_page
+ __attribute__((visibility("hidden")));
+#endif
-#include <linux/kernel.h>
-#include <asm/vsyscall.h>
-#include <asm/fixmap.h>
-#include <asm/pvclock.h>
+#ifndef BUILD_VDSO32
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
@@ -60,75 +62,6 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
return ret;
}
-#ifdef CONFIG_PARAVIRT_CLOCK
-
-static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
-{
- const struct pvclock_vsyscall_time_info *pvti_base;
- int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
- int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
-
- BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
-
- pvti_base = (struct pvclock_vsyscall_time_info *)
- __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
-
- return &pvti_base[offset];
-}
-
-static notrace cycle_t vread_pvclock(int *mode)
-{
- const struct pvclock_vsyscall_time_info *pvti;
- cycle_t ret;
- u64 last;
- u32 version;
- u8 flags;
- unsigned cpu, cpu1;
-
-
- /*
- * Note: hypervisor must guarantee that:
- * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
- * 2. that per-CPU pvclock time info is updated if the
- * underlying CPU changes.
- * 3. that version is increased whenever underlying CPU
- * changes.
- *
- */
- do {
- cpu = __getcpu() & VGETCPU_CPU_MASK;
- /* TODO: We can put vcpu id into higher bits of pvti.version.
- * This will save a couple of cycles by getting rid of
- * __getcpu() calls (Gleb).
- */
-
- pvti = get_pvti(cpu);
-
- version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
-
- /*
- * Test we're still on the cpu as well as the version.
- * We could have been migrated just after the first
- * vgetcpu but before fetching the version, so we
- * wouldn't notice a version change.
- */
- cpu1 = __getcpu() & VGETCPU_CPU_MASK;
- } while (unlikely(cpu != cpu1 ||
- (pvti->pvti.version & 1) ||
- pvti->pvti.version != version));
-
- if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
- *mode = VCLOCK_NONE;
-
- /* refer to tsc.c read_tsc() comment for rationale */
- last = gtod->cycle_last;
-
- if (likely(ret >= last))
- return ret;
-
- return last;
-}
-#endif
#else
@@ -162,15 +95,77 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
return ret;
}
+#endif
+
#ifdef CONFIG_PARAVIRT_CLOCK
+static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void)
+{
+ return (const struct pvclock_vsyscall_time_info *)&pvclock_page;
+}
static notrace cycle_t vread_pvclock(int *mode)
{
- *mode = VCLOCK_NONE;
- return 0;
-}
-#endif
+ const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti;
+ cycle_t ret;
+ u64 tsc, pvti_tsc;
+ u64 last, delta, pvti_system_time;
+ u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift;
+ /*
+ * Note: The kernel and hypervisor must guarantee that cpu ID
+ * number maps 1:1 to per-CPU pvclock time info.
+ *
+ * Because the hypervisor is entirely unaware of guest userspace
+ * preemption, it cannot guarantee that per-CPU pvclock time
+ * info is updated if the underlying CPU changes or that that
+ * version is increased whenever underlying CPU changes.
+ *
+ * On KVM, we are guaranteed that pvti updates for any vCPU are
+ * atomic as seen by *all* vCPUs. This is an even stronger
+ * guarantee than we get with a normal seqlock.
+ *
+ * On Xen, we don't appear to have that guarantee, but Xen still
+ * supplies a valid seqlock using the version field.
+ *
+ * We only do pvclock vdso timing at all if
+ * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+ * mean that all vCPUs have matching pvti and that the TSC is
+ * synced, so we can just look at vCPU 0's pvti.
+ */
+
+ do {
+ version = pvti->version;
+
+ smp_rmb();
+
+ if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+ *mode = VCLOCK_NONE;
+ return 0;
+ }
+
+ tsc = rdtsc_ordered();
+ pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
+ pvti_tsc_shift = pvti->tsc_shift;
+ pvti_system_time = pvti->system_time;
+ pvti_tsc = pvti->tsc_timestamp;
+
+ /* Make sure that the version double-check is last. */
+ smp_rmb();
+ } while (unlikely((version & 1) || version != pvti->version));
+
+ delta = tsc - pvti_tsc;
+ ret = pvti_system_time +
+ pvclock_scale_delta(delta, pvti_tsc_to_system_mul,
+ pvti_tsc_shift);
+
+ /* refer to vread_tsc() comment for rationale */
+ last = gtod->cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ return last;
+}
#endif
notrace static cycle_t vread_tsc(void)
diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index de2c921025f5..4158acc17df0 100644
--- a/arch/x86/entry/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
@@ -25,7 +25,7 @@ SECTIONS
* segment.
*/
- vvar_start = . - 2 * PAGE_SIZE;
+ vvar_start = . - 3 * PAGE_SIZE;
vvar_page = vvar_start;
/* Place all vvars at the offsets in asm/vvar.h. */
@@ -36,6 +36,7 @@ SECTIONS
#undef EMIT_VVAR
hpet_page = vvar_start + PAGE_SIZE;
+ pvclock_page = vvar_start + 2 * PAGE_SIZE;
. = SIZEOF_HEADERS;
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 785d9922b106..491020b2826d 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -73,6 +73,7 @@ enum {
sym_vvar_start,
sym_vvar_page,
sym_hpet_page,
+ sym_pvclock_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
};
@@ -80,6 +81,7 @@ enum {
const int special_pages[] = {
sym_vvar_page,
sym_hpet_page,
+ sym_pvclock_page,
};
struct vdso_sym {
@@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
[sym_hpet_page] = {"hpet_page", true},
+ [sym_pvclock_page] = {"pvclock_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
},
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 64df47148160..b8f69e264ac4 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -12,6 +12,7 @@
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
+#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
@@ -100,6 +101,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
.name = "[vvar]",
.pages = no_pages,
};
+ struct pvclock_vsyscall_time_info *pvti;
if (calculate_addr) {
addr = vdso_addr(current->mm->start_stack,
@@ -169,6 +171,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)
}
#endif
+ pvti = pvclock_pvti_cpu0_va();
+ if (pvti && image->sym_pvclock_page) {
+ ret = remap_pfn_range(vma,
+ text_start + image->sym_pvclock_page,
+ __pa(pvti) >> PAGE_SHIFT,
+ PAGE_SIZE,
+ PAGE_READONLY);
+
+ if (ret)
+ goto up_fail;
+ }
+
up_fail:
if (ret)
current->mm->context.vdso = NULL;