aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2015-05-08 13:33:33 +0200
committerIngo Molnar <mingo@kernel.org>2015-05-08 13:33:33 +0200
commit7ae383be81781c5e1347f71c3eb0d53ce5188200 (patch)
treed2dfedb78cf4ee2bc9cc460af3be106b08e01050 /arch/x86
parentx86: Force inlining of atomic ops (diff)
parentMerge tag 'pm+acpi-4.1-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm (diff)
downloadwireguard-linux-7ae383be81781c5e1347f71c3eb0d53ce5188200.tar.xz
wireguard-linux-7ae383be81781c5e1347f71c3eb0d53ce5188200.zip
Merge branch 'linus' into x86/asm, before applying dependent patch
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig33
-rw-r--r--arch/x86/Kconfig.debug4
-rw-r--r--arch/x86/Makefile2
-rw-r--r--arch/x86/Makefile.um2
-rw-r--r--arch/x86/boot/compressed/eboot.c2
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c187
-rw-r--r--arch/x86/crypto/camellia_aesni_avx2_glue.c15
-rw-r--r--arch/x86/crypto/camellia_aesni_avx_glue.c15
-rw-r--r--arch/x86/crypto/cast5_avx_glue.c9
-rw-r--r--arch/x86/crypto/cast6_avx_glue.c15
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c7
-rw-r--r--arch/x86/crypto/glue_helper.c1
-rw-r--r--arch/x86/crypto/serpent_avx2_glue.c15
-rw-r--r--arch/x86/crypto/serpent_avx_glue.c15
-rw-r--r--arch/x86/crypto/serpent_sse2_glue.c15
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb.c9
-rw-r--r--arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c2
-rw-r--r--arch/x86/crypto/sha1_ssse3_glue.c139
-rw-r--r--arch/x86/crypto/sha256-avx-asm.S10
-rw-r--r--arch/x86/crypto/sha256-avx2-asm.S10
-rw-r--r--arch/x86/crypto/sha256-ssse3-asm.S10
-rw-r--r--arch/x86/crypto/sha256_ssse3_glue.c193
-rw-r--r--arch/x86/crypto/sha512-avx-asm.S6
-rw-r--r--arch/x86/crypto/sha512-avx2-asm.S8
-rw-r--r--arch/x86/crypto/sha512-ssse3-asm.S6
-rw-r--r--arch/x86/crypto/sha512_ssse3_glue.c202
-rw-r--r--arch/x86/crypto/twofish_avx_glue.c15
-rw-r--r--arch/x86/ia32/ia32entry.S7
-rw-r--r--arch/x86/include/asm/cpu.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h11
-rw-r--r--arch/x86/include/asm/e820.h8
-rw-r--r--arch/x86/include/asm/elf.h3
-rw-r--r--arch/x86/include/asm/hypervisor.h2
-rw-r--r--arch/x86/include/asm/intel-mid.h3
-rw-r--r--arch/x86/include/asm/lguest.h7
-rw-r--r--arch/x86/include/asm/livepatch.h4
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h8
-rw-r--r--arch/x86/include/asm/paravirt_types.h8
-rw-r--r--arch/x86/include/asm/pgalloc.h8
-rw-r--r--arch/x86/include/asm/pgtable-2level_types.h1
-rw-r--r--arch/x86/include/asm/pgtable-3level_types.h2
-rw-r--r--arch/x86/include/asm/pgtable.h8
-rw-r--r--arch/x86/include/asm/pgtable_64_types.h1
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/pm-trace.h (renamed from arch/x86/include/asm/resume-trace.h)10
-rw-r--r--arch/x86/include/asm/processor.h3
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/seccomp.h21
-rw-r--r--arch/x86/include/asm/seccomp_32.h11
-rw-r--r--arch/x86/include/asm/seccomp_64.h17
-rw-r--r--arch/x86/include/asm/serial.h8
-rw-r--r--arch/x86/include/asm/smp.h2
-rw-r--r--arch/x86/include/asm/spinlock.h2
-rw-r--r--arch/x86/include/asm/thread_info.h3
-rw-r--r--arch/x86/include/asm/xen/page.h5
-rw-r--r--arch/x86/include/uapi/asm/e820.h10
-rw-r--r--arch/x86/include/uapi/asm/hyperv.h2
-rw-r--r--arch/x86/include/uapi/asm/msr-index.h26
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c2
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c4
-rw-r--r--arch/x86/kernel/cpu/Makefile3
-rw-r--r--arch/x86/kernel/cpu/amd.c3
-rw-r--r--arch/x86/kernel/cpu/common.c39
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c4
-rw-r--r--arch/x86/kernel/cpu/intel_pt.h131
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event.c205
-rw-r--r--arch/x86/kernel/cpu/perf_event.h181
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c9
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd_ibs.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c974
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_bts.c525
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_cqm.c1379
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c39
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c321
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_pt.c1100
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_rapl.c94
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c12
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c3
-rw-r--r--arch/x86/kernel/cpu/scattered.c1
-rw-r--r--arch/x86/kernel/e820.c26
-rw-r--r--arch/x86/kernel/early_printk.c6
-rw-r--r--arch/x86/kernel/entry_64.S9
-rw-r--r--arch/x86/kernel/i387.c8
-rw-r--r--arch/x86/kernel/kprobes/core.c9
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/paravirt.c6
-rw-r--r--arch/x86/kernel/pmem.c53
-rw-r--r--arch/x86/kernel/process.c14
-rw-r--r--arch/x86/kernel/process_64.c28
-rw-r--r--arch/x86/kernel/pvclock.c44
-rw-r--r--arch/x86/kernel/signal.c38
-rw-r--r--arch/x86/kernel/smpboot.c39
-rw-r--r--arch/x86/kernel/test_rodata.c2
-rw-r--r--arch/x86/kernel/traps.c2
-rw-r--r--arch/x86/kvm/assigned-dev.c2
-rw-r--r--arch/x86/kvm/lapic.c11
-rw-r--r--arch/x86/kvm/mmu.c20
-rw-r--r--arch/x86/kvm/vmx.c12
-rw-r--r--arch/x86/kvm/x86.c43
-rw-r--r--arch/x86/lguest/boot.c7
-rw-r--r--arch/x86/lguest/head_32.S30
-rw-r--r--arch/x86/lib/usercopy_64.c2
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/ioremap.c37
-rw-r--r--arch/x86/mm/memtest.c118
-rw-r--r--arch/x86/mm/mmap.c38
-rw-r--r--arch/x86/mm/pgtable.c79
-rw-r--r--arch/x86/pci/acpi.c24
-rw-r--r--arch/x86/pci/common.c2
-rw-r--r--arch/x86/platform/intel-mid/Makefile1
-rw-r--r--arch/x86/platform/intel-mid/early_printk_intel_mid.c112
-rw-r--r--arch/x86/platform/olpc/olpc-xo1-sci.c4
-rw-r--r--arch/x86/platform/olpc/olpc-xo15-sci.c4
-rw-r--r--arch/x86/syscalls/Makefile9
-rw-r--r--arch/x86/um/Makefile1
-rw-r--r--arch/x86/um/asm/barrier.h11
-rw-r--r--arch/x86/um/asm/elf.h2
-rw-r--r--arch/x86/um/ldt.c227
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_32.h3
-rw-r--r--arch/x86/um/shared/sysdep/faultinfo_64.h3
-rw-r--r--arch/x86/um/shared/sysdep/skas_ptrace.h22
-rw-r--r--arch/x86/um/signal.c7
-rw-r--r--arch/x86/vdso/vclock_gettime.c34
-rw-r--r--arch/x86/xen/apic.c180
-rw-r--r--arch/x86/xen/enlighten.c117
-rw-r--r--arch/x86/xen/mmu.c221
-rw-r--r--arch/x86/xen/smp.c46
-rw-r--r--arch/x86/xen/suspend.c10
-rw-r--r--arch/x86/xen/trace.c50
-rw-r--r--arch/x86/xen/xen-head.S63
133 files changed, 6037 insertions, 2014 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faff6934c05a..226d5696e1d1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,7 @@ config X86_64
### Arch settings
config X86
def_bool y
+ select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_FAST_MULTIPLIER
@@ -87,7 +88,7 @@ config X86
select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
select HAVE_USER_RETURN_NOTIFIER
- select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+ select ARCH_HAS_ELF_RANDOMIZE
select HAVE_ARCH_JUMP_LABEL
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select SPARSE_IRQ
@@ -99,6 +100,7 @@ config X86
select IRQ_FORCED_THREADING
select HAVE_BPF_JIT if X86_64
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
select ARCH_HAS_SG_CHAIN
select CLKEVT_I8253
select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -177,7 +179,7 @@ config SBUS
config NEED_DMA_MAP_STATE
def_bool y
- depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG
+ depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
config NEED_SG_DMA_LENGTH
def_bool y
@@ -277,6 +279,12 @@ config ARCH_SUPPORTS_UPROBES
config FIX_EARLYCON_MEM
def_bool y
+config PGTABLE_LEVELS
+ int
+ default 4 if X86_64
+ default 3 if X86_PAE
+ default 2
+
source "init/Kconfig"
source "kernel/Kconfig.freezer"
@@ -714,17 +722,6 @@ endif #HYPERVISOR_GUEST
config NO_BOOTMEM
def_bool y
-config MEMTEST
- bool "Memtest"
- ---help---
- This option adds a kernel parameter 'memtest', which allows memtest
- to be set.
- memtest=0, mean disabled; -- default
- memtest=1, mean do 1 test pattern;
- ...
- memtest=4, mean do 4 test patterns.
- If you are unsure how to answer this question, answer N.
-
source "arch/x86/Kconfig.cpu"
config HPET_TIMER
@@ -1425,6 +1422,16 @@ config ILLEGAL_POINTER_VALUE
source "mm/Kconfig"
+config X86_PMEM_LEGACY
+ bool "Support non-standard NVDIMMs and ADR protected memory"
+ help
+ Treat memory marked using the non-standard e820 type of 12 as used
+ by the Intel Sandy Bridge-EP reference BIOS as protected memory.
+ The kernel will offer these regions to the 'pmem' driver so
+ they can be used for persistent storage.
+
+ Say Y if unsure.
+
config HIGHPTE
bool "Allocate 3rd-level pagetables from highmem"
depends on HIGHMEM
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 20028da8ae18..72484a645f05 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,10 +43,6 @@ config EARLY_PRINTK
with klogd/syslogd or the X server. You should normally N here,
unless you want to debug such a crash.
-config EARLY_PRINTK_INTEL_MID
- bool "Early printk for Intel MID platform support"
- depends on EARLY_PRINTK && X86_INTEL_MID
-
config EARLY_PRINTK_DBGP
bool "Early printk via EHCI debug port"
depends on EARLY_PRINTK && PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 40af1bac2b7d..c7c31876bb40 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -63,7 +63,7 @@ ifeq ($(CONFIG_X86_32),y)
$(call cc-option,-fno-unit-at-a-time))
# CPU-specific tuning. Anything which can be shared with UML should go here.
- include $(srctree)/arch/x86/Makefile_32.cpu
+ include arch/x86/Makefile_32.cpu
KBUILD_CFLAGS += $(cflags-y)
# temporary until string.h is fixed
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 95eba554baf9..5b7e898ffd9a 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -18,7 +18,7 @@ LDS_EXTRA := -Ui386
export LDS_EXTRA
# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
-include $(srctree)/arch/x86/Makefile_32.cpu
+include arch/x86/Makefile_32.cpu
# prevent gcc from keeping the stack 16 byte aligned. Taken from i386.
cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index ef17683484e9..48304b89b601 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -1109,6 +1109,8 @@ struct boot_params *make_boot_params(struct efi_config *c)
if (!cmdline_ptr)
goto fail;
hdr->cmd_line_ptr = (unsigned long)cmdline_ptr;
+ /* Fill in upper bits of command line address, NOP on 32 bit */
+ boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32;
hdr->ramdisk_image = 0;
hdr->ramdisk_size = 0;
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 54f60ab41c63..112cefacf2af 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -797,7 +797,9 @@ static int rfc4106_init(struct crypto_tfm *tfm)
PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
struct crypto_aead *cryptd_child;
struct aesni_rfc4106_gcm_ctx *child_ctx;
- cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+ cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
if (IS_ERR(cryptd_tfm))
return PTR_ERR(cryptd_tfm);
@@ -890,15 +892,12 @@ out_free_ablkcipher:
return ret;
}
-static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
- unsigned int key_len)
+static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
+ unsigned int key_len)
{
int ret = 0;
- struct crypto_tfm *tfm = crypto_aead_tfm(parent);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
- struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
- struct aesni_rfc4106_gcm_ctx *child_ctx =
- aesni_rfc4106_gcm_ctx_get(cryptd_child);
+ struct crypto_tfm *tfm = crypto_aead_tfm(aead);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
u8 *new_key_align, *new_key_mem = NULL;
if (key_len < 4) {
@@ -943,20 +942,31 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
goto exit;
}
ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
- memcpy(child_ctx, ctx, sizeof(*ctx));
exit:
kfree(new_key_mem);
return ret;
}
-/* This is the Integrity Check Value (aka the authentication tag length and can
- * be 8, 12 or 16 bytes long. */
-static int rfc4106_set_authsize(struct crypto_aead *parent,
- unsigned int authsize)
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+ unsigned int key_len)
{
struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
- struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+ struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
+ struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child);
+ struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm;
+ int ret;
+ ret = crypto_aead_setkey(child, key, key_len);
+ if (!ret) {
+ memcpy(ctx, c_ctx, sizeof(*ctx));
+ ctx->cryptd_tfm = cryptd_tfm;
+ }
+ return ret;
+}
+
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+ unsigned int authsize)
+{
switch (authsize) {
case 8:
case 12:
@@ -965,51 +975,23 @@ static int rfc4106_set_authsize(struct crypto_aead *parent,
default:
return -EINVAL;
}
- crypto_aead_crt(parent)->authsize = authsize;
- crypto_aead_crt(cryptd_child)->authsize = authsize;
+ crypto_aead_crt(aead)->authsize = authsize;
return 0;
}
-static int rfc4106_encrypt(struct aead_request *req)
-{
- int ret;
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-
- if (!irq_fpu_usable()) {
- struct aead_request *cryptd_req =
- (struct aead_request *) aead_request_ctx(req);
- memcpy(cryptd_req, req, sizeof(*req));
- aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
- return crypto_aead_encrypt(cryptd_req);
- } else {
- struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
- kernel_fpu_begin();
- ret = cryptd_child->base.crt_aead.encrypt(req);
- kernel_fpu_end();
- return ret;
- }
-}
-
-static int rfc4106_decrypt(struct aead_request *req)
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+ unsigned int authsize)
{
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+ struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
int ret;
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
- if (!irq_fpu_usable()) {
- struct aead_request *cryptd_req =
- (struct aead_request *) aead_request_ctx(req);
- memcpy(cryptd_req, req, sizeof(*req));
- aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
- return crypto_aead_decrypt(cryptd_req);
- } else {
- struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
- kernel_fpu_begin();
- ret = cryptd_child->base.crt_aead.decrypt(req);
- kernel_fpu_end();
- return ret;
- }
+ ret = crypto_aead_setauthsize(child, authsize);
+ if (!ret)
+ crypto_aead_crt(parent)->authsize = authsize;
+ return ret;
}
static int __driver_rfc4106_encrypt(struct aead_request *req)
@@ -1185,6 +1167,78 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
}
return retval;
}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+ int ret;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+
+ if (!irq_fpu_usable()) {
+ struct aead_request *cryptd_req =
+ (struct aead_request *) aead_request_ctx(req);
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+ ret = crypto_aead_encrypt(cryptd_req);
+ } else {
+ kernel_fpu_begin();
+ ret = __driver_rfc4106_encrypt(req);
+ kernel_fpu_end();
+ }
+ return ret;
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+ int ret;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+
+ if (!irq_fpu_usable()) {
+ struct aead_request *cryptd_req =
+ (struct aead_request *) aead_request_ctx(req);
+
+ memcpy(cryptd_req, req, sizeof(*req));
+ aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+ ret = crypto_aead_decrypt(cryptd_req);
+ } else {
+ kernel_fpu_begin();
+ ret = __driver_rfc4106_decrypt(req);
+ kernel_fpu_end();
+ }
+ return ret;
+}
+
+static int helper_rfc4106_encrypt(struct aead_request *req)
+{
+ int ret;
+
+ if (unlikely(!irq_fpu_usable())) {
+ WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
+ ret = -EINVAL;
+ } else {
+ kernel_fpu_begin();
+ ret = __driver_rfc4106_encrypt(req);
+ kernel_fpu_end();
+ }
+ return ret;
+}
+
+static int helper_rfc4106_decrypt(struct aead_request *req)
+{
+ int ret;
+
+ if (unlikely(!irq_fpu_usable())) {
+ WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
+ ret = -EINVAL;
+ } else {
+ kernel_fpu_begin();
+ ret = __driver_rfc4106_decrypt(req);
+ kernel_fpu_end();
+ }
+ return ret;
+}
#endif
static struct crypto_alg aesni_algs[] = { {
@@ -1210,7 +1264,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_name = "__aes-aesni",
.cra_driver_name = "__driver-aes-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx) +
AESNI_ALIGN - 1,
@@ -1229,7 +1283,8 @@ static struct crypto_alg aesni_algs[] = { {
.cra_name = "__ecb-aes-aesni",
.cra_driver_name = "__driver-ecb-aes-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx) +
AESNI_ALIGN - 1,
@@ -1249,7 +1304,8 @@ static struct crypto_alg aesni_algs[] = { {
.cra_name = "__cbc-aes-aesni",
.cra_driver_name = "__driver-cbc-aes-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct crypto_aes_ctx) +
AESNI_ALIGN - 1,
@@ -1313,7 +1369,8 @@ static struct crypto_alg aesni_algs[] = { {
.cra_name = "__ctr-aes-aesni",
.cra_driver_name = "__driver-ctr-aes-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct crypto_aes_ctx) +
AESNI_ALIGN - 1,
@@ -1357,7 +1414,7 @@ static struct crypto_alg aesni_algs[] = { {
.cra_name = "__gcm-aes-aesni",
.cra_driver_name = "__driver-gcm-aes-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_AEAD,
+ .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) +
AESNI_ALIGN,
@@ -1366,8 +1423,12 @@ static struct crypto_alg aesni_algs[] = { {
.cra_module = THIS_MODULE,
.cra_u = {
.aead = {
- .encrypt = __driver_rfc4106_encrypt,
- .decrypt = __driver_rfc4106_decrypt,
+ .setkey = common_rfc4106_set_key,
+ .setauthsize = common_rfc4106_set_authsize,
+ .encrypt = helper_rfc4106_encrypt,
+ .decrypt = helper_rfc4106_decrypt,
+ .ivsize = 8,
+ .maxauthsize = 16,
},
},
}, {
@@ -1423,7 +1484,8 @@ static struct crypto_alg aesni_algs[] = { {
.cra_name = "__lrw-aes-aesni",
.cra_driver_name = "__driver-lrw-aes-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct aesni_lrw_ctx),
.cra_alignmask = 0,
@@ -1444,7 +1506,8 @@ static struct crypto_alg aesni_algs[] = { {
.cra_name = "__xts-aes-aesni",
.cra_driver_name = "__driver-xts-aes-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = AES_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct aesni_xts_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 9a07fafe3831..baf0ac21ace5 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -343,7 +343,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__ecb-camellia-aesni-avx2",
.cra_driver_name = "__driver-ecb-camellia-aesni-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_ctx),
.cra_alignmask = 0,
@@ -362,7 +363,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__cbc-camellia-aesni-avx2",
.cra_driver_name = "__driver-cbc-camellia-aesni-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_ctx),
.cra_alignmask = 0,
@@ -381,7 +383,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__ctr-camellia-aesni-avx2",
.cra_driver_name = "__driver-ctr-camellia-aesni-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct camellia_ctx),
.cra_alignmask = 0,
@@ -401,7 +404,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__lrw-camellia-aesni-avx2",
.cra_driver_name = "__driver-lrw-camellia-aesni-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_lrw_ctx),
.cra_alignmask = 0,
@@ -424,7 +428,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__xts-camellia-aesni-avx2",
.cra_driver_name = "__driver-xts-camellia-aesni-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_xts_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index ed38d959add6..78818a1e73e3 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -335,7 +335,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__ecb-camellia-aesni",
.cra_driver_name = "__driver-ecb-camellia-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_ctx),
.cra_alignmask = 0,
@@ -354,7 +355,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__cbc-camellia-aesni",
.cra_driver_name = "__driver-cbc-camellia-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_ctx),
.cra_alignmask = 0,
@@ -373,7 +375,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__ctr-camellia-aesni",
.cra_driver_name = "__driver-ctr-camellia-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct camellia_ctx),
.cra_alignmask = 0,
@@ -393,7 +396,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__lrw-camellia-aesni",
.cra_driver_name = "__driver-lrw-camellia-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_lrw_ctx),
.cra_alignmask = 0,
@@ -416,7 +420,8 @@ static struct crypto_alg cmll_algs[10] = { {
.cra_name = "__xts-camellia-aesni",
.cra_driver_name = "__driver-xts-camellia-aesni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAMELLIA_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct camellia_xts_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 60ada677a928..236c80974457 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -341,7 +341,8 @@ static struct crypto_alg cast5_algs[6] = { {
.cra_name = "__ecb-cast5-avx",
.cra_driver_name = "__driver-ecb-cast5-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAST5_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct cast5_ctx),
.cra_alignmask = 0,
@@ -360,7 +361,8 @@ static struct crypto_alg cast5_algs[6] = { {
.cra_name = "__cbc-cast5-avx",
.cra_driver_name = "__driver-cbc-cast5-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAST5_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct cast5_ctx),
.cra_alignmask = 0,
@@ -379,7 +381,8 @@ static struct crypto_alg cast5_algs[6] = { {
.cra_name = "__ctr-cast5-avx",
.cra_driver_name = "__driver-ctr-cast5-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct cast5_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 0160f68a57ff..f448810ca4ac 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -372,7 +372,8 @@ static struct crypto_alg cast6_algs[10] = { {
.cra_name = "__ecb-cast6-avx",
.cra_driver_name = "__driver-ecb-cast6-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAST6_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct cast6_ctx),
.cra_alignmask = 0,
@@ -391,7 +392,8 @@ static struct crypto_alg cast6_algs[10] = { {
.cra_name = "__cbc-cast6-avx",
.cra_driver_name = "__driver-cbc-cast6-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAST6_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct cast6_ctx),
.cra_alignmask = 0,
@@ -410,7 +412,8 @@ static struct crypto_alg cast6_algs[10] = { {
.cra_name = "__ctr-cast6-avx",
.cra_driver_name = "__driver-ctr-cast6-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct cast6_ctx),
.cra_alignmask = 0,
@@ -430,7 +433,8 @@ static struct crypto_alg cast6_algs[10] = { {
.cra_name = "__lrw-cast6-avx",
.cra_driver_name = "__driver-lrw-cast6-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAST6_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct cast6_lrw_ctx),
.cra_alignmask = 0,
@@ -453,7 +457,8 @@ static struct crypto_alg cast6_algs[10] = { {
.cra_name = "__xts-cast6-avx",
.cra_driver_name = "__driver-xts-cast6-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = CAST6_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct cast6_xts_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 8253d85aa165..2079baf06bdd 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = {
.cra_name = "__ghash",
.cra_driver_name = "__ghash-pclmulqdqni",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_SHASH,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = GHASH_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct ghash_ctx),
.cra_module = THIS_MODULE,
@@ -261,7 +262,9 @@ static int ghash_async_init_tfm(struct crypto_tfm *tfm)
struct cryptd_ahash *cryptd_tfm;
struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
- cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
+ cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
if (IS_ERR(cryptd_tfm))
return PTR_ERR(cryptd_tfm);
ctx->cryptd_tfm = cryptd_tfm;
diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 432f1d76ceb8..6a85598931b5 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -232,7 +232,6 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
le128_to_be128((be128 *)walk->iv, &ctrblk);
}
-EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
struct blkcipher_desc *desc,
diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 437e47a4d302..2f63dc89e7a9 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -309,7 +309,8 @@ static struct crypto_alg srp_algs[10] = { {
.cra_name = "__ecb-serpent-avx2",
.cra_driver_name = "__driver-ecb-serpent-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -329,7 +330,8 @@ static struct crypto_alg srp_algs[10] = { {
.cra_name = "__cbc-serpent-avx2",
.cra_driver_name = "__driver-cbc-serpent-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -349,7 +351,8 @@ static struct crypto_alg srp_algs[10] = { {
.cra_name = "__ctr-serpent-avx2",
.cra_driver_name = "__driver-ctr-serpent-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -370,7 +373,8 @@ static struct crypto_alg srp_algs[10] = { {
.cra_name = "__lrw-serpent-avx2",
.cra_driver_name = "__driver-lrw-serpent-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_lrw_ctx),
.cra_alignmask = 0,
@@ -394,7 +398,8 @@ static struct crypto_alg srp_algs[10] = { {
.cra_name = "__xts-serpent-avx2",
.cra_driver_name = "__driver-xts-serpent-avx2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_xts_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7e217398b4eb..c8d478af8456 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -378,7 +378,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__ecb-serpent-avx",
.cra_driver_name = "__driver-ecb-serpent-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -397,7 +398,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__cbc-serpent-avx",
.cra_driver_name = "__driver-cbc-serpent-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -416,7 +418,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__ctr-serpent-avx",
.cra_driver_name = "__driver-ctr-serpent-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -436,7 +439,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__lrw-serpent-avx",
.cra_driver_name = "__driver-lrw-serpent-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_lrw_ctx),
.cra_alignmask = 0,
@@ -459,7 +463,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__xts-serpent-avx",
.cra_driver_name = "__driver-xts-serpent-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_xts_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index bf025adaea01..3643dd508f45 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -387,7 +387,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__ecb-serpent-sse2",
.cra_driver_name = "__driver-ecb-serpent-sse2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -406,7 +407,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__cbc-serpent-sse2",
.cra_driver_name = "__driver-cbc-serpent-sse2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -425,7 +427,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__ctr-serpent-sse2",
.cra_driver_name = "__driver-ctr-serpent-sse2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct serpent_ctx),
.cra_alignmask = 0,
@@ -445,7 +448,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__lrw-serpent-sse2",
.cra_driver_name = "__driver-lrw-serpent-sse2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_lrw_ctx),
.cra_alignmask = 0,
@@ -468,7 +472,8 @@ static struct crypto_alg serpent_algs[10] = { {
.cra_name = "__xts-serpent-sse2",
.cra_driver_name = "__driver-xts-serpent-sse2",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SERPENT_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct serpent_xts_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index fd9f6b035b16..e510b1c5d690 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -694,7 +694,8 @@ static struct shash_alg sha1_mb_shash_alg = {
* use ASYNC flag as some buffers in multi-buffer
* algo may not have completed before hashing thread sleep
*/
- .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC,
+ .cra_flags = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = SHA1_BLOCK_SIZE,
.cra_module = THIS_MODULE,
.cra_list = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list),
@@ -770,7 +771,9 @@ static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
struct mcryptd_hash_ctx *mctx;
- mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0);
+ mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
+ CRYPTO_ALG_INTERNAL,
+ CRYPTO_ALG_INTERNAL);
if (IS_ERR(mcryptd_tfm))
return PTR_ERR(mcryptd_tfm);
mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
@@ -828,7 +831,7 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
while (!list_empty(&cstate->work_list)) {
rctx = list_entry(cstate->work_list.next,
struct mcryptd_hash_request_ctx, waiter);
- if time_before(cur_time, rctx->tag.expire)
+ if (time_before(cur_time, rctx->tag.expire))
break;
kernel_fpu_begin();
sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr);
diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
index 4ca7e166a2aa..822acb5b464c 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -56,7 +56,7 @@
void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
{
unsigned int j;
- state->unused_lanes = 0xF76543210;
+ state->unused_lanes = 0xF76543210ULL;
for (j = 0; j < 8; j++) {
state->lens[j] = 0xFFFFFFFF;
state->ldata[j].job_in_lane = NULL;
diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 6c20fe04a738..33d1b9dc14cc 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -28,7 +28,7 @@
#include <linux/cryptohash.h>
#include <linux/types.h>
#include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha1_base.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
@@ -44,132 +44,51 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
#define SHA1_AVX2_BLOCK_OPTSIZE 4 /* optimal 4*64 bytes of SHA1 blocks */
asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
- unsigned int rounds);
+ unsigned int rounds);
#endif
-static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
-
-
-static int sha1_ssse3_init(struct shash_desc *desc)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- *sctx = (struct sha1_state){
- .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
- };
-
- return 0;
-}
-
-static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, unsigned int partial)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
- unsigned int done = 0;
-
- sctx->count += len;
-
- if (partial) {
- done = SHA1_BLOCK_SIZE - partial;
- memcpy(sctx->buffer + partial, data, done);
- sha1_transform_asm(sctx->state, sctx->buffer, 1);
- }
-
- if (len - done >= SHA1_BLOCK_SIZE) {
- const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
-
- sha1_transform_asm(sctx->state, data + done, rounds);
- done += rounds * SHA1_BLOCK_SIZE;
- }
-
- memcpy(sctx->buffer, data + done, len - done);
-
- return 0;
-}
+static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha1_state *sctx = shash_desc_ctx(desc);
- unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
- int res;
- /* Handle the fast case right here */
- if (partial + len < SHA1_BLOCK_SIZE) {
- sctx->count += len;
- memcpy(sctx->buffer + partial, data, len);
+ if (!irq_fpu_usable() ||
+ (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
+ return crypto_sha1_update(desc, data, len);
- return 0;
- }
+ /* make sure casting to sha1_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
- if (!irq_fpu_usable()) {
- res = crypto_sha1_update(desc, data, len);
- } else {
- kernel_fpu_begin();
- res = __sha1_ssse3_update(desc, data, len, partial);
- kernel_fpu_end();
- }
-
- return res;
-}
-
-
-/* Add padding and return the message digest. */
-static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
-{
- struct sha1_state *sctx = shash_desc_ctx(desc);
- unsigned int i, index, padlen;
- __be32 *dst = (__be32 *)out;
- __be64 bits;
- static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
-
- bits = cpu_to_be64(sctx->count << 3);
-
- /* Pad out to 56 mod 64 and append length */
- index = sctx->count % SHA1_BLOCK_SIZE;
- padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
- if (!irq_fpu_usable()) {
- crypto_sha1_update(desc, padding, padlen);
- crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
- } else {
- kernel_fpu_begin();
- /* We need to fill a whole block for __sha1_ssse3_update() */
- if (padlen <= 56) {
- sctx->count += padlen;
- memcpy(sctx->buffer + index, padding, padlen);
- } else {
- __sha1_ssse3_update(desc, padding, padlen, index);
- }
- __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
- kernel_fpu_end();
- }
-
- /* Store state in digest */
- for (i = 0; i < 5; i++)
- dst[i] = cpu_to_be32(sctx->state[i]);
-
- /* Wipe context */
- memset(sctx, 0, sizeof(*sctx));
+ kernel_fpu_begin();
+ sha1_base_do_update(desc, data, len,
+ (sha1_block_fn *)sha1_transform_asm);
+ kernel_fpu_end();
return 0;
}
-static int sha1_ssse3_export(struct shash_desc *desc, void *out)
+static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
- struct sha1_state *sctx = shash_desc_ctx(desc);
+ if (!irq_fpu_usable())
+ return crypto_sha1_finup(desc, data, len, out);
- memcpy(out, sctx, sizeof(*sctx));
+ kernel_fpu_begin();
+ if (len)
+ sha1_base_do_update(desc, data, len,
+ (sha1_block_fn *)sha1_transform_asm);
+ sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm);
+ kernel_fpu_end();
- return 0;
+ return sha1_base_finish(desc, out);
}
-static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
{
- struct sha1_state *sctx = shash_desc_ctx(desc);
-
- memcpy(sctx, in, sizeof(*sctx));
-
- return 0;
+ return sha1_ssse3_finup(desc, NULL, 0, out);
}
#ifdef CONFIG_AS_AVX2
@@ -186,13 +105,11 @@ static void sha1_apply_transform_avx2(u32 *digest, const char *data,
static struct shash_alg alg = {
.digestsize = SHA1_DIGEST_SIZE,
- .init = sha1_ssse3_init,
+ .init = sha1_base_init,
.update = sha1_ssse3_update,
.final = sha1_ssse3_final,
- .export = sha1_ssse3_export,
- .import = sha1_ssse3_import,
+ .finup = sha1_ssse3_finup,
.descsize = sizeof(struct sha1_state),
- .statesize = sizeof(struct sha1_state),
.base = {
.cra_name = "sha1",
.cra_driver_name= "sha1-ssse3",
diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 642f15687a0a..92b3b5d75ba9 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -96,10 +96,10 @@ SHUF_DC00 = %xmm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm13
NUM_BLKS = %rdx # 3rd arg
-CTX = %rsi # 2nd arg
-INP = %rdi # 1st arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
-SRND = %rdi # clobbers INP
+SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
@@ -342,8 +342,8 @@ a = TMP_
########################################################################
## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9e86944c539d..570ec5ec62d7 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -91,12 +91,12 @@ BYTE_FLIP_MASK = %ymm13
X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
NUM_BLKS = %rdx # 3rd arg
-CTX = %rsi # 2nd arg
-INP = %rdi # 1st arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
c = %ecx
d = %r8d
e = %edx # clobbers NUM_BLKS
-y3 = %edi # clobbers INP
+y3 = %esi # clobbers INP
TBL = %rbp
@@ -523,8 +523,8 @@ STACK_SIZE = _RSP + _RSP_SIZE
########################################################################
## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index f833b74d902b..2cedc44e8121 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -88,10 +88,10 @@ SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12
NUM_BLKS = %rdx # 3rd arg
-CTX = %rsi # 2nd arg
-INP = %rdi # 1st arg
+INP = %rsi # 2nd arg
+CTX = %rdi # 1st arg
-SRND = %rdi # clobbers INP
+SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
@@ -348,8 +348,8 @@ a = TMP_
########################################################################
## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 8fad72f4dfd2..ccc338881ee8 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -36,195 +36,74 @@
#include <linux/cryptohash.h>
#include <linux/types.h>
#include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha256_base.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <linux/string.h>
-asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest,
- u64 rounds);
+asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
+ u64 rounds);
#ifdef CONFIG_AS_AVX
-asmlinkage void sha256_transform_avx(const char *data, u32 *digest,
+asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
u64 rounds);
#endif
#ifdef CONFIG_AS_AVX2
-asmlinkage void sha256_transform_rorx(const char *data, u32 *digest,
- u64 rounds);
+asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
+ u64 rounds);
#endif
-static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64);
-
-
-static int sha256_ssse3_init(struct shash_desc *desc)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
-
- sctx->state[0] = SHA256_H0;
- sctx->state[1] = SHA256_H1;
- sctx->state[2] = SHA256_H2;
- sctx->state[3] = SHA256_H3;
- sctx->state[4] = SHA256_H4;
- sctx->state[5] = SHA256_H5;
- sctx->state[6] = SHA256_H6;
- sctx->state[7] = SHA256_H7;
- sctx->count = 0;
-
- return 0;
-}
-
-static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, unsigned int partial)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
- unsigned int done = 0;
-
- sctx->count += len;
-
- if (partial) {
- done = SHA256_BLOCK_SIZE - partial;
- memcpy(sctx->buf + partial, data, done);
- sha256_transform_asm(sctx->buf, sctx->state, 1);
- }
-
- if (len - done >= SHA256_BLOCK_SIZE) {
- const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
-
- sha256_transform_asm(data + done, sctx->state, (u64) rounds);
-
- done += rounds * SHA256_BLOCK_SIZE;
- }
-
- memcpy(sctx->buf, data + done, len - done);
-
- return 0;
-}
+static void (*sha256_transform_asm)(u32 *, const char *, u64);
static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
struct sha256_state *sctx = shash_desc_ctx(desc);
- unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
- int res;
- /* Handle the fast case right here */
- if (partial + len < SHA256_BLOCK_SIZE) {
- sctx->count += len;
- memcpy(sctx->buf + partial, data, len);
+ if (!irq_fpu_usable() ||
+ (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
+ return crypto_sha256_update(desc, data, len);
- return 0;
- }
-
- if (!irq_fpu_usable()) {
- res = crypto_sha256_update(desc, data, len);
- } else {
- kernel_fpu_begin();
- res = __sha256_ssse3_update(desc, data, len, partial);
- kernel_fpu_end();
- }
-
- return res;
-}
+ /* make sure casting to sha256_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
-
-/* Add padding and return the message digest. */
-static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
- unsigned int i, index, padlen;
- __be32 *dst = (__be32 *)out;
- __be64 bits;
- static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
-
- bits = cpu_to_be64(sctx->count << 3);
-
- /* Pad out to 56 mod 64 and append length */
- index = sctx->count % SHA256_BLOCK_SIZE;
- padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
-
- if (!irq_fpu_usable()) {
- crypto_sha256_update(desc, padding, padlen);
- crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
- } else {
- kernel_fpu_begin();
- /* We need to fill a whole block for __sha256_ssse3_update() */
- if (padlen <= 56) {
- sctx->count += padlen;
- memcpy(sctx->buf + index, padding, padlen);
- } else {
- __sha256_ssse3_update(desc, padding, padlen, index);
- }
- __sha256_ssse3_update(desc, (const u8 *)&bits,
- sizeof(bits), 56);
- kernel_fpu_end();
- }
-
- /* Store state in digest */
- for (i = 0; i < 8; i++)
- dst[i] = cpu_to_be32(sctx->state[i]);
-
- /* Wipe context */
- memset(sctx, 0, sizeof(*sctx));
+ kernel_fpu_begin();
+ sha256_base_do_update(desc, data, len,
+ (sha256_block_fn *)sha256_transform_asm);
+ kernel_fpu_end();
return 0;
}
-static int sha256_ssse3_export(struct shash_desc *desc, void *out)
+static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
- struct sha256_state *sctx = shash_desc_ctx(desc);
+ if (!irq_fpu_usable())
+ return crypto_sha256_finup(desc, data, len, out);
- memcpy(out, sctx, sizeof(*sctx));
+ kernel_fpu_begin();
+ if (len)
+ sha256_base_do_update(desc, data, len,
+ (sha256_block_fn *)sha256_transform_asm);
+ sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm);
+ kernel_fpu_end();
- return 0;
+ return sha256_base_finish(desc, out);
}
-static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
-
- memcpy(sctx, in, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha224_ssse3_init(struct shash_desc *desc)
-{
- struct sha256_state *sctx = shash_desc_ctx(desc);
-
- sctx->state[0] = SHA224_H0;
- sctx->state[1] = SHA224_H1;
- sctx->state[2] = SHA224_H2;
- sctx->state[3] = SHA224_H3;
- sctx->state[4] = SHA224_H4;
- sctx->state[5] = SHA224_H5;
- sctx->state[6] = SHA224_H6;
- sctx->state[7] = SHA224_H7;
- sctx->count = 0;
-
- return 0;
-}
-
-static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
+/* Add padding and return the message digest. */
+static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
{
- u8 D[SHA256_DIGEST_SIZE];
-
- sha256_ssse3_final(desc, D);
-
- memcpy(hash, D, SHA224_DIGEST_SIZE);
- memzero_explicit(D, SHA256_DIGEST_SIZE);
-
- return 0;
+ return sha256_ssse3_finup(desc, NULL, 0, out);
}
static struct shash_alg algs[] = { {
.digestsize = SHA256_DIGEST_SIZE,
- .init = sha256_ssse3_init,
+ .init = sha256_base_init,
.update = sha256_ssse3_update,
.final = sha256_ssse3_final,
- .export = sha256_ssse3_export,
- .import = sha256_ssse3_import,
+ .finup = sha256_ssse3_finup,
.descsize = sizeof(struct sha256_state),
- .statesize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha256",
.cra_driver_name = "sha256-ssse3",
@@ -235,13 +114,11 @@ static struct shash_alg algs[] = { {
}
}, {
.digestsize = SHA224_DIGEST_SIZE,
- .init = sha224_ssse3_init,
+ .init = sha224_base_init,
.update = sha256_ssse3_update,
- .final = sha224_ssse3_final,
- .export = sha256_ssse3_export,
- .import = sha256_ssse3_import,
+ .final = sha256_ssse3_final,
+ .finup = sha256_ssse3_finup,
.descsize = sizeof(struct sha256_state),
- .statesize = sizeof(struct sha256_state),
.base = {
.cra_name = "sha224",
.cra_driver_name = "sha224-ssse3",
diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 974dde9bc6cd..565274d6a641 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -54,9 +54,9 @@
# Virtual Registers
# ARG1
-msg = %rdi
+digest = %rdi
# ARG2
-digest = %rsi
+msg = %rsi
# ARG3
msglen = %rdx
T1 = %rcx
@@ -271,7 +271,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
.endm
########################################################################
-# void sha512_transform_avx(const void* M, void* D, u64 L)
+# void sha512_transform_avx(void* D, const void* M, u64 L)
# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
# The size of the message pointed to by M must be an integer multiple of SHA512
# message blocks.
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 568b96105f5c..1f20b35d8573 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -70,16 +70,16 @@ XFER = YTMP0
BYTE_FLIP_MASK = %ymm9
# 1st arg
-INP = %rdi
+CTX = %rdi
# 2nd arg
-CTX = %rsi
+INP = %rsi
# 3rd arg
NUM_BLKS = %rdx
c = %rcx
d = %r8
e = %rdx
-y3 = %rdi
+y3 = %rsi
TBL = %rbp
@@ -562,7 +562,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
.endm
########################################################################
-# void sha512_transform_rorx(const void* M, void* D, uint64_t L)#
+# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
# The size of the message pointed to by M must be an integer multiple of SHA512
# message blocks.
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index fb56855d51f5..e610e29cbc81 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -53,9 +53,9 @@
# Virtual Registers
# ARG1
-msg = %rdi
+digest = %rdi
# ARG2
-digest = %rsi
+msg = %rsi
# ARG3
msglen = %rdx
T1 = %rcx
@@ -269,7 +269,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
.endm
########################################################################
-# void sha512_transform_ssse3(const void* M, void* D, u64 L)#
+# void sha512_transform_ssse3(void* D, const void* M, u64 L)#
# Purpose: Updates the SHA512 digest stored at D with the message stored in M.
# The size of the message pointed to by M must be an integer multiple of SHA512
# message blocks.
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0b6af26832bf..d9fa4c1e063f 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -34,205 +34,75 @@
#include <linux/cryptohash.h>
#include <linux/types.h>
#include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha512_base.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/xsave.h>
#include <linux/string.h>
-asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest,
- u64 rounds);
+asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
+ u64 rounds);
#ifdef CONFIG_AS_AVX
-asmlinkage void sha512_transform_avx(const char *data, u64 *digest,
+asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
u64 rounds);
#endif
#ifdef CONFIG_AS_AVX2
-asmlinkage void sha512_transform_rorx(const char *data, u64 *digest,
- u64 rounds);
+asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
+ u64 rounds);
#endif
-static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64);
-
-
-static int sha512_ssse3_init(struct shash_desc *desc)
-{
- struct sha512_state *sctx = shash_desc_ctx(desc);
-
- sctx->state[0] = SHA512_H0;
- sctx->state[1] = SHA512_H1;
- sctx->state[2] = SHA512_H2;
- sctx->state[3] = SHA512_H3;
- sctx->state[4] = SHA512_H4;
- sctx->state[5] = SHA512_H5;
- sctx->state[6] = SHA512_H6;
- sctx->state[7] = SHA512_H7;
- sctx->count[0] = sctx->count[1] = 0;
-
- return 0;
-}
+static void (*sha512_transform_asm)(u64 *, const char *, u64);
-static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len, unsigned int partial)
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+ unsigned int len)
{
struct sha512_state *sctx = shash_desc_ctx(desc);
- unsigned int done = 0;
-
- sctx->count[0] += len;
- if (sctx->count[0] < len)
- sctx->count[1]++;
- if (partial) {
- done = SHA512_BLOCK_SIZE - partial;
- memcpy(sctx->buf + partial, data, done);
- sha512_transform_asm(sctx->buf, sctx->state, 1);
- }
-
- if (len - done >= SHA512_BLOCK_SIZE) {
- const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+ if (!irq_fpu_usable() ||
+ (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
+ return crypto_sha512_update(desc, data, len);
- sha512_transform_asm(data + done, sctx->state, (u64) rounds);
-
- done += rounds * SHA512_BLOCK_SIZE;
- }
+ /* make sure casting to sha512_block_fn() is safe */
+ BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
- memcpy(sctx->buf, data + done, len - done);
+ kernel_fpu_begin();
+ sha512_base_do_update(desc, data, len,
+ (sha512_block_fn *)sha512_transform_asm);
+ kernel_fpu_end();
return 0;
}
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
- unsigned int len)
+static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
{
- struct sha512_state *sctx = shash_desc_ctx(desc);
- unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
- int res;
-
- /* Handle the fast case right here */
- if (partial + len < SHA512_BLOCK_SIZE) {
- sctx->count[0] += len;
- if (sctx->count[0] < len)
- sctx->count[1]++;
- memcpy(sctx->buf + partial, data, len);
-
- return 0;
- }
+ if (!irq_fpu_usable())
+ return crypto_sha512_finup(desc, data, len, out);
- if (!irq_fpu_usable()) {
- res = crypto_sha512_update(desc, data, len);
- } else {
- kernel_fpu_begin();
- res = __sha512_ssse3_update(desc, data, len, partial);
- kernel_fpu_end();
- }
+ kernel_fpu_begin();
+ if (len)
+ sha512_base_do_update(desc, data, len,
+ (sha512_block_fn *)sha512_transform_asm);
+ sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm);
+ kernel_fpu_end();
- return res;
+ return sha512_base_finish(desc, out);
}
-
/* Add padding and return the message digest. */
static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
{
- struct sha512_state *sctx = shash_desc_ctx(desc);
- unsigned int i, index, padlen;
- __be64 *dst = (__be64 *)out;
- __be64 bits[2];
- static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
-
- /* save number of bits */
- bits[1] = cpu_to_be64(sctx->count[0] << 3);
- bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
-
- /* Pad out to 112 mod 128 and append length */
- index = sctx->count[0] & 0x7f;
- padlen = (index < 112) ? (112 - index) : ((128+112) - index);
-
- if (!irq_fpu_usable()) {
- crypto_sha512_update(desc, padding, padlen);
- crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
- } else {
- kernel_fpu_begin();
- /* We need to fill a whole block for __sha512_ssse3_update() */
- if (padlen <= 112) {
- sctx->count[0] += padlen;
- if (sctx->count[0] < padlen)
- sctx->count[1]++;
- memcpy(sctx->buf + index, padding, padlen);
- } else {
- __sha512_ssse3_update(desc, padding, padlen, index);
- }
- __sha512_ssse3_update(desc, (const u8 *)&bits,
- sizeof(bits), 112);
- kernel_fpu_end();
- }
-
- /* Store state in digest */
- for (i = 0; i < 8; i++)
- dst[i] = cpu_to_be64(sctx->state[i]);
-
- /* Wipe context */
- memset(sctx, 0, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha512_ssse3_export(struct shash_desc *desc, void *out)
-{
- struct sha512_state *sctx = shash_desc_ctx(desc);
-
- memcpy(out, sctx, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
-{
- struct sha512_state *sctx = shash_desc_ctx(desc);
-
- memcpy(sctx, in, sizeof(*sctx));
-
- return 0;
-}
-
-static int sha384_ssse3_init(struct shash_desc *desc)
-{
- struct sha512_state *sctx = shash_desc_ctx(desc);
-
- sctx->state[0] = SHA384_H0;
- sctx->state[1] = SHA384_H1;
- sctx->state[2] = SHA384_H2;
- sctx->state[3] = SHA384_H3;
- sctx->state[4] = SHA384_H4;
- sctx->state[5] = SHA384_H5;
- sctx->state[6] = SHA384_H6;
- sctx->state[7] = SHA384_H7;
-
- sctx->count[0] = sctx->count[1] = 0;
-
- return 0;
-}
-
-static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
-{
- u8 D[SHA512_DIGEST_SIZE];
-
- sha512_ssse3_final(desc, D);
-
- memcpy(hash, D, SHA384_DIGEST_SIZE);
- memzero_explicit(D, SHA512_DIGEST_SIZE);
-
- return 0;
+ return sha512_ssse3_finup(desc, NULL, 0, out);
}
static struct shash_alg algs[] = { {
.digestsize = SHA512_DIGEST_SIZE,
- .init = sha512_ssse3_init,
+ .init = sha512_base_init,
.update = sha512_ssse3_update,
.final = sha512_ssse3_final,
- .export = sha512_ssse3_export,
- .import = sha512_ssse3_import,
+ .finup = sha512_ssse3_finup,
.descsize = sizeof(struct sha512_state),
- .statesize = sizeof(struct sha512_state),
.base = {
.cra_name = "sha512",
.cra_driver_name = "sha512-ssse3",
@@ -243,13 +113,11 @@ static struct shash_alg algs[] = { {
}
}, {
.digestsize = SHA384_DIGEST_SIZE,
- .init = sha384_ssse3_init,
+ .init = sha384_base_init,
.update = sha512_ssse3_update,
- .final = sha384_ssse3_final,
- .export = sha512_ssse3_export,
- .import = sha512_ssse3_import,
+ .final = sha512_ssse3_final,
+ .finup = sha512_ssse3_finup,
.descsize = sizeof(struct sha512_state),
- .statesize = sizeof(struct sha512_state),
.base = {
.cra_name = "sha384",
.cra_driver_name = "sha384-ssse3",
diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 1ac531ea9bcc..b5e2d5651851 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -340,7 +340,8 @@ static struct crypto_alg twofish_algs[10] = { {
.cra_name = "__ecb-twofish-avx",
.cra_driver_name = "__driver-ecb-twofish-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
@@ -359,7 +360,8 @@ static struct crypto_alg twofish_algs[10] = { {
.cra_name = "__cbc-twofish-avx",
.cra_driver_name = "__driver-cbc-twofish-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
@@ -378,7 +380,8 @@ static struct crypto_alg twofish_algs[10] = { {
.cra_name = "__ctr-twofish-avx",
.cra_driver_name = "__driver-ctr-twofish-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = 1,
.cra_ctxsize = sizeof(struct twofish_ctx),
.cra_alignmask = 0,
@@ -398,7 +401,8 @@ static struct crypto_alg twofish_algs[10] = { {
.cra_name = "__lrw-twofish-avx",
.cra_driver_name = "__driver-lrw-twofish-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_lrw_ctx),
.cra_alignmask = 0,
@@ -421,7 +425,8 @@ static struct crypto_alg twofish_algs[10] = { {
.cra_name = "__xts-twofish-avx",
.cra_driver_name = "__driver-xts-twofish-avx",
.cra_priority = 0,
- .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
+ .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER |
+ CRYPTO_ALG_INTERNAL,
.cra_blocksize = TF_BLOCK_SIZE,
.cra_ctxsize = sizeof(struct twofish_xts_ctx),
.cra_alignmask = 0,
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 56fd6dd2e342..2ab0f7182df3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -418,6 +418,13 @@ sysretl_from_sys_call:
* cs and ss are loaded from MSRs.
* (Note: 32bit->32bit SYSRET is different: since r11
* does not exist, it merely sets eflags.IF=1).
+ *
+ * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+ * descriptor is not reinitialized. This means that we must
+ * avoid SYSRET with SS == NULL, which could happen if we schedule,
+ * exit the kernel, and re-enter using an interrupt vector. (All
+ * interrupt entries on x86_64 set SS to NULL.) We prevent that
+ * from happening by reloading SS in __switch_to.
*/
USERGS_SYSRET32
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
#endif
#endif
-DECLARE_PER_CPU(int, cpu_state);
-
int mwait_usable(const struct cpuinfo_x86 *);
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 854c04b3c9c2..3d6606fb97d0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
#include <asm/disabled-features.h>
#endif
-#define NCAPINTS 11 /* N 32-bit words worth of info */
+#define NCAPINTS 13 /* N 32-bit words worth of info */
#define NBUGINTS 1 /* N 32-bit bug flags */
/*
@@ -195,6 +195,7 @@
#define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
#define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
+#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
@@ -226,6 +227,7 @@
#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
@@ -244,6 +246,12 @@
#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */
#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+
/*
* BUG word(s)
*/
@@ -257,6 +265,7 @@
#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 779c2efe2e97..3ab0537872fb 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
}
#endif
-#ifdef CONFIG_MEMTEST
-extern void early_memtest(unsigned long start, unsigned long end);
-#else
-static inline void early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
extern unsigned long e820_end_of_ram_pfn(void);
extern unsigned long e820_end_of_low_ram_pfn(void);
extern u64 early_reserve_e820(u64 sizet, u64 align);
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 935588d95c82..f161c189c27b 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
/*
* True on X86_32 or when emulating IA32 on X86_64
*/
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index e42f758a0fbd..055ea9941dd5 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -50,7 +50,7 @@ extern const struct hypervisor_x86 *x86_hyper;
/* Recognized hypervisors */
extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
-extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+extern const struct hypervisor_x86 x86_hyper_xen;
extern const struct hypervisor_x86 x86_hyper_kvm;
extern void init_hypervisor(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h
index 705d35708a50..7c5af123bdbd 100644
--- a/arch/x86/include/asm/intel-mid.h
+++ b/arch/x86/include/asm/intel-mid.h
@@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options;
#define SFI_MTMR_MAX_NUM 8
#define SFI_MRTC_MAX 8
-extern struct console early_hsu_console;
-extern void hsu_early_console_init(const char *);
-
extern void intel_scu_devices_create(void);
extern void intel_scu_devices_destroy(void);
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index e2d4a4afa8c3..3bbc07a57a31 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -20,13 +20,10 @@ extern unsigned long switcher_addr;
/* Found in switcher.S */
extern unsigned long default_idt_entries[];
-/* Declarations for definitions in lguest_guest.S */
-extern char lguest_noirq_start[], lguest_noirq_end[];
+/* Declarations for definitions in arch/x86/lguest/head_32.S */
+extern char lguest_noirq_iret[];
extern const char lgstart_cli[], lgend_cli[];
-extern const char lgstart_sti[], lgend_sti[];
-extern const char lgstart_popf[], lgend_popf[];
extern const char lgstart_pushf[], lgend_pushf[];
-extern const char lgstart_iret[], lgend_iret[];
extern void lguest_iret(void);
extern void lguest_init(void);
diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index a455a53d789a..2d29197bd2fb 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -32,8 +32,8 @@ static inline int klp_check_compiler_support(void)
#endif
return 0;
}
-extern int klp_write_module_reloc(struct module *mod, unsigned long type,
- unsigned long loc, unsigned long value);
+int klp_write_module_reloc(struct module *mod, unsigned long type,
+ unsigned long loc, unsigned long value);
static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
{
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..c7c712f2648b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,8 +40,10 @@
#ifdef CONFIG_X86_64
#include <asm/page_64_types.h>
+#define IOREMAP_MAX_ORDER (PUD_SHIFT)
#else
#include <asm/page_32_types.h>
+#define IOREMAP_MAX_ORDER (PMD_SHIFT)
#endif /* CONFIG_X86_64 */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5f6051d5d139..8957810ad7d1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
}
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
static inline pmd_t __pmd(pmdval_t val)
{
pmdval_t ret;
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
val);
}
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp)
set_pud(pudp, __pud(0));
}
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
#ifdef CONFIG_X86_PAE
/* Special-case pte-setting operations for PAE, which can't update a
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 38a0ff9ef06e..344c646e7f06 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -295,7 +295,7 @@ struct pv_mmu_ops {
struct paravirt_callee_save pgd_val;
struct paravirt_callee_save make_pgd;
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
@@ -309,13 +309,13 @@ struct pv_mmu_ops {
struct paravirt_callee_save pmd_val;
struct paravirt_callee_save make_pmd;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
struct paravirt_callee_save pud_val;
struct paravirt_callee_save make_pud;
void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif /* PAGETABLE_LEVELS == 4 */
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
struct pv_lazy_ops lazy_mode;
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c4412e972bbd..bf7f8b55b0f9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
#define pmd_pgtable(pmd) pmd_page(pmd)
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
{
struct page *page;
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
}
#endif /* CONFIG_X86_PAE */
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
{
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
___pud_free_tlb(tlb, pud);
}
-#endif /* PAGETABLE_LEVELS > 3 */
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..392576433e77 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -17,7 +17,6 @@ typedef union {
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD 0
-#define PAGETABLE_LEVELS 2
/*
* traditional i386 two-level paging structure:
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..bcc89625ebe5 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -24,8 +24,6 @@ typedef union {
#define SHARED_KERNEL_PMD 1
#endif
-#define PAGETABLE_LEVELS 3
-
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a0c35bf6cb92..fe57e7a98839 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
return npg >> (20 - PAGE_SHIFT);
}
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
static inline int pud_none(pud_t pud)
{
return native_pud_val(pud) == 0;
@@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud)
{
return 0;
}
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
static inline int pgd_present(pgd_t pgd)
{
return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd)
{
return !native_pgd_val(pgd);
}
-#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 602b6028c5b6..e6844dfb4471 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t;
#endif /* !__ASSEMBLY__ */
#define SHARED_KERNEL_PMD 0
-#define PAGETABLE_LEVELS 4
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8c7c10802e9c..78f0c8cbe316 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
return native_pgd_val(pgd) & PTE_FLAGS_MASK;
}
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
static inline pud_t native_make_pud(pmdval_t val)
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud)
}
#endif
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
typedef struct { pmdval_t pmd; } pmd_t;
static inline pmd_t native_make_pmd(pmdval_t val)
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/pm-trace.h
index 3ff1c2cb1da5..7b7ac42c3661 100644
--- a/arch/x86/include/asm/resume-trace.h
+++ b/arch/x86/include/asm/pm-trace.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_RESUME_TRACE_H
-#define _ASM_X86_RESUME_TRACE_H
+#ifndef _ASM_X86_PM_TRACE_H
+#define _ASM_X86_PM_TRACE_H
#include <asm/asm.h>
@@ -14,8 +14,10 @@ do { \
".previous" \
:"=r" (tracedata) \
: "i" (__LINE__), "i" (__FILE__)); \
- generate_resume_trace(tracedata, user); \
+ generate_pm_trace(tracedata, user); \
} \
} while (0)
-#endif /* _ASM_X86_RESUME_TRACE_H */
+#define TRACE_SUSPEND(user) TRACE_RESUME(user)
+
+#endif /* _ASM_X86_PM_TRACE_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index d2203b5d9538..23ba6765b718 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -109,6 +109,9 @@ struct cpuinfo_x86 {
/* in KB - valid for CPUS which support this call: */
int x86_cache_size;
int x86_cache_alignment; /* In bytes */
+ /* Cache QoS architectural values: */
+ int x86_cache_max_rmid; /* max index */
+ int x86_cache_occ_scale; /* scale to bytes */
int x86_power;
unsigned long loops_per_jiffy;
/* cpuid returned max cores value: */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 25b1cc07d496..d6b078e9fa28 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
struct pvclock_vsyscall_time_info {
struct pvclock_vcpu_time_info pvti;
- u32 migrate_count;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 0f3d7f099224..0c8c7c8861b4 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -1,5 +1,20 @@
+#ifndef _ASM_X86_SECCOMP_H
+#define _ASM_X86_SECCOMP_H
+
+#include <asm/unistd.h>
+
#ifdef CONFIG_X86_32
-# include <asm/seccomp_32.h>
-#else
-# include <asm/seccomp_64.h>
+#define __NR_seccomp_sigreturn __NR_sigreturn
#endif
+
+#ifdef CONFIG_COMPAT
+#include <asm/ia32_unistd.h>
+#define __NR_seccomp_read_32 __NR_ia32_read
+#define __NR_seccomp_write_32 __NR_ia32_write
+#define __NR_seccomp_exit_32 __NR_ia32_exit
+#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
+#endif
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
deleted file mode 100644
index b811d6f5780c..000000000000
--- a/arch/x86/include/asm/seccomp_32.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_32_H
-#define _ASM_X86_SECCOMP_32_H
-
-#include <linux/unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
deleted file mode 100644
index 84ec1bd161a5..000000000000
--- a/arch/x86/include/asm/seccomp_64.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_64_H
-#define _ASM_X86_SECCOMP_64_H
-
-#include <linux/unistd.h>
-#include <asm/ia32_unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#define __NR_seccomp_read_32 __NR_ia32_read
-#define __NR_seccomp_write_32 __NR_ia32_write
-#define __NR_seccomp_exit_32 __NR_ia32_exit
-#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
index 460b84f64556..8378b8c9109c 100644
--- a/arch/x86/include/asm/serial.h
+++ b/arch/x86/include/asm/serial.h
@@ -12,11 +12,11 @@
/* Standard COM flags (except for COM4, because of the 8514 problem) */
#ifdef CONFIG_SERIAL_DETECT_IRQ
-# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
-# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | ASYNC_AUTO_IRQ)
+# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ)
+# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ)
#else
-# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | 0 )
-# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | 0 )
+# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | 0 )
+# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | 0 )
#endif
#define SERIAL_PORT_DFNS \
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81d02fc7dafa..17a8dced12da 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,13 +150,13 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
}
void cpu_disable_common(void);
-void cpu_die_common(unsigned int cpu);
void native_smp_prepare_boot_cpu(void);
void native_smp_prepare_cpus(unsigned int max_cpus);
void native_smp_cpus_done(unsigned int max_cpus);
void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_disable(void);
+int common_cpu_die(unsigned int cpu);
void native_cpu_die(unsigned int cpu);
void native_play_dead(void);
void play_dead_common(void);
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index cf87de3fc390..64b611782ef0 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -169,7 +169,7 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock)
struct __raw_tickets tmp = READ_ONCE(lock->tickets);
tmp.head &= ~TICKET_SLOWPATH_FLAG;
- return (tmp.tail - tmp.head) > TICKET_LOCK_INC;
+ return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
}
#define arch_spin_is_contended arch_spin_is_contended
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ea2dbe82cba3..b4bdec3e9523 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -46,13 +46,11 @@
*/
#ifndef __ASSEMBLY__
struct task_struct;
-struct exec_domain;
#include <asm/processor.h>
#include <linux/atomic.h>
struct thread_info {
struct task_struct *task; /* main task structure */
- struct exec_domain *exec_domain; /* execution domain */
__u32 flags; /* low level flags */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
@@ -66,7 +64,6 @@ struct thread_info {
#define INIT_THREAD_INFO(tsk) \
{ \
.task = &tsk, \
- .exec_domain = &default_exec_domain, \
.flags = 0, \
.cpu = 0, \
.saved_preempt_count = INIT_PREEMPT_COUNT, \
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 358dcd338915..c44a5d53e464 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -269,4 +269,9 @@ static inline bool xen_arch_need_swiotlb(struct device *dev,
return false;
}
+static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order)
+{
+ return __get_free_pages(__GFP_NOWARN, order);
+}
+
#endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index d993e33f5236..960a8a9dc4ab 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -33,6 +33,16 @@
#define E820_NVS 4
#define E820_UNUSABLE 5
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot. The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ *
+ * ( Note that older platforms also used 6 for the same type of memory,
+ * but newer versions switched to 12 as 6 was assigned differently. Some
+ * time they will learn... )
+ */
+#define E820_PRAM 12
/*
* reserved RAM used by kernel itself
diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h
index 90c458e66e13..ce6068dbcfbc 100644
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -225,6 +225,8 @@
#define HV_STATUS_INVALID_HYPERCALL_CODE 2
#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
#define HV_STATUS_INVALID_ALIGNMENT 4
+#define HV_STATUS_INSUFFICIENT_MEMORY 11
+#define HV_STATUS_INVALID_CONNECTION_ID 18
#define HV_STATUS_INSUFFICIENT_BUFFERS 19
typedef struct _HV_REFERENCE_TSC_PAGE {
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 3ce079136c11..c469490db4a8 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -61,6 +61,9 @@
#define MSR_OFFCORE_RSP_1 0x000001a7
#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
#define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae
+#define MSR_TURBO_RATIO_LIMIT 0x000001ad
+#define MSR_TURBO_RATIO_LIMIT1 0x000001ae
+#define MSR_TURBO_RATIO_LIMIT2 0x000001af
#define MSR_LBR_SELECT 0x000001c8
#define MSR_LBR_TOS 0x000001c9
@@ -74,6 +77,24 @@
#define MSR_IA32_PERF_CAPABILITIES 0x00000345
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
+#define MSR_IA32_RTIT_CTL 0x00000570
+#define RTIT_CTL_TRACEEN BIT(0)
+#define RTIT_CTL_OS BIT(2)
+#define RTIT_CTL_USR BIT(3)
+#define RTIT_CTL_CR3EN BIT(7)
+#define RTIT_CTL_TOPA BIT(8)
+#define RTIT_CTL_TSC_EN BIT(10)
+#define RTIT_CTL_DISRETC BIT(11)
+#define RTIT_CTL_BRANCH_EN BIT(13)
+#define MSR_IA32_RTIT_STATUS 0x00000571
+#define RTIT_STATUS_CONTEXTEN BIT(1)
+#define RTIT_STATUS_TRIGGEREN BIT(2)
+#define RTIT_STATUS_ERROR BIT(4)
+#define RTIT_STATUS_STOPPED BIT(5)
+#define MSR_IA32_RTIT_CR3_MATCH 0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE 0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK 0x00000561
+
#define MSR_MTRRfix64K_00000 0x00000250
#define MSR_MTRRfix16K_80000 0x00000258
#define MSR_MTRRfix16K_A0000 0x00000259
@@ -147,6 +168,11 @@
#define MSR_PP1_ENERGY_STATUS 0x00000641
#define MSR_PP1_POLICY 0x00000642
+#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658
+#define MSR_PKG_ANY_CORE_C0_RES 0x00000659
+#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A
+#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B
+
#define MSR_CORE_C1_RES 0x00000660
#define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index c887cd944f0c..9bcd0b56ca17 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -95,6 +95,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o
obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 803b684676ff..dbe76a14c3c9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -757,7 +757,7 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
}
/* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu)
+int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
{
return _acpi_map_lsapic(handle, physid, pcpu);
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index d9d0bd2faaf4..ab3219b3fbda 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -171,8 +171,8 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
for_each_online_cpu(cpu) {
if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
continue;
- __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
- __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
}
free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
free_cpumask_var(per_cpu(ipi_mask, this_cpu));
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..9bff68798836 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,8 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
endif
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_rapl.o perf_event_intel_cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_pt.o perf_event_intel_bts.o
obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE) += perf_event_intel_uncore.o \
perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fd470ebf924e..e4cf63301ff4 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c)
if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
+
+ /* AMD CPUs don't reset SS attributes on SYSRET */
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3f70538012e2..a62cf04dac8a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[10] = eax;
}
+ /* Additional Intel-defined flags: level 0x0000000F */
+ if (c->cpuid_level >= 0x0000000F) {
+ u32 eax, ebx, ecx, edx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=0 */
+ cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[11] = edx;
+ if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+ /* will be overridden if occupancy monitoring exists */
+ c->x86_cache_max_rmid = ebx;
+
+ /* QoS sub-leaf, EAX=0Fh, ECX=1 */
+ cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[12] = edx;
+ if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+ c->x86_cache_max_rmid = ecx;
+ c->x86_cache_occ_scale = ebx;
+ }
+ } else {
+ c->x86_cache_max_rmid = -1;
+ c->x86_cache_occ_scale = -1;
+ }
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
@@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
detect_nopl(c);
}
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+ /*
+ * The heavy lifting of max_rmid and cache_occ_scale are handled
+ * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
+ * in case CQM bits really aren't there in this CPU.
+ */
+ if (c != &boot_cpu_data) {
+ boot_cpu_data.x86_cache_max_rmid =
+ min(boot_cpu_data.x86_cache_max_rmid,
+ c->x86_cache_max_rmid);
+ }
+}
+
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
@@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
init_hypervisor(c);
x86_init_rdrand(c);
+ x86_init_cache_qos(c);
/*
* Clear/Set all flags overriden by options, need do it
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 36ce402a3fa5..d820d8eae96b 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -27,8 +27,8 @@
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
-#ifdef CONFIG_XEN_PVHVM
- &x86_hyper_xen_hvm,
+#ifdef CONFIG_XEN
+ &x86_hyper_xen,
#endif
&x86_hyper_vmware,
&x86_hyper_ms_hyperv,
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+/*
+ * Table of Physical Addresses bits
+ */
+enum topa_sz {
+ TOPA_4K = 0,
+ TOPA_8K,
+ TOPA_16K,
+ TOPA_32K,
+ TOPA_64K,
+ TOPA_128K,
+ TOPA_256K,
+ TOPA_512K,
+ TOPA_1MB,
+ TOPA_2MB,
+ TOPA_4MB,
+ TOPA_8MB,
+ TOPA_16MB,
+ TOPA_32MB,
+ TOPA_64MB,
+ TOPA_128MB,
+ TOPA_SZ_END,
+};
+
+static inline unsigned int sizes(enum topa_sz tsz)
+{
+ return 1 << (tsz + 12);
+};
+
+struct topa_entry {
+ u64 end : 1;
+ u64 rsvd0 : 1;
+ u64 intr : 1;
+ u64 rsvd1 : 1;
+ u64 stop : 1;
+ u64 rsvd2 : 1;
+ u64 size : 4;
+ u64 rsvd3 : 2;
+ u64 base : 36;
+ u64 rsvd4 : 16;
+};
+
+#define TOPA_SHIFT 12
+#define PT_CPUID_LEAVES 2
+
+enum pt_capabilities {
+ PT_CAP_max_subleaf = 0,
+ PT_CAP_cr3_filtering,
+ PT_CAP_topa_output,
+ PT_CAP_topa_multiple_entries,
+ PT_CAP_payloads_lip,
+};
+
+struct pt_pmu {
+ struct pmu pmu;
+ u32 caps[4 * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ * cpu, depending on perf event configuration
+ * @cpu: cpu for per-cpu allocation
+ * @tables: list of ToPA tables in this buffer
+ * @first: shorthand for first topa table
+ * @last: shorthand for last topa table
+ * @cur: current topa table
+ * @nr_pages: buffer size in pages
+ * @cur_idx: current output region's index within @cur table
+ * @output_off: offset within the current output region
+ * @data_size: running total of the amount of data in this buffer
+ * @lost: if data was lost/truncated
+ * @head: logical write offset inside the buffer
+ * @snapshot: if this is for a snapshot/overwrite counter
+ * @stop_pos: STOP topa entry in the buffer
+ * @intr_pos: INT topa entry in the buffer
+ * @data_pages: array of pages from perf
+ * @topa_index: table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+ int cpu;
+ struct list_head tables;
+ struct topa *first, *last, *cur;
+ unsigned int cur_idx;
+ size_t output_off;
+ unsigned long nr_pages;
+ local_t data_size;
+ local_t lost;
+ local64_t head;
+ bool snapshot;
+ unsigned long stop_pos, intr_pos;
+ void **data_pages;
+ struct topa_entry *topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle: perf output handle
+ * @handle_nmi: do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+ struct perf_output_handle handle;
+ int handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..d76f13d6d8d6 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = {
static int mtrr_seq_show(struct seq_file *seq, void *offset)
{
char factor;
- int i, max, len;
+ int i, max;
mtrr_type type;
unsigned long base, size;
- len = 0;
max = num_var_ranges;
for (i = 0; i < max; i++) {
mtrr_if->get(i, &base, &size, &type);
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
size >>= 20 - PAGE_SHIFT;
}
/* Base can be > 32bit */
- len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
- "(%5luMB), size=%5lu%cB, count=%d: %s\n",
- i, base, base >> (20 - PAGE_SHIFT), size,
- factor, mtrr_usage_table[i],
- mtrr_attrib_to_str(type));
+ seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+ i, base, base >> (20 - PAGE_SHIFT),
+ size, factor,
+ mtrr_usage_table[i], mtrr_attrib_to_str(type));
}
return 0;
}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e2888a3ad1e3..87848ebe2bb7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event)
}
}
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+ hw_perf_event_destroy(event);
+
+ /* undo the lbr/bts event accounting */
+ x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
static inline int x86_pmu_initialized(void)
{
return x86_pmu.handle_irq != NULL;
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return x86_pmu_extra_regs(val, event);
}
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+ int ret = -EBUSY, i;
+
+ if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
+ return 0;
+
+ mutex_lock(&pmc_reserve_mutex);
+ for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+ if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+ goto out;
+
+ atomic_inc(&x86_pmu.lbr_exclusive[what]);
+ ret = 0;
+
+out:
+ mutex_unlock(&pmc_reserve_mutex);
+ return ret;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+ atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
int x86_setup_perfctr(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event)
/* BTS is currently only allowed for user-mode. */
if (!attr->exclude_kernel)
return -EOPNOTSUPP;
+
+ /* disallow bts if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
}
hwc->config |= config;
@@ -399,39 +442,41 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip > precise)
return -EOPNOTSUPP;
- /*
- * check that PEBS LBR correction does not conflict with
- * whatever the user is asking with attr->branch_sample_type
- */
- if (event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2) {
- u64 *br_type = &event->attr.branch_sample_type;
-
- if (has_branch_stack(event)) {
- if (!precise_br_compat(event))
- return -EOPNOTSUPP;
-
- /* branch_sample_type is compatible */
-
- } else {
- /*
- * user did not specify branch_sample_type
- *
- * For PEBS fixups, we capture all
- * the branches at the priv level of the
- * event.
- */
- *br_type = PERF_SAMPLE_BRANCH_ANY;
-
- if (!event->attr.exclude_user)
- *br_type |= PERF_SAMPLE_BRANCH_USER;
-
- if (!event->attr.exclude_kernel)
- *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
- }
+ }
+ /*
+ * check that PEBS LBR correction does not conflict with
+ * whatever the user is asking with attr->branch_sample_type
+ */
+ if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+ u64 *br_type = &event->attr.branch_sample_type;
+
+ if (has_branch_stack(event)) {
+ if (!precise_br_compat(event))
+ return -EOPNOTSUPP;
+
+ /* branch_sample_type is compatible */
+
+ } else {
+ /*
+ * user did not specify branch_sample_type
+ *
+ * For PEBS fixups, we capture all
+ * the branches at the priv level of the
+ * event.
+ */
+ *br_type = PERF_SAMPLE_BRANCH_ANY;
+
+ if (!event->attr.exclude_user)
+ *br_type |= PERF_SAMPLE_BRANCH_USER;
+
+ if (!event->attr.exclude_kernel)
+ *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
}
}
+ if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+ event->attach_state |= PERF_ATTACH_TASK_DATA;
+
/*
* Generate PMC IRQs:
* (keep 'enabled' bit clear for now)
@@ -449,6 +494,12 @@ int x86_pmu_hw_config(struct perf_event *event)
if (event->attr.type == PERF_TYPE_RAW)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+ if (event->attr.sample_period && x86_pmu.limit_period) {
+ if (x86_pmu.limit_period(event, event->attr.sample_period) >
+ event->attr.sample_period)
+ return -EINVAL;
+ }
+
return x86_setup_perfctr(event);
}
@@ -728,14 +779,17 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
struct event_constraint *c;
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
struct perf_event *e;
- int i, wmin, wmax, num = 0;
+ int i, wmin, wmax, unsched = 0;
struct hw_perf_event *hwc;
bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ if (x86_pmu.start_scheduling)
+ x86_pmu.start_scheduling(cpuc);
+
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
hwc = &cpuc->event_list[i]->hw;
- c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+ c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
hwc->constraint = c;
wmin = min(wmin, c->weight);
@@ -768,24 +822,30 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* slow path */
if (i != n)
- num = perf_assign_events(cpuc->event_list, n, wmin,
- wmax, assign);
+ unsched = perf_assign_events(cpuc->event_list, n, wmin,
+ wmax, assign);
/*
- * Mark the event as committed, so we do not put_constraint()
- * in case new events are added and fail scheduling.
+ * In case of success (unsched = 0), mark events as committed,
+ * so we do not put_constraint() in case new events are added
+ * and fail to be scheduled
+ *
+ * We invoke the lower level commit callback to lock the resource
+ *
+ * We do not need to do all of this in case we are called to
+ * validate an event group (assign == NULL)
*/
- if (!num && assign) {
+ if (!unsched && assign) {
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+ if (x86_pmu.commit_scheduling)
+ x86_pmu.commit_scheduling(cpuc, e, assign[i]);
}
}
- /*
- * scheduling failed or is just a simulation,
- * free resources if necessary
- */
- if (!assign || num) {
+
+ if (!assign || unsched) {
+
for (i = 0; i < n; i++) {
e = cpuc->event_list[i];
/*
@@ -795,11 +855,18 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
continue;
+ /*
+ * release events that failed scheduling
+ */
if (x86_pmu.put_event_constraints)
x86_pmu.put_event_constraints(cpuc, e);
}
}
- return num ? -EINVAL : 0;
+
+ if (x86_pmu.stop_scheduling)
+ x86_pmu.stop_scheduling(cpuc);
+
+ return unsched ? -EINVAL : 0;
}
/*
@@ -986,6 +1053,9 @@ int x86_perf_event_set_period(struct perf_event *event)
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
+ if (x86_pmu.limit_period)
+ left = x86_pmu.limit_period(event, left);
+
per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
@@ -1033,7 +1103,6 @@ static int x86_pmu_add(struct perf_event *event, int flags)
hwc = &event->hw;
- perf_pmu_disable(event->pmu);
n0 = cpuc->n_events;
ret = n = collect_events(cpuc, event, false);
if (ret < 0)
@@ -1071,7 +1140,6 @@ done_collect:
ret = 0;
out:
- perf_pmu_enable(event->pmu);
return ret;
}
@@ -1103,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
- u64 pebs;
+ u64 pebs, debugctl;
struct cpu_hw_events *cpuc;
unsigned long flags;
int cpu, idx;
@@ -1121,14 +1189,20 @@ void perf_event_print_debug(void)
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
- rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("\n");
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
- pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ if (x86_pmu.pebs_constraints) {
+ rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
+ }
+ if (x86_pmu.lbr_nr) {
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ pr_info("CPU#%d: debugctl: %016llx\n", cpu, debugctl);
+ }
}
pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
@@ -1321,11 +1395,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- int ret = NOTIFY_OK;
+ int i, ret = NOTIFY_OK;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- cpuc->kfree_on_online = NULL;
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+ cpuc->kfree_on_online[i] = NULL;
if (x86_pmu.cpu_prepare)
ret = x86_pmu.cpu_prepare(cpu);
break;
@@ -1336,7 +1411,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
break;
case CPU_ONLINE:
- kfree(cpuc->kfree_on_online);
+ for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+ kfree(cpuc->kfree_on_online[i]);
+ cpuc->kfree_on_online[i] = NULL;
+ }
break;
case CPU_DYING:
@@ -1712,7 +1790,7 @@ static int validate_event(struct perf_event *event)
if (IS_ERR(fake_cpuc))
return PTR_ERR(fake_cpuc);
- c = x86_pmu.get_event_constraints(fake_cpuc, event);
+ c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
if (!c || !c->weight)
ret = -EINVAL;
@@ -1914,10 +1992,10 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
NULL,
};
-static void x86_pmu_flush_branch_stack(void)
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
- if (x86_pmu.flush_branch_stack)
- x86_pmu.flush_branch_stack();
+ if (x86_pmu.sched_task)
+ x86_pmu.sched_task(ctx, sched_in);
}
void perf_check_microcode(void)
@@ -1949,7 +2027,8 @@ static struct pmu pmu = {
.commit_txn = x86_pmu_commit_txn,
.event_idx = x86_pmu_event_idx,
- .flush_branch_stack = x86_pmu_flush_branch_stack,
+ .sched_task = x86_pmu_sched_task,
+ .task_ctx_size = sizeof(struct x86_perf_task_context),
};
void arch_perf_update_userpage(struct perf_event *event,
@@ -1968,13 +2047,23 @@ void arch_perf_update_userpage(struct perf_event *event,
data = cyc2ns_read_begin();
+ /*
+ * Internal timekeeping for enabled/running/stopped times
+ * is always in the local_clock domain.
+ */
userpg->cap_user_time = 1;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
userpg->time_offset = data->cyc2ns_offset - now;
- userpg->cap_user_time_zero = 1;
- userpg->time_zero = data->cyc2ns_offset;
+ /*
+ * cap_user_time_zero doesn't make sense when we're using a different
+ * time base for the records.
+ */
+ if (event->clock == &local_clock) {
+ userpg->cap_user_time_zero = 1;
+ userpg->time_zero = data->cyc2ns_offset;
+ }
cyc2ns_read_end(data);
}
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index df525d2be1e8..6ac5cb7a9e14 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -65,13 +65,15 @@ struct event_constraint {
/*
* struct hw_perf_event.flags flags
*/
-#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */
+#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */
struct amd_nb {
@@ -123,8 +125,37 @@ struct intel_shared_regs {
unsigned core_id; /* per-core: core id */
};
+enum intel_excl_state_type {
+ INTEL_EXCL_UNUSED = 0, /* counter is unused */
+ INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */
+ INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+ enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
+ enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+ int num_alloc_cntrs;/* #counters allocated */
+ int max_alloc_cntrs;/* max #counters allowed */
+ bool sched_started; /* true if scheduling has started */
+};
+
+struct intel_excl_cntrs {
+ raw_spinlock_t lock;
+
+ struct intel_excl_states states[2];
+
+ int refcnt; /* per-core: #HT threads */
+ unsigned core_id; /* per-core: core id */
+};
+
#define MAX_LBR_ENTRIES 16
+enum {
+ X86_PERF_KFREE_SHARED = 0,
+ X86_PERF_KFREE_EXCL = 1,
+ X86_PERF_KFREE_MAX
+};
+
struct cpu_hw_events {
/*
* Generic x86 PMC bits
@@ -179,6 +210,12 @@ struct cpu_hw_events {
* used on Intel NHM/WSM/SNB
*/
struct intel_shared_regs *shared_regs;
+ /*
+ * manage exclusive counter access between hyperthread
+ */
+ struct event_constraint *constraint_list; /* in enable order */
+ struct intel_excl_cntrs *excl_cntrs;
+ int excl_thread_id; /* 0 or 1 */
/*
* AMD specific bits
@@ -187,7 +224,7 @@ struct cpu_hw_events {
/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
u64 perf_ctr_virt_mask;
- void *kfree_on_online;
+ void *kfree_on_online[X86_PERF_KFREE_MAX];
};
#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
@@ -202,6 +239,10 @@ struct cpu_hw_events {
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
+#define INTEL_EXCLEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+ 0, PERF_X86_EVENT_EXCL)
+
/*
* The overlap flag marks event constraints with overlapping counter
* masks. This is the case if the counter mask of such an event is not
@@ -259,6 +300,10 @@ struct cpu_hw_events {
#define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n) \
+ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+ HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
#define INTEL_PLD_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
@@ -283,22 +328,40 @@ struct cpu_hw_events {
/* Check flags and event code, and set the HSW load flag */
#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
- __EVENT_CONSTRAINT(code, n, \
+ __EVENT_CONSTRAINT(code, n, \
ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW store flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW load flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
__EVENT_CONSTRAINT(code, n, \
INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+ __EVENT_CONSTRAINT(code, n, \
+ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+ HWEIGHT(n), 0, \
+ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
/* Check flags and event code/umask, and set the HSW N/A flag */
#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
__EVENT_CONSTRAINT(code, n, \
@@ -408,6 +471,13 @@ union x86_pmu_config {
#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
+enum {
+ x86_lbr_exclusive_lbr,
+ x86_lbr_exclusive_bts,
+ x86_lbr_exclusive_pt,
+ x86_lbr_exclusive_max,
+};
+
/*
* struct x86_pmu - generic x86 pmu
*/
@@ -443,14 +513,25 @@ struct x86_pmu {
u64 max_period;
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
+ int idx,
struct perf_event *event);
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
+
+ void (*commit_scheduling)(struct cpu_hw_events *cpuc,
+ struct perf_event *event,
+ int cntr);
+
+ void (*start_scheduling)(struct cpu_hw_events *cpuc);
+
+ void (*stop_scheduling)(struct cpu_hw_events *cpuc);
+
struct event_constraint *event_constraints;
struct x86_pmu_quirk *quirks;
int perfctr_second_write;
bool late_ack;
+ unsigned (*limit_period)(struct perf_event *event, unsigned l);
/*
* sysfs attrs
@@ -472,7 +553,8 @@ struct x86_pmu {
void (*cpu_dead)(int cpu);
void (*check_microcode)(void);
- void (*flush_branch_stack)(void);
+ void (*sched_task)(struct perf_event_context *ctx,
+ bool sched_in);
/*
* Intel Arch Perfmon v2+
@@ -504,10 +586,15 @@ struct x86_pmu {
bool lbr_double_abort; /* duplicated lbr aborts */
/*
+ * Intel PT/LBR/BTS are exclusive
+ */
+ atomic_t lbr_exclusive[x86_lbr_exclusive_max];
+
+ /*
* Extra registers for events
*/
struct extra_reg *extra_regs;
- unsigned int er_flags;
+ unsigned int flags;
/*
* Intel host/guest support (KVM)
@@ -515,6 +602,13 @@ struct x86_pmu {
struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
};
+struct x86_perf_task_context {
+ u64 lbr_from[MAX_LBR_ENTRIES];
+ u64 lbr_to[MAX_LBR_ENTRIES];
+ int lbr_callstack_users;
+ int lbr_stack_state;
+};
+
#define x86_add_quirk(func_) \
do { \
static struct x86_pmu_quirk __quirk __initdata = { \
@@ -524,8 +618,13 @@ do { \
x86_pmu.quirks = &__quirk; \
} while (0)
-#define ERF_NO_HT_SHARING 1
-#define ERF_HAS_RSP_1 2
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */
+#define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */
+#define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */
#define EVENT_VAR(_id) event_attr_##_id
#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -546,6 +645,12 @@ static struct perf_pmu_events_attr event_attr_##v = { \
extern struct x86_pmu x86_pmu __read_mostly;
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+ return x86_pmu.lbr_sel_map &&
+ x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
int x86_perf_event_set_period(struct perf_event *event);
@@ -588,6 +693,12 @@ static inline int x86_pmu_rdpmc_index(int index)
return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
}
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
int x86_setup_perfctr(struct perf_event *event);
int x86_pmu_hw_config(struct perf_event *event);
@@ -674,10 +785,34 @@ static inline int amd_pmu_init(void)
#ifdef CONFIG_CPU_SUP_INTEL
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+ /* user explicitly requested branch sampling */
+ if (has_branch_stack(event))
+ return true;
+
+ /* implicit branch sampling to correct PEBS skid */
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+ x86_pmu.intel_cap.pebs_format < 2)
+ return true;
+
+ return false;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+ if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+ !event->attr.freq && event->hw.sample_period == 1)
+ return true;
+
+ return false;
+}
+
int intel_pmu_save_and_restart(struct perf_event *event);
struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event);
struct intel_shared_regs *allocate_shared_regs(int cpu);
@@ -727,13 +862,15 @@ void intel_pmu_pebs_disable_all(void);
void intel_ds_init(void);
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
void intel_pmu_lbr_reset(void);
void intel_pmu_lbr_enable(struct perf_event *event);
void intel_pmu_lbr_disable(struct perf_event *event);
-void intel_pmu_lbr_enable_all(void);
+void intel_pmu_lbr_enable_all(bool pmi);
void intel_pmu_lbr_disable_all(void);
@@ -747,8 +884,18 @@ void intel_pmu_lbr_init_atom(void);
void intel_pmu_lbr_init_snb(void);
+void intel_pmu_lbr_init_hsw(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
+void intel_pt_interrupt(void);
+
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
int p4_pmu_init(void);
int p6_pmu_init(void);
@@ -758,6 +905,10 @@ int knc_pmu_init(void);
ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
char *page);
+static inline int is_ht_workaround_enabled(void)
+{
+ return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
#else /* CONFIG_CPU_SUP_INTEL */
static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 28926311aac1..1cee5d2d7ece 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu)
static void amd_pmu_cpu_starting(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
struct amd_nb *nb;
int i, nb_id;
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu)
continue;
if (nb->nb_id == nb_id) {
- cpuc->kfree_on_online = cpuc->amd_nb;
+ *onln = cpuc->amd_nb;
cpuc->amd_nb = nb;
break;
}
@@ -429,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu)
}
static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
/*
* if not NB event or no NB, then no constraints
@@ -537,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
unsigned int event_code = amd_get_event_code(hwc);
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a61f5c6911da..989d3c215d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
* the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
* is using the new offset.
*/
-static int force_ibs_eilvt_setup(void)
+static void force_ibs_eilvt_setup(void)
{
int offset;
int ret;
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void)
if (offset == APIC_EILVT_NR_MAX) {
printk(KERN_DEBUG "No EILVT entry available\n");
- return -EBUSY;
+ return;
}
ret = setup_ibs_ctl(offset);
if (ret)
goto out;
- if (!ibs_eilvt_valid()) {
- ret = -EFAULT;
+ if (!ibs_eilvt_valid())
goto out;
- }
pr_info("IBS: LVT offset %d assigned\n", offset);
- return 0;
+ return;
out:
preempt_disable();
put_eilvt(offset);
preempt_enable();
- return ret;
+ return;
}
static void ibs_eilvt_setup(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 258990688a5e..960e85de13fb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/watchdog.h>
#include <asm/cpufeature.h>
#include <asm/hardirq.h>
@@ -113,6 +114,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
EVENT_CONSTRAINT_END
};
@@ -131,15 +138,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
- /*
- * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
- * siblings; disable these events because they can corrupt unrelated
- * counters.
- */
- INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
- INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
EVENT_CONSTRAINT_END
};
@@ -217,6 +221,21 @@ static struct event_constraint intel_hsw_event_constraints[] = {
INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+ EVENT_CONSTRAINT_END
+};
+
+struct event_constraint intel_bdw_event_constraints[] = {
+ FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
+ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
+ FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+ INTEL_UEVENT_CONSTRAINT(0x148, 0x4), /* L1D_PEND_MISS.PENDING */
+ INTEL_EVENT_CONSTRAINT(0xa3, 0x4), /* CYCLE_ACTIVITY.* */
EVENT_CONSTRAINT_END
};
@@ -415,6 +434,202 @@ static __initconst const u64 snb_hw_cache_event_ids
};
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ * reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD BIT_ULL(0)
+#define HSW_DEMAND_RFO BIT_ULL(1)
+#define HSW_ANY_RESPONSE BIT_ULL(16)
+#define HSW_SUPPLIER_NONE BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0 BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1 BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P BIT_ULL(29)
+#define HSW_L3_MISS (HSW_L3_MISS_LOCAL_DRAM| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED BIT_ULL(32)
+#define HSW_SNOOP_MISS BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD BIT_ULL(35)
+#define HSW_SNOOP_HITM BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM BIT_ULL(37)
+#define HSW_ANY_SNOOP (HSW_SNOOP_NONE| \
+ HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+ HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+ HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM (HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE (HSW_L3_MISS_REMOTE_HOP0|\
+ HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS HSW_ANY_RESPONSE
+
+#define BDW_L3_MISS_LOCAL BIT(26)
+#define BDW_L3_MISS (BDW_L3_MISS_LOCAL| \
+ HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+ HSW_L3_MISS_REMOTE_HOP2P)
+
+
+static __initconst const u64 hsw_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x151, /* L1D.REPLACEMENT */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(L1I ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x280, /* ICACHE.MISSES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOPS_RETIRED.ALL_LOADS */
+ [ C(RESULT_MISS) ] = 0x108, /* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOPS_RETIRED.ALL_STORES */
+ [ C(RESULT_MISS) ] = 0x149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x6085, /* ITLB_MISSES.STLB_HIT */
+ [ C(RESULT_MISS) ] = 0x185, /* ITLB_MISSES.MISS_CAUSES_A_WALK */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(BPU ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0xc4, /* BR_INST_RETIRED.ALL_BRANCHES */
+ [ C(RESULT_MISS) ] = 0xc5, /* BR_MISP_RETIRED.ALL_BRANCHES */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ [ C(RESULT_MISS) ] = 0x1b7, /* OFFCORE_RESPONSE */
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_LLC_ACCESS,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS|HSW_ANY_SNOOP,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+ [ C(NODE) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_READ|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_LOCAL_DRAM|
+ HSW_SNOOP_DRAM,
+ [ C(RESULT_MISS) ] = HSW_DEMAND_WRITE|
+ HSW_L3_MISS_REMOTE|
+ HSW_SNOOP_DRAM,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = 0x0,
+ },
+ },
+};
+
static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -1029,21 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids
},
};
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
- /* user explicitly requested branch sampling */
- if (has_branch_stack(event))
- return true;
-
- /* implicit branch sampling to correct PEBS skid */
- if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
- x86_pmu.intel_cap.pebs_format < 2)
- return true;
-
- return false;
-}
-
-static void intel_pmu_disable_all(void)
+/*
+ * Use from PMIs where the LBRs are already disabled.
+ */
+static void __intel_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1051,17 +1255,24 @@ static void intel_pmu_disable_all(void)
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
+ else
+ intel_bts_disable_local();
intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+ __intel_pmu_disable_all();
intel_pmu_lbr_disable_all();
}
-static void intel_pmu_enable_all(int added)
+static void __intel_pmu_enable_all(int added, bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
intel_pmu_pebs_enable_all();
- intel_pmu_lbr_enable_all();
+ intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
@@ -1073,7 +1284,13 @@ static void intel_pmu_enable_all(int added)
return;
intel_pmu_enable_bts(event->hw.config);
- }
+ } else
+ intel_bts_enable_local();
+}
+
+static void intel_pmu_enable_all(int added)
+{
+ __intel_pmu_enable_all(added, false);
}
/*
@@ -1207,7 +1424,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
* must disable before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_disable(event);
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1485,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
* must enabled before any actual event
* because any event may be combined with LBR
*/
- if (intel_pmu_needs_lbr_smpl(event))
+ if (needs_branch_stack(event))
intel_pmu_lbr_enable(event);
if (event->attr.exclude_host)
@@ -1334,6 +1551,18 @@ static void intel_pmu_reset(void)
if (ds)
ds->bts_index = ds->bts_buffer_base;
+ /* Ack all overflows and disable fixed counters */
+ if (x86_pmu.version >= 2) {
+ intel_pmu_ack_status(intel_pmu_get_status());
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+ }
+
+ /* Reset LBRs and LBR freezing */
+ if (x86_pmu.lbr_nr) {
+ update_debugctlmsr(get_debugctlmsr() &
+ ~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+ }
+
local_irq_restore(flags);
}
@@ -1357,8 +1586,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
*/
if (!x86_pmu.late_ack)
apic_write(APIC_LVTPC, APIC_DM_NMI);
- intel_pmu_disable_all();
+ __intel_pmu_disable_all();
handled = intel_pmu_drain_bts_buffer();
+ handled += intel_bts_interrupt();
status = intel_pmu_get_status();
if (!status)
goto done;
@@ -1399,6 +1629,14 @@ again:
}
/*
+ * Intel PT
+ */
+ if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+ handled++;
+ intel_pt_interrupt();
+ }
+
+ /*
* Checkpointed counters can lead to 'spurious' PMIs because the
* rollback caused by the PMI will have cleared the overflow status
* bit. Therefore always force probe these counters.
@@ -1433,7 +1671,7 @@ again:
goto again;
done:
- intel_pmu_enable_all(0);
+ __intel_pmu_enable_all(0, true);
/*
* Only unmask the NMI after the overflow counters
* have been reset. This avoids spurious NMIs on
@@ -1464,7 +1702,7 @@ intel_bts_constraints(struct perf_event *event)
static int intel_alt_er(int idx)
{
- if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+ if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
return idx;
if (idx == EXTRA_REG_RSP_0)
@@ -1624,7 +1862,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
}
struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct event_constraint *c;
@@ -1641,7 +1880,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
}
static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
struct event_constraint *c;
@@ -1657,7 +1897,278 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
if (c)
return c;
- return x86_get_event_constraints(cpuc, event);
+ return x86_get_event_constraints(cpuc, idx, event);
+}
+
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* sibling thread */
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return;
+
+ xlo = &excl_cntrs->states[o_tid];
+ xl = &excl_cntrs->states[tid];
+
+ xl->sched_started = true;
+ xl->num_alloc_cntrs = 0;
+ /*
+ * lock shared state until we are done scheduling
+ * in stop_event_scheduling()
+ * makes scheduling appear as a transaction
+ */
+ WARN_ON_ONCE(!irqs_disabled());
+ raw_spin_lock(&excl_cntrs->lock);
+
+ /*
+ * save initial state of sibling thread
+ */
+ memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* sibling thread */
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return;
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return;
+
+ xlo = &excl_cntrs->states[o_tid];
+ xl = &excl_cntrs->states[tid];
+
+ /*
+ * make new sibling thread state visible
+ */
+ memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+ xl->sched_started = false;
+ /*
+ * release shared state lock (acquired in intel_start_scheduling())
+ */
+ raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+ int idx, struct event_constraint *c)
+{
+ struct event_constraint *cx;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xl, *xlo;
+ int is_excl, i;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid; /* alternate */
+
+ /*
+ * validating a group does not require
+ * enforcing cross-thread exclusion
+ */
+ if (cpuc->is_fake || !is_ht_workaround_enabled())
+ return c;
+
+ /*
+ * no exclusion needed
+ */
+ if (!excl_cntrs)
+ return c;
+ /*
+ * event requires exclusive counter access
+ * across HT threads
+ */
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ /*
+ * xl = state of current HT
+ * xlo = state of sibling HT
+ */
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ /*
+ * do not allow scheduling of more than max_alloc_cntrs
+ * which is set to half the available generic counters.
+ * this helps avoid counter starvation of sibling thread
+ * by ensuring at most half the counters cannot be in
+ * exclusive mode. There is not designated counters for the
+ * limits. Any N/2 counters can be used. This helps with
+ * events with specifix counter constraints
+ */
+ if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
+ return &emptyconstraint;
+
+ cx = c;
+
+ /*
+ * because we modify the constraint, we need
+ * to make a copy. Static constraints come
+ * from static const tables.
+ *
+ * only needed when constraint has not yet
+ * been cloned (marked dynamic)
+ */
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+ /* sanity check */
+ if (idx < 0)
+ return &emptyconstraint;
+
+ /*
+ * grab pre-allocated constraint entry
+ */
+ cx = &cpuc->constraint_list[idx];
+
+ /*
+ * initialize dynamic constraint
+ * with static constraint
+ */
+ memcpy(cx, c, sizeof(*cx));
+
+ /*
+ * mark constraint as dynamic, so we
+ * can free it later on
+ */
+ cx->flags |= PERF_X86_EVENT_DYNAMIC;
+ }
+
+ /*
+ * From here on, the constraint is dynamic.
+ * Either it was just allocated above, or it
+ * was allocated during a earlier invocation
+ * of this function
+ */
+
+ /*
+ * Modify static constraint with current dynamic
+ * state of thread
+ *
+ * EXCLUSIVE: sibling counter measuring exclusive event
+ * SHARED : sibling counter measuring non-exclusive event
+ * UNUSED : sibling counter unused
+ */
+ for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+ /*
+ * exclusive event in sibling counter
+ * our corresponding counter cannot be used
+ * regardless of our event
+ */
+ if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+ __clear_bit(i, cx->idxmsk);
+ /*
+ * if measuring an exclusive event, sibling
+ * measuring non-exclusive, then counter cannot
+ * be used
+ */
+ if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+ __clear_bit(i, cx->idxmsk);
+ }
+
+ /*
+ * recompute actual bit weight for scheduling algorithm
+ */
+ cx->weight = hweight64(cx->idxmsk64);
+
+ /*
+ * if we return an empty mask, then switch
+ * back to static empty constraint to avoid
+ * the cost of freeing later on
+ */
+ if (cx->weight == 0)
+ cx = &emptyconstraint;
+
+ return cx;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+ struct event_constraint *c1 = event->hw.constraint;
+ struct event_constraint *c2;
+
+ /*
+ * first time only
+ * - static constraint: no change across incremental scheduling calls
+ * - dynamic constraint: handled by intel_get_excl_constraints()
+ */
+ c2 = __intel_get_event_constraints(cpuc, idx, event);
+ if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+ bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+ c1->weight = c2->weight;
+ c2 = c1;
+ }
+
+ if (cpuc->excl_cntrs)
+ return intel_get_excl_constraints(cpuc, event, idx, c2);
+
+ return c2;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+ struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct intel_excl_states *xlo, *xl;
+ unsigned long flags = 0; /* keep compiler happy */
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid;
+
+ /*
+ * nothing needed if in group validation mode
+ */
+ if (cpuc->is_fake)
+ return;
+
+ WARN_ON_ONCE(!excl_cntrs);
+
+ if (!excl_cntrs)
+ return;
+
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ /*
+ * put_constraint may be called from x86_schedule_events()
+ * which already has the lock held so here make locking
+ * conditional
+ */
+ if (!xl->sched_started)
+ raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+ /*
+ * if event was actually assigned, then mark the
+ * counter state as unused now
+ */
+ if (hwc->idx >= 0)
+ xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+ if (!xl->sched_started)
+ raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
}
static void
@@ -1678,7 +2189,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
struct perf_event *event)
{
+ struct event_constraint *c = event->hw.constraint;
+
intel_put_shared_regs_event_constraints(cpuc, event);
+
+ /*
+ * is PMU has exclusive counter restrictions, then
+ * all events are subject to and must call the
+ * put_excl_constraints() routine
+ */
+ if (c && cpuc->excl_cntrs)
+ intel_put_excl_constraints(cpuc, event);
+
+ /* cleanup dynamic constraint */
+ if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+ event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+ struct perf_event *event, int cntr)
+{
+ struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+ struct event_constraint *c = event->hw.constraint;
+ struct intel_excl_states *xlo, *xl;
+ int tid = cpuc->excl_thread_id;
+ int o_tid = 1 - tid;
+ int is_excl;
+
+ if (cpuc->is_fake || !c)
+ return;
+
+ is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+ if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+ return;
+
+ WARN_ON_ONCE(!excl_cntrs);
+
+ if (!excl_cntrs)
+ return;
+
+ xl = &excl_cntrs->states[tid];
+ xlo = &excl_cntrs->states[o_tid];
+
+ WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+ if (cntr >= 0) {
+ if (is_excl)
+ xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+ else
+ xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+ }
}
static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -1747,10 +2308,21 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (event->attr.precise_ip && x86_pmu.pebs_aliases)
x86_pmu.pebs_aliases(event);
- if (intel_pmu_needs_lbr_smpl(event)) {
+ if (needs_branch_stack(event)) {
ret = intel_pmu_setup_lbr_filter(event);
if (ret)
return ret;
+
+ /*
+ * BTS is set up earlier in this path, so don't account twice
+ */
+ if (!intel_pmu_has_bts(event)) {
+ /* disallow lbr if conflicting events are present */
+ if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+ return -EBUSY;
+
+ event->destroy = hw_perf_lbr_event_destroy;
+ }
}
if (event->attr.type != PERF_TYPE_RAW)
@@ -1891,9 +2463,12 @@ static struct event_constraint counter2_constraint =
EVENT_CONSTRAINT(0, 0x4, 0);
static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
{
- struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+ struct event_constraint *c;
+
+ c = intel_get_event_constraints(cpuc, idx, event);
/* Handle special quirk on in_tx_checkpointed only in counter 2 */
if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
@@ -1905,6 +2480,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
return c;
}
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+ if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+ X86_CONFIG(.event=0xc0, .umask=0x01)) {
+ if (left < 128)
+ left = 128;
+ left &= ~0x3fu;
+ }
+ return left;
+}
+
PMU_FORMAT_ATTR(event, "config:0-7" );
PMU_FORMAT_ATTR(umask, "config:8-15" );
PMU_FORMAT_ATTR(edge, "config:18" );
@@ -1932,34 +2533,6 @@ ssize_t intel_event_sysfs_show(char *page, u64 config)
return x86_event_sysfs_show(page, config, event);
}
-static __initconst const struct x86_pmu core_pmu = {
- .name = "core",
- .handle_irq = x86_pmu_handle_irq,
- .disable_all = x86_pmu_disable_all,
- .enable_all = core_pmu_enable_all,
- .enable = core_pmu_enable_event,
- .disable = x86_pmu_disable_event,
- .hw_config = x86_pmu_hw_config,
- .schedule_events = x86_schedule_events,
- .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .event_map = intel_pmu_event_map,
- .max_events = ARRAY_SIZE(intel_perfmon_event_map),
- .apic = 1,
- /*
- * Intel PMCs cannot be accessed sanely above 32 bit width,
- * so we install an artificial 1<<31 period regardless of
- * the generic event period:
- */
- .max_period = (1ULL << 31) - 1,
- .get_event_constraints = intel_get_event_constraints,
- .put_event_constraints = intel_put_event_constraints,
- .event_constraints = intel_core_event_constraints,
- .guest_get_msrs = core_guest_get_msrs,
- .format_attrs = intel_arch_formats_attr,
- .events_sysfs_show = intel_event_sysfs_show,
-};
-
struct intel_shared_regs *allocate_shared_regs(int cpu)
{
struct intel_shared_regs *regs;
@@ -1979,16 +2552,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
return regs;
}
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+ struct intel_excl_cntrs *c;
+ int i;
+
+ c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (c) {
+ raw_spin_lock_init(&c->lock);
+ for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+ c->states[0].state[i] = INTEL_EXCL_UNUSED;
+ c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
+
+ c->states[1].state[i] = INTEL_EXCL_UNUSED;
+ c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
+ }
+ c->core_id = -1;
+ }
+ return c;
+}
+
static int intel_pmu_cpu_prepare(int cpu)
{
struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
- if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
- return NOTIFY_OK;
+ if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+ cpuc->shared_regs = allocate_shared_regs(cpu);
+ if (!cpuc->shared_regs)
+ return NOTIFY_BAD;
+ }
- cpuc->shared_regs = allocate_shared_regs(cpu);
- if (!cpuc->shared_regs)
- return NOTIFY_BAD;
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+ cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+ if (!cpuc->constraint_list)
+ return NOTIFY_BAD;
+
+ cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+ if (!cpuc->excl_cntrs) {
+ kfree(cpuc->constraint_list);
+ kfree(cpuc->shared_regs);
+ return NOTIFY_BAD;
+ }
+ cpuc->excl_thread_id = 0;
+ }
return NOTIFY_OK;
}
@@ -2010,13 +2619,15 @@ static void intel_pmu_cpu_starting(int cpu)
if (!cpuc->shared_regs)
return;
- if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+ if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+ void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+
for_each_cpu(i, topology_thread_cpumask(cpu)) {
struct intel_shared_regs *pc;
pc = per_cpu(cpu_hw_events, i).shared_regs;
if (pc && pc->core_id == core_id) {
- cpuc->kfree_on_online = cpuc->shared_regs;
+ *onln = cpuc->shared_regs;
cpuc->shared_regs = pc;
break;
}
@@ -2027,6 +2638,44 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.lbr_sel_map)
cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+ if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+ int h = x86_pmu.num_counters >> 1;
+
+ for_each_cpu(i, topology_thread_cpumask(cpu)) {
+ struct intel_excl_cntrs *c;
+
+ c = per_cpu(cpu_hw_events, i).excl_cntrs;
+ if (c && c->core_id == core_id) {
+ cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+ cpuc->excl_cntrs = c;
+ cpuc->excl_thread_id = 1;
+ break;
+ }
+ }
+ cpuc->excl_cntrs->core_id = core_id;
+ cpuc->excl_cntrs->refcnt++;
+ /*
+ * set hard limit to half the number of generic counters
+ */
+ cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
+ cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
+ }
+}
+
+static void free_excl_cntrs(int cpu)
+{
+ struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+ struct intel_excl_cntrs *c;
+
+ c = cpuc->excl_cntrs;
+ if (c) {
+ if (c->core_id == -1 || --c->refcnt == 0)
+ kfree(c);
+ cpuc->excl_cntrs = NULL;
+ kfree(cpuc->constraint_list);
+ cpuc->constraint_list = NULL;
+ }
}
static void intel_pmu_cpu_dying(int cpu)
@@ -2041,19 +2690,9 @@ static void intel_pmu_cpu_dying(int cpu)
cpuc->shared_regs = NULL;
}
- fini_debug_store_on_cpu(cpu);
-}
+ free_excl_cntrs(cpu);
-static void intel_pmu_flush_branch_stack(void)
-{
- /*
- * Intel LBR does not tag entries with the
- * PID of the current task, then we need to
- * flush it on ctxsw
- * For now, we simply reset it
- */
- if (x86_pmu.lbr_nr)
- intel_pmu_lbr_reset();
+ fini_debug_store_on_cpu(cpu);
}
PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
@@ -2076,6 +2715,44 @@ static struct attribute *intel_arch3_formats_attr[] = {
NULL,
};
+static __initconst const struct x86_pmu core_pmu = {
+ .name = "core",
+ .handle_irq = x86_pmu_handle_irq,
+ .disable_all = x86_pmu_disable_all,
+ .enable_all = core_pmu_enable_all,
+ .enable = core_pmu_enable_event,
+ .disable = x86_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
+ .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
+ .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
+ .event_map = intel_pmu_event_map,
+ .max_events = ARRAY_SIZE(intel_perfmon_event_map),
+ .apic = 1,
+ /*
+ * Intel PMCs cannot be accessed sanely above 32-bit width,
+ * so we install an artificial 1<<31 period regardless of
+ * the generic event period:
+ */
+ .max_period = (1ULL<<31) - 1,
+ .get_event_constraints = intel_get_event_constraints,
+ .put_event_constraints = intel_put_event_constraints,
+ .event_constraints = intel_core_event_constraints,
+ .guest_get_msrs = core_guest_get_msrs,
+ .format_attrs = intel_arch_formats_attr,
+ .events_sysfs_show = intel_event_sysfs_show,
+
+ /*
+ * Virtual (or funny metal) CPU can define x86_pmu.extra_regs
+ * together with PMU version 1 and thus be using core_pmu with
+ * shared_regs. We need following callbacks here to allocate
+ * it properly.
+ */
+ .cpu_prepare = intel_pmu_cpu_prepare,
+ .cpu_starting = intel_pmu_cpu_starting,
+ .cpu_dying = intel_pmu_cpu_dying,
+};
+
static __initconst const struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
@@ -2107,7 +2784,7 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_starting = intel_pmu_cpu_starting,
.cpu_dying = intel_pmu_cpu_dying,
.guest_get_msrs = intel_guest_get_msrs,
- .flush_branch_stack = intel_pmu_flush_branch_stack,
+ .sched_task = intel_pmu_lbr_sched_task,
};
static __init void intel_clovertown_quirk(void)
@@ -2264,6 +2941,27 @@ static __init void intel_nehalem_quirk(void)
}
}
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
+ */
+static __init void intel_ht_bug(void)
+{
+ x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
+
+ x86_pmu.commit_scheduling = intel_commit_scheduling;
+ x86_pmu.start_scheduling = intel_start_scheduling;
+ x86_pmu.stop_scheduling = intel_stop_scheduling;
+}
+
EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3");
EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82")
@@ -2443,7 +3141,7 @@ __init int intel_pmu_init(void)
x86_pmu.event_constraints = intel_slm_event_constraints;
x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
x86_pmu.extra_regs = intel_slm_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
pr_cont("Silvermont events, ");
break;
@@ -2461,7 +3159,7 @@ __init int intel_pmu_init(void)
x86_pmu.enable_all = intel_pmu_nhm_enable_all;
x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
x86_pmu.extra_regs = intel_westmere_extra_regs;
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
x86_pmu.cpu_events = nhm_events_attrs;
@@ -2478,6 +3176,7 @@ __init int intel_pmu_init(void)
case 42: /* 32nm SandyBridge */
case 45: /* 32nm SandyBridge-E/EN/EP */
x86_add_quirk(intel_sandybridge_quirk);
+ x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -2492,9 +3191,11 @@ __init int intel_pmu_init(void)
x86_pmu.extra_regs = intel_snbep_extra_regs;
else
x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
@@ -2510,6 +3211,7 @@ __init int intel_pmu_init(void)
case 58: /* 22nm IvyBridge */
case 62: /* 22nm IvyBridge-EP/EX */
+ x86_add_quirk(intel_ht_bug);
memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
/* dTLB-load-misses on IVB is different than SNB */
@@ -2528,8 +3230,8 @@ __init int intel_pmu_init(void)
else
x86_pmu.extra_regs = intel_snb_extra_regs;
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.cpu_events = snb_events_attrs;
@@ -2545,19 +3247,20 @@ __init int intel_pmu_init(void)
case 63: /* 22nm Haswell Server */
case 69: /* 22nm Haswell ULT */
case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+ x86_add_quirk(intel_ht_bug);
x86_pmu.late_ack = true;
- memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
- memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
- intel_pmu_lbr_init_snb();
+ intel_pmu_lbr_init_hsw();
x86_pmu.event_constraints = intel_hsw_event_constraints;
x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
x86_pmu.extra_regs = intel_snbep_extra_regs;
x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
/* all extra regs are per-cpu when HT is on */
- x86_pmu.er_flags |= ERF_HAS_RSP_1;
- x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
x86_pmu.hw_config = hsw_hw_config;
x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2566,6 +3269,39 @@ __init int intel_pmu_init(void)
pr_cont("Haswell events, ");
break;
+ case 61: /* 14nm Broadwell Core-M */
+ case 86: /* 14nm Broadwell Xeon D */
+ x86_pmu.late_ack = true;
+ memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+ /* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+ hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+ BDW_L3_MISS|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+ HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+ hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+ BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+ intel_pmu_lbr_init_hsw();
+
+ x86_pmu.event_constraints = intel_bdw_event_constraints;
+ x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_snbep_extra_regs;
+ x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+ /* all extra regs are per-cpu when HT is on */
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+ x86_pmu.hw_config = hsw_hw_config;
+ x86_pmu.get_event_constraints = hsw_get_event_constraints;
+ x86_pmu.cpu_events = hsw_events_attrs;
+ x86_pmu.limit_period = bdw_limit_period;
+ pr_cont("Broadwell events, ");
+ break;
+
default:
switch (x86_pmu.version) {
case 1:
@@ -2651,3 +3387,47 @@ __init int intel_pmu_init(void)
return 0;
}
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+ int cpu = smp_processor_id();
+ int w, c;
+ /*
+ * problem not present on this CPU model, nothing to do
+ */
+ if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+ return 0;
+
+ w = cpumask_weight(topology_thread_cpumask(cpu));
+ if (w > 1) {
+ pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+ return 0;
+ }
+
+ watchdog_nmi_disable_all();
+
+ x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+ x86_pmu.commit_scheduling = NULL;
+ x86_pmu.start_scheduling = NULL;
+ x86_pmu.stop_scheduling = NULL;
+
+ watchdog_nmi_enable_all();
+
+ get_online_cpus();
+
+ for_each_online_cpu(c) {
+ free_excl_cntrs(c);
+ }
+
+ put_online_cpus();
+ pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+ return 0;
+}
+subsys_initcall(fixup_ht_bug)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
new file mode 100644
index 000000000000..ac1f0c55f379
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -0,0 +1,525 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "perf_event.h"
+
+struct bts_ctx {
+ struct perf_output_handle handle;
+ struct debug_store ds_back;
+ int started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE 24
+#define BTS_SAFETY_MARGIN 4080
+
+struct bts_phys {
+ struct page *page;
+ unsigned long size;
+ unsigned long offset;
+ unsigned long displacement;
+};
+
+struct bts_buffer {
+ size_t real_size; /* multiple of BTS_RECORD_SIZE */
+ unsigned int nr_pages;
+ unsigned int nr_bufs;
+ unsigned int cur_buf;
+ bool snapshot;
+ local_t data_size;
+ local_t lost;
+ local_t head;
+ unsigned long end;
+ void **data_pages;
+ struct bts_phys buf[0];
+};
+
+struct pmu bts_pmu;
+
+void intel_pmu_enable_bts(u64 config);
+void intel_pmu_disable_bts(void);
+
+static size_t buf_size(struct page *page)
+{
+ return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+ struct bts_buffer *buf;
+ struct page *page;
+ int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+ unsigned long offset;
+ size_t size = nr_pages << PAGE_SHIFT;
+ int pg, nbuf, pad;
+
+ /* count all the high order buffers */
+ for (pg = 0, nbuf = 0; pg < nr_pages;) {
+ page = virt_to_page(pages[pg]);
+ if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+ return NULL;
+ pg += 1 << page_private(page);
+ nbuf++;
+ }
+
+ /*
+ * to avoid interrupts in overwrite mode, only allow one physical
+ */
+ if (overwrite && nbuf > 1)
+ return NULL;
+
+ buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->nr_pages = nr_pages;
+ buf->nr_bufs = nbuf;
+ buf->snapshot = overwrite;
+ buf->data_pages = pages;
+ buf->real_size = size - size % BTS_RECORD_SIZE;
+
+ for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+ unsigned int __nr_pages;
+
+ page = virt_to_page(pages[pg]);
+ __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+ buf->buf[nbuf].page = page;
+ buf->buf[nbuf].offset = offset;
+ buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+ buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+ pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+ buf->buf[nbuf].size -= pad;
+
+ pg += __nr_pages;
+ offset += __nr_pages << PAGE_SHIFT;
+ }
+
+ return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+ kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+ return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_phys *phys = &buf->buf[buf->cur_buf];
+ unsigned long index, thresh = 0, end = phys->size;
+ struct page *page = phys->page;
+
+ index = local_read(&buf->head);
+
+ if (!buf->snapshot) {
+ if (buf->end < phys->offset + buf_size(page))
+ end = buf->end - phys->offset - phys->displacement;
+
+ index -= phys->offset + phys->displacement;
+
+ if (end - index > BTS_SAFETY_MARGIN)
+ thresh = end - BTS_SAFETY_MARGIN;
+ else if (end - index > BTS_RECORD_SIZE)
+ thresh = end - BTS_RECORD_SIZE;
+ else
+ thresh = end;
+ }
+
+ ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
+ ds->bts_index = ds->bts_buffer_base + index;
+ ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+ ds->bts_interrupt_threshold = !buf->snapshot
+ ? ds->bts_buffer_base + thresh
+ : ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+ unsigned long index = head - phys->offset;
+
+ memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+ if (buf->snapshot)
+ return false;
+
+ if (local_read(&buf->data_size) >= bts->handle.size ||
+ bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+ return true;
+
+ return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+ int cpu = raw_smp_processor_id();
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+ if (!buf)
+ return;
+
+ head = index + bts_buffer_offset(buf, buf->cur_buf);
+ old = local_xchg(&buf->head, head);
+
+ if (!buf->snapshot) {
+ if (old == head)
+ return;
+
+ if (ds->bts_index >= ds->bts_absolute_maximum)
+ local_inc(&buf->lost);
+
+ /*
+ * old and head are always in the same physical buffer, so we
+ * can subtract them to get the data size.
+ */
+ local_add(head - old, &buf->data_size);
+ } else {
+ local_set(&buf->data_size, head);
+ }
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+ u64 config = 0;
+
+ if (!buf || bts_buffer_is_full(buf, bts))
+ return;
+
+ event->hw.state = 0;
+
+ if (!buf->snapshot)
+ config |= ARCH_PERFMON_EVENTSEL_INT;
+ if (!event->attr.exclude_kernel)
+ config |= ARCH_PERFMON_EVENTSEL_OS;
+ if (!event->attr.exclude_user)
+ config |= ARCH_PERFMON_EVENTSEL_USR;
+
+ bts_config_buffer(buf);
+
+ /*
+ * local barrier to make sure that ds configuration made it
+ * before we enable BTS
+ */
+ wmb();
+
+ intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ __bts_event_start(event);
+
+ /* PMI handler: this counter is running and likely generating PMIs */
+ ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+ /*
+ * No extra synchronization is mandated by the documentation to have
+ * BTS data stores globally visible.
+ */
+ intel_pmu_disable_bts();
+
+ if (event->hw.state & PERF_HES_STOPPED)
+ return;
+
+ ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ /* PMI handler: don't restart this counter */
+ ACCESS_ONCE(bts->started) = 0;
+
+ __bts_event_stop(event);
+
+ if (flags & PERF_EF_UPDATE)
+ bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ if (bts->handle.event && bts->started)
+ __bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+ if (bts->handle.event)
+ __bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+ unsigned long head, space, next_space, pad, gap, skip, wakeup;
+ unsigned int next_buf;
+ struct bts_phys *phys, *next_phys;
+ int ret;
+
+ if (buf->snapshot)
+ return 0;
+
+ head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+ if (WARN_ON_ONCE(head != local_read(&buf->head)))
+ return -EINVAL;
+
+ phys = &buf->buf[buf->cur_buf];
+ space = phys->offset + phys->displacement + phys->size - head;
+ pad = space;
+ if (space > handle->size) {
+ space = handle->size;
+ space -= space % BTS_RECORD_SIZE;
+ }
+ if (space <= BTS_SAFETY_MARGIN) {
+ /* See if next phys buffer has more space */
+ next_buf = buf->cur_buf + 1;
+ if (next_buf >= buf->nr_bufs)
+ next_buf = 0;
+ next_phys = &buf->buf[next_buf];
+ gap = buf_size(phys->page) - phys->displacement - phys->size +
+ next_phys->displacement;
+ skip = pad + gap;
+ if (handle->size >= skip) {
+ next_space = next_phys->size;
+ if (next_space + skip > handle->size) {
+ next_space = handle->size - skip;
+ next_space -= next_space % BTS_RECORD_SIZE;
+ }
+ if (next_space > space || !space) {
+ if (pad)
+ bts_buffer_pad_out(phys, head);
+ ret = perf_aux_output_skip(handle, skip);
+ if (ret)
+ return ret;
+ /* Advance to next phys buffer */
+ phys = next_phys;
+ space = next_space;
+ head = phys->offset + phys->displacement;
+ /*
+ * After this, cur_buf and head won't match ds
+ * anymore, so we must not be racing with
+ * bts_update().
+ */
+ buf->cur_buf = next_buf;
+ local_set(&buf->head, head);
+ }
+ }
+ }
+
+ /* Don't go far beyond wakeup watermark */
+ wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+ handle->head;
+ if (space > wakeup) {
+ space = wakeup;
+ space -= space % BTS_RECORD_SIZE;
+ }
+
+ buf->end = head + space;
+
+ /*
+ * If we have no space, the lost notification would have been sent when
+ * we hit absolute_maximum - see bts_update()
+ */
+ if (!space)
+ return -ENOSPC;
+
+ return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct perf_event *event = bts->handle.event;
+ struct bts_buffer *buf;
+ s64 old_head;
+ int err;
+
+ if (!event || !bts->started)
+ return 0;
+
+ buf = perf_get_aux(&bts->handle);
+ /*
+ * Skip snapshot counters: they don't use the interrupt, but
+ * there's no other way of telling, because the pointer will
+ * keep moving
+ */
+ if (!buf || buf->snapshot)
+ return 0;
+
+ old_head = local_read(&buf->head);
+ bts_update(bts);
+
+ /* no new data */
+ if (old_head == local_read(&buf->head))
+ return 0;
+
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ return 1;
+
+ err = bts_buffer_reset(buf, &bts->handle);
+ if (err)
+ perf_aux_output_end(&bts->handle, 0, false);
+
+ return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+ bts_event_stop(event, PERF_EF_UPDATE);
+
+ if (buf) {
+ if (buf->snapshot)
+ bts->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+ !!local_xchg(&buf->lost, 0));
+ }
+
+ cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+ cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+ cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+ struct bts_buffer *buf;
+ struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ return -EBUSY;
+
+ if (bts->handle.event)
+ return -EBUSY;
+
+ buf = perf_aux_output_begin(&bts->handle, event);
+ if (!buf)
+ return -EINVAL;
+
+ ret = bts_buffer_reset(buf, &bts->handle);
+ if (ret) {
+ perf_aux_output_end(&bts->handle, 0, false);
+ return ret;
+ }
+
+ bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+ bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+ bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+ if (mode & PERF_EF_START) {
+ bts_event_start(event, 0);
+ if (hwc->state & PERF_HES_STOPPED) {
+ bts_event_del(event, 0);
+ return -EBUSY;
+ }
+ }
+
+ return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+ x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+ if (event->attr.type != bts_pmu.type)
+ return -ENOENT;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_bts))
+ return -EBUSY;
+
+ event->destroy = bts_event_destroy;
+
+ return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+ return -ENODEV;
+
+ bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+ bts_pmu.task_ctx_nr = perf_sw_context;
+ bts_pmu.event_init = bts_event_init;
+ bts_pmu.add = bts_event_add;
+ bts_pmu.del = bts_event_del;
+ bts_pmu.start = bts_event_start;
+ bts_pmu.stop = bts_event_stop;
+ bts_pmu.read = bts_event_read;
+ bts_pmu.setup_aux = bts_buffer_setup_aux;
+ bts_pmu.free_aux = bts_buffer_free_aux;
+
+ return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+
+module_init(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index 000000000000..e4d1b8b738fa
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,1379 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC 0x0c8f
+#define MSR_IA32_QM_CTR 0x0c8e
+#define MSR_IA32_QM_EVTSEL 0x0c8d
+
+static unsigned int cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+struct intel_cqm_state {
+ raw_spinlock_t lock;
+ int rmid;
+ int cnt;
+};
+
+static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+
+/*
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
+ */
+static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR (1ULL << 63)
+#define RMID_VAL_UNAVAIL (1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID (1 << 0)
+
+#define QOS_EVENT_MASK QOS_L3_OCCUP_EVENT_ID
+
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static unsigned int intel_cqm_rotation_rmid;
+
+#define INVALID_RMID (-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(unsigned int rmid)
+{
+ if (!rmid || rmid == INVALID_RMID)
+ return false;
+
+ return true;
+}
+
+static u64 __rmid_read(unsigned int rmid)
+{
+ u64 val;
+
+ /*
+ * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+ * it just says that to increase confusion.
+ */
+ wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+ rdmsrl(MSR_IA32_QM_CTR, val);
+
+ /*
+ * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+ * the number of cachelines tagged with @rmid.
+ */
+ return val;
+}
+
+enum rmid_recycle_state {
+ RMID_YOUNG = 0,
+ RMID_AVAILABLE,
+ RMID_DIRTY,
+};
+
+struct cqm_rmid_entry {
+ unsigned int rmid;
+ enum rmid_recycle_state state;
+ struct list_head list;
+ unsigned long queue_time;
+};
+
+/*
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+{
+ struct cqm_rmid_entry *entry;
+
+ entry = cqm_rmid_ptrs[rmid];
+ WARN_ON(entry->rmid != rmid);
+
+ return entry;
+}
+
+/*
+ * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
+ */
+static int __get_rmid(void)
+{
+ struct cqm_rmid_entry *entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ if (list_empty(&cqm_rmid_free_lru))
+ return INVALID_RMID;
+
+ entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
+ list_del(&entry->list);
+
+ return entry->rmid;
+}
+
+static void __put_rmid(unsigned int rmid)
+{
+ struct cqm_rmid_entry *entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ WARN_ON(!__rmid_valid(rmid));
+ entry = __rmid_entry(rmid);
+
+ entry->queue_time = jiffies;
+ entry->state = RMID_YOUNG;
+
+ list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+ struct cqm_rmid_entry *entry;
+ unsigned int nr_rmids;
+ int r = 0;
+
+ nr_rmids = cqm_max_rmid + 1;
+ cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+ nr_rmids, GFP_KERNEL);
+ if (!cqm_rmid_ptrs)
+ return -ENOMEM;
+
+ for (; r <= cqm_max_rmid; r++) {
+ struct cqm_rmid_entry *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto fail;
+
+ INIT_LIST_HEAD(&entry->list);
+ entry->rmid = r;
+ cqm_rmid_ptrs[r] = entry;
+
+ list_add_tail(&entry->list, &cqm_rmid_free_lru);
+ }
+
+ /*
+ * RMID 0 is special and is always allocated. It's used for all
+ * tasks that are not monitored.
+ */
+ entry = __rmid_entry(0);
+ list_del(&entry->list);
+
+ mutex_lock(&cache_mutex);
+ intel_cqm_rotation_rmid = __get_rmid();
+ mutex_unlock(&cache_mutex);
+
+ return 0;
+fail:
+ while (r--)
+ kfree(cqm_rmid_ptrs[r]);
+
+ kfree(cqm_rmid_ptrs);
+ return -ENOMEM;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+ /* Per-cpu and task events don't mix */
+ if ((a->attach_state & PERF_ATTACH_TASK) !=
+ (b->attach_state & PERF_ATTACH_TASK))
+ return false;
+
+#ifdef CONFIG_CGROUP_PERF
+ if (a->cgrp != b->cgrp)
+ return false;
+#endif
+
+ /* If not task event, we're machine wide */
+ if (!(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Events that target same task are placed into the same cache group.
+ */
+ if (a->hw.target == b->hw.target)
+ return true;
+
+ /*
+ * Are we an inherited event?
+ */
+ if (b->parent == a)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return perf_cgroup_from_task(event->hw.target);
+
+ return event->cgrp;
+}
+#endif
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ * PROHIBITS
+ * system-wide -> cgroup and task
+ * cgroup -> system-wide
+ * -> task in cgroup
+ * task -> system-wide
+ * -> task in cgroup
+ *
+ * Call this function before allocating an RMID.
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+#ifdef CONFIG_CGROUP_PERF
+ /*
+ * We can have any number of cgroups but only one system-wide
+ * event at a time.
+ */
+ if (a->cgrp && b->cgrp) {
+ struct perf_cgroup *ac = a->cgrp;
+ struct perf_cgroup *bc = b->cgrp;
+
+ /*
+ * This condition should have been caught in
+ * __match_event() and we should be sharing an RMID.
+ */
+ WARN_ON_ONCE(ac == bc);
+
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+
+ if (a->cgrp || b->cgrp) {
+ struct perf_cgroup *ac, *bc;
+
+ /*
+ * cgroup and system-wide events are mutually exclusive
+ */
+ if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+ (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+ return true;
+
+ /*
+ * Ensure neither event is part of the other's cgroup
+ */
+ ac = event_to_cgroup(a);
+ bc = event_to_cgroup(b);
+ if (ac == bc)
+ return true;
+
+ /*
+ * Must have cgroup and non-intersecting task events.
+ */
+ if (!ac || !bc)
+ return false;
+
+ /*
+ * We have cgroup and task events, and the task belongs
+ * to a cgroup. Check for for overlap.
+ */
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+#endif
+ /*
+ * If one of them is not a task, same story as above with cgroups.
+ */
+ if (!(a->attach_state & PERF_ATTACH_TASK) ||
+ !(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Must be non-overlapping.
+ */
+ return false;
+}
+
+struct rmid_read {
+ unsigned int rmid;
+ atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static unsigned int
+intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+{
+ struct perf_event *event;
+ unsigned int old_rmid = group->hw.cqm_rmid;
+ struct list_head *head = &group->hw.cqm_group_entry;
+
+ lockdep_assert_held(&cache_mutex);
+
+ /*
+ * If our RMID is being deallocated, perform a read now.
+ */
+ if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ .rmid = old_rmid,
+ };
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+ &rr, 1);
+ local64_set(&group->count, atomic64_read(&rr.value));
+ }
+
+ raw_spin_lock_irq(&cache_lock);
+
+ group->hw.cqm_rmid = rmid;
+ list_for_each_entry(event, head, hw.cqm_group_entry)
+ event->hw.cqm_rmid = rmid;
+
+ raw_spin_unlock_irq(&cache_lock);
+
+ return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+ struct cqm_rmid_entry *entry;
+
+ list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+ if (entry->state != RMID_AVAILABLE)
+ break;
+
+ if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+ entry->state = RMID_DIRTY;
+ }
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(unsigned int rmid)
+{
+ struct perf_event *leader, *event;
+
+ lockdep_assert_held(&cache_mutex);
+
+ leader = list_first_entry(&cache_groups, struct perf_event,
+ hw.cqm_groups_entry);
+ event = leader;
+
+ list_for_each_entry_continue(event, &cache_groups,
+ hw.cqm_groups_entry) {
+ if (__rmid_valid(event->hw.cqm_rmid))
+ continue;
+
+ if (__conflict_event(event, leader))
+ continue;
+
+ intel_cqm_xchg_rmid(event, rmid);
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250 /* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+ struct cqm_rmid_entry *entry, *tmp;
+
+ lockdep_assert_held(&cache_mutex);
+
+ *available = 0;
+ list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+ unsigned long min_queue_time;
+ unsigned long now = jiffies;
+
+ /*
+ * We hold RMIDs placed into limbo for a minimum queue
+ * time. Before the minimum queue time has elapsed we do
+ * not recycle RMIDs.
+ *
+ * The reasoning is that until a sufficient time has
+ * passed since we stopped using an RMID, any RMID
+ * placed onto the limbo list will likely still have
+ * data tagged in the cache, which means we'll probably
+ * fail to recycle it anyway.
+ *
+ * We can save ourselves an expensive IPI by skipping
+ * any RMIDs that have not been queued for the minimum
+ * time.
+ */
+ min_queue_time = entry->queue_time +
+ msecs_to_jiffies(__rmid_queue_time_ms);
+
+ if (time_after(min_queue_time, now))
+ break;
+
+ entry->state = RMID_AVAILABLE;
+ (*available)++;
+ }
+
+ /*
+ * Fast return if none of the RMIDs on the limbo list have been
+ * sitting on the queue for the minimum queue time.
+ */
+ if (!*available)
+ return false;
+
+ /*
+ * Test whether an RMID is free for each package.
+ */
+ on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+ list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+ /*
+ * Exhausted all RMIDs that have waited min queue time.
+ */
+ if (entry->state == RMID_YOUNG)
+ break;
+
+ if (entry->state == RMID_DIRTY)
+ continue;
+
+ list_del(&entry->list); /* remove from limbo */
+
+ /*
+ * The rotation RMID gets priority if it's
+ * currently invalid. In which case, skip adding
+ * the RMID to the the free lru.
+ */
+ if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+ intel_cqm_rotation_rmid = entry->rmid;
+ continue;
+ }
+
+ /*
+ * If we have groups waiting for RMIDs, hand
+ * them one now provided they don't conflict.
+ */
+ if (intel_cqm_sched_in_event(entry->rmid))
+ continue;
+
+ /*
+ * Otherwise place it onto the free list.
+ */
+ list_add_tail(&entry->list, &cqm_rmid_free_lru);
+ }
+
+
+ return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
+ */
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
+{
+ struct perf_event *rotor;
+ unsigned int rmid;
+
+ lockdep_assert_held(&cache_mutex);
+
+ rotor = list_first_entry(&cache_groups, struct perf_event,
+ hw.cqm_groups_entry);
+
+ /*
+ * The group at the front of the list should always have a valid
+ * RMID. If it doesn't then no groups have RMIDs assigned and we
+ * don't need to rotate the list.
+ */
+ if (next == rotor)
+ return;
+
+ rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+ __put_rmid(rmid);
+
+ list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+ struct perf_event *group, *g;
+ unsigned int rmid;
+
+ lockdep_assert_held(&cache_mutex);
+
+ list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+ if (group == event)
+ continue;
+
+ rmid = group->hw.cqm_rmid;
+
+ /*
+ * Skip events that don't have a valid RMID.
+ */
+ if (!__rmid_valid(rmid))
+ continue;
+
+ /*
+ * No conflict? No problem! Leave the event alone.
+ */
+ if (!__conflict_event(group, event))
+ continue;
+
+ intel_cqm_xchg_rmid(group, INVALID_RMID);
+ __put_rmid(rmid);
+ }
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * We rotate for two reasons,
+ * 1. To handle the scheduling of conflicting events
+ * 2. To recycle RMIDs
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+ struct perf_event *group, *start = NULL;
+ unsigned int threshold_limit;
+ unsigned int nr_needed = 0;
+ unsigned int nr_available;
+ bool rotated = false;
+
+ mutex_lock(&cache_mutex);
+
+again:
+ /*
+ * Fast path through this function if there are no groups and no
+ * RMIDs that need cleaning.
+ */
+ if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+ goto out;
+
+ list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+ if (!__rmid_valid(group->hw.cqm_rmid)) {
+ if (!start)
+ start = group;
+ nr_needed++;
+ }
+ }
+
+ /*
+ * We have some event groups, but they all have RMIDs assigned
+ * and no RMIDs need cleaning.
+ */
+ if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+ goto out;
+
+ if (!nr_needed)
+ goto stabilize;
+
+ /*
+ * We have more event groups without RMIDs than available RMIDs,
+ * or we have event groups that conflict with the ones currently
+ * scheduled.
+ *
+ * We force deallocate the rmid of the group at the head of
+ * cache_groups. The first event group without an RMID then gets
+ * assigned intel_cqm_rotation_rmid. This ensures we always make
+ * forward progress.
+ *
+ * Rotate the cache_groups list so the previous head is now the
+ * tail.
+ */
+ __intel_cqm_pick_and_rotate(start);
+
+ /*
+ * If the rotation is going to succeed, reduce the threshold so
+ * that we don't needlessly reuse dirty RMIDs.
+ */
+ if (__rmid_valid(intel_cqm_rotation_rmid)) {
+ intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+ intel_cqm_rotation_rmid = __get_rmid();
+
+ intel_cqm_sched_out_conflicting_events(start);
+
+ if (__intel_cqm_threshold)
+ __intel_cqm_threshold--;
+ }
+
+ rotated = true;
+
+stabilize:
+ /*
+ * We now need to stablize the RMID we freed above (if any) to
+ * ensure that the next time we rotate we have an RMID with zero
+ * occupancy value.
+ *
+ * Alternatively, if we didn't need to perform any rotation,
+ * we'll have a bunch of RMIDs in limbo that need stabilizing.
+ */
+ threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+ while (intel_cqm_rmid_stabilize(&nr_available) &&
+ __intel_cqm_threshold < threshold_limit) {
+ unsigned int steal_limit;
+
+ /*
+ * Don't spin if nobody is actively waiting for an RMID,
+ * the rotation worker will be kicked as soon as an
+ * event needs an RMID anyway.
+ */
+ if (!nr_needed)
+ break;
+
+ /* Allow max 25% of RMIDs to be in limbo. */
+ steal_limit = (cqm_max_rmid + 1) / 4;
+
+ /*
+ * We failed to stabilize any RMIDs so our rotation
+ * logic is now stuck. In order to make forward progress
+ * we have a few options:
+ *
+ * 1. rotate ("steal") another RMID
+ * 2. increase the threshold
+ * 3. do nothing
+ *
+ * We do both of 1. and 2. until we hit the steal limit.
+ *
+ * The steal limit prevents all RMIDs ending up on the
+ * limbo list. This can happen if every RMID has a
+ * non-zero occupancy above threshold_limit, and the
+ * occupancy values aren't dropping fast enough.
+ *
+ * Note that there is prioritisation at work here - we'd
+ * rather increase the number of RMIDs on the limbo list
+ * than increase the threshold, because increasing the
+ * threshold skews the event data (because we reuse
+ * dirty RMIDs) - threshold bumps are a last resort.
+ */
+ if (nr_available < steal_limit)
+ goto again;
+
+ __intel_cqm_threshold++;
+ }
+
+out:
+ mutex_unlock(&cache_mutex);
+ return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+ unsigned long delay;
+
+ __intel_cqm_rmid_rotate();
+
+ delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+ schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static void intel_cqm_setup_event(struct perf_event *event,
+ struct perf_event **group)
+{
+ struct perf_event *iter;
+ unsigned int rmid;
+ bool conflict = false;
+
+ list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+ rmid = iter->hw.cqm_rmid;
+
+ if (__match_event(iter, event)) {
+ /* All tasks in a group share an RMID */
+ event->hw.cqm_rmid = rmid;
+ *group = iter;
+ return;
+ }
+
+ /*
+ * We only care about conflicts for events that are
+ * actually scheduled in (and hence have a valid RMID).
+ */
+ if (__conflict_event(iter, event) && __rmid_valid(rmid))
+ conflict = true;
+ }
+
+ if (conflict)
+ rmid = INVALID_RMID;
+ else
+ rmid = __get_rmid();
+
+ event->hw.cqm_rmid = rmid;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+ unsigned long flags;
+ unsigned int rmid;
+ u64 val;
+
+ /*
+ * Task events are handled by intel_cqm_event_count().
+ */
+ if (event->cpu == -1)
+ return;
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+ rmid = event->hw.cqm_rmid;
+
+ if (!__rmid_valid(rmid))
+ goto out;
+
+ val = __rmid_read(rmid);
+
+ /*
+ * Ignore this reading on error states and do not update the value.
+ */
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ goto out;
+
+ local64_set(&event->count, val);
+out:
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+}
+
+static void __intel_cqm_event_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read(rr->rmid);
+
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+ return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+ unsigned long flags;
+ struct rmid_read rr = {
+ .value = ATOMIC64_INIT(0),
+ };
+
+ /*
+ * We only need to worry about task events. System-wide events
+ * are handled like usual, i.e. entirely with
+ * intel_cqm_event_read().
+ */
+ if (event->cpu != -1)
+ return __perf_event_count(event);
+
+ /*
+ * Only the group leader gets to report values. This stops us
+ * reporting duplicate values to userspace, and gives us a clear
+ * rule for which task gets to report the values.
+ *
+ * Note that it is impossible to attribute these values to
+ * specific packages - we forfeit that ability when we create
+ * task events.
+ */
+ if (!cqm_group_leader(event))
+ return 0;
+
+ /*
+ * Notice that we don't perform the reading of an RMID
+ * atomically, because we can't hold a spin lock across the
+ * IPIs.
+ *
+ * Speculatively perform the read, since @event might be
+ * assigned a different (possibly invalid) RMID while we're
+ * busying performing the IPI calls. It's therefore necessary to
+ * check @event's RMID afterwards, and if it has changed,
+ * discard the result of the read.
+ */
+ rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
+
+ if (!__rmid_valid(rr.rmid))
+ goto out;
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+ if (event->hw.cqm_rmid == rr.rmid)
+ local64_set(&event->count, atomic64_read(&rr.value));
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
+ return __perf_event_count(event);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+ struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ unsigned int rmid = event->hw.cqm_rmid;
+ unsigned long flags;
+
+ if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+ return;
+
+ event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+ raw_spin_lock_irqsave(&state->lock, flags);
+
+ if (state->cnt++)
+ WARN_ON_ONCE(state->rmid != rmid);
+ else
+ WARN_ON_ONCE(state->rmid);
+
+ state->rmid = rmid;
+ wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
+
+ raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+ struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+ unsigned long flags;
+
+ if (event->hw.cqm_state & PERF_HES_STOPPED)
+ return;
+
+ event->hw.cqm_state |= PERF_HES_STOPPED;
+
+ raw_spin_lock_irqsave(&state->lock, flags);
+ intel_cqm_event_read(event);
+
+ if (!--state->cnt) {
+ state->rmid = 0;
+ wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+ } else {
+ WARN_ON_ONCE(!state->rmid);
+ }
+
+ raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+ unsigned long flags;
+ unsigned int rmid;
+
+ raw_spin_lock_irqsave(&cache_lock, flags);
+
+ event->hw.cqm_state = PERF_HES_STOPPED;
+ rmid = event->hw.cqm_rmid;
+
+ if (__rmid_valid(rmid) && (mode & PERF_EF_START))
+ intel_cqm_event_start(event, mode);
+
+ raw_spin_unlock_irqrestore(&cache_lock, flags);
+
+ return 0;
+}
+
+static void intel_cqm_event_del(struct perf_event *event, int mode)
+{
+ intel_cqm_event_stop(event, mode);
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+ struct perf_event *group_other = NULL;
+
+ mutex_lock(&cache_mutex);
+
+ /*
+ * If there's another event in this group...
+ */
+ if (!list_empty(&event->hw.cqm_group_entry)) {
+ group_other = list_first_entry(&event->hw.cqm_group_entry,
+ struct perf_event,
+ hw.cqm_group_entry);
+ list_del(&event->hw.cqm_group_entry);
+ }
+
+ /*
+ * And we're the group leader..
+ */
+ if (cqm_group_leader(event)) {
+ /*
+ * If there was a group_other, make that leader, otherwise
+ * destroy the group and return the RMID.
+ */
+ if (group_other) {
+ list_replace(&event->hw.cqm_groups_entry,
+ &group_other->hw.cqm_groups_entry);
+ } else {
+ unsigned int rmid = event->hw.cqm_rmid;
+
+ if (__rmid_valid(rmid))
+ __put_rmid(rmid);
+ list_del(&event->hw.cqm_groups_entry);
+ }
+ }
+
+ mutex_unlock(&cache_mutex);
+}
+
+static int intel_cqm_event_init(struct perf_event *event)
+{
+ struct perf_event *group = NULL;
+ bool rotate = false;
+
+ if (event->attr.type != intel_cqm_pmu.type)
+ return -ENOENT;
+
+ if (event->attr.config & ~QOS_EVENT_MASK)
+ return -EINVAL;
+
+ /* unsupported modes and filters */
+ if (event->attr.exclude_user ||
+ event->attr.exclude_kernel ||
+ event->attr.exclude_hv ||
+ event->attr.exclude_idle ||
+ event->attr.exclude_host ||
+ event->attr.exclude_guest ||
+ event->attr.sample_period) /* no sampling */
+ return -EINVAL;
+
+ INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+ INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+ event->destroy = intel_cqm_event_destroy;
+
+ mutex_lock(&cache_mutex);
+
+ /* Will also set rmid */
+ intel_cqm_setup_event(event, &group);
+
+ if (group) {
+ list_add_tail(&event->hw.cqm_group_entry,
+ &group->hw.cqm_group_entry);
+ } else {
+ list_add_tail(&event->hw.cqm_groups_entry,
+ &cache_groups);
+
+ /*
+ * All RMIDs are either in use or have recently been
+ * used. Kick the rotation worker to clean/free some.
+ *
+ * We only do this for the group leader, rather than for
+ * every event in a group to save on needless work.
+ */
+ if (!__rmid_valid(event->hw.cqm_rmid))
+ rotate = true;
+ }
+
+ mutex_unlock(&cache_mutex);
+
+ if (rotate)
+ schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
+ return 0;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+ EVENT_PTR(intel_cqm_llc),
+ EVENT_PTR(intel_cqm_llc_pkg),
+ EVENT_PTR(intel_cqm_llc_unit),
+ EVENT_PTR(intel_cqm_llc_scale),
+ EVENT_PTR(intel_cqm_llc_snapshot),
+ NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+ .name = "events",
+ .attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+ &format_attr_event.attr,
+ NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+ .name = "format",
+ .attrs = intel_cqm_formats_attr,
+};
+
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+ char *page)
+{
+ ssize_t rv;
+
+ mutex_lock(&cache_mutex);
+ rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+ mutex_unlock(&cache_mutex);
+
+ return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int bytes, cachelines;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &bytes);
+ if (ret)
+ return ret;
+
+ mutex_lock(&cache_mutex);
+
+ __intel_cqm_max_threshold = bytes;
+ cachelines = bytes / cqm_l3_scale;
+
+ /*
+ * The new maximum takes effect immediately.
+ */
+ if (__intel_cqm_threshold > cachelines)
+ __intel_cqm_threshold = cachelines;
+
+ mutex_unlock(&cache_mutex);
+
+ return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+ &dev_attr_max_recycle_threshold.attr,
+ NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+ .attrs = intel_cqm_attrs,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+ &intel_cqm_events_group,
+ &intel_cqm_format_group,
+ &intel_cqm_group,
+ NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+ .hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+ .attr_groups = intel_cqm_attr_groups,
+ .task_ctx_nr = perf_sw_context,
+ .event_init = intel_cqm_event_init,
+ .add = intel_cqm_event_add,
+ .del = intel_cqm_event_del,
+ .start = intel_cqm_event_start,
+ .stop = intel_cqm_event_stop,
+ .read = intel_cqm_event_read,
+ .count = intel_cqm_event_count,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ for_each_cpu(i, &cqm_cpumask) {
+ if (phys_id == topology_physical_package_id(i))
+ return; /* already got reader for this socket */
+ }
+
+ cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_prepare(unsigned int cpu)
+{
+ struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ raw_spin_lock_init(&state->lock);
+ state->rmid = 0;
+ state->cnt = 0;
+
+ WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+ WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+ int phys_id = topology_physical_package_id(cpu);
+ int i;
+
+ /*
+ * Is @cpu a designated cqm reader?
+ */
+ if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+ return;
+
+ for_each_online_cpu(i) {
+ if (i == cpu)
+ continue;
+
+ if (phys_id == topology_physical_package_id(i)) {
+ cpumask_set_cpu(i, &cqm_cpumask);
+ break;
+ }
+ }
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_UP_PREPARE:
+ intel_cqm_cpu_prepare(cpu);
+ break;
+ case CPU_DOWN_PREPARE:
+ intel_cqm_cpu_exit(cpu);
+ break;
+ case CPU_STARTING:
+ cqm_pick_event_reader(cpu);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+ { .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+ {}
+};
+
+static int __init intel_cqm_init(void)
+{
+ char *str, scale[20];
+ int i, cpu, ret;
+
+ if (!x86_match_cpu(intel_cqm_match))
+ return -ENODEV;
+
+ cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+ /*
+ * It's possible that not all resources support the same number
+ * of RMIDs. Instead of making scheduling much more complicated
+ * (where we have to match a task's RMID to a cpu that supports
+ * that many RMIDs) just find the minimum RMIDs supported across
+ * all cpus.
+ *
+ * Also, check that the scales match on all cpus.
+ */
+ cpu_notifier_register_begin();
+
+ for_each_online_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->x86_cache_max_rmid < cqm_max_rmid)
+ cqm_max_rmid = c->x86_cache_max_rmid;
+
+ if (c->x86_cache_occ_scale != cqm_l3_scale) {
+ pr_err("Multiple LLC scale values, disabling\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /*
+ * A reasonable upper limit on the max threshold is the number
+ * of lines tagged per RMID if all RMIDs have the same number of
+ * lines tagged in the LLC.
+ *
+ * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+ */
+ __intel_cqm_max_threshold =
+ boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
+ snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+ str = kstrdup(scale, GFP_KERNEL);
+ if (!str) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ event_attr_intel_cqm_llc_scale.event_str = str;
+
+ ret = intel_cqm_setup_rmid_cache();
+ if (ret)
+ goto out;
+
+ for_each_online_cpu(i) {
+ intel_cqm_cpu_prepare(i);
+ cqm_pick_event_reader(i);
+ }
+
+ __perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+ ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+ if (ret)
+ pr_err("Intel CQM perf registration failed: %d\n", ret);
+ else
+ pr_info("Intel CQM monitoring enabled\n");
+
+out:
+ cpu_notifier_register_done();
+
+ return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 073983398364..813f75d71175 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config)
debugctlmsr |= DEBUGCTLMSR_TR;
debugctlmsr |= DEBUGCTLMSR_BTS;
- debugctlmsr |= DEBUGCTLMSR_BTINT;
+ if (config & ARCH_PERFMON_EVENTSEL_INT)
+ debugctlmsr |= DEBUGCTLMSR_BTINT;
if (!(config & ARCH_PERFMON_EVENTSEL_OS))
debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
@@ -557,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
EVENT_CONSTRAINT_END
};
@@ -564,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
EVENT_CONSTRAINT_END
};
@@ -587,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
EVENT_CONSTRAINT_END
};
@@ -602,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
+ /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+ INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
EVENT_CONSTRAINT_END
};
@@ -611,6 +620,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
@@ -622,6 +635,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+ INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+ INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
@@ -633,16 +650,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
- INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
- INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+ INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf), /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+ INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf), /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
/* Allow all events as PEBS with no flags */
INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
EVENT_CONSTRAINT_END
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 58f1a94beaf0..94e5b506caa6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
#define LBR_FAR_BIT 8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT 9 /* enable call stack */
#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
#define LBR_USER (1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
#define LBR_FAR (1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK (1 << LBR_CALL_STACK_BIT)
#define LBR_PLM (LBR_KERNEL | LBR_USER)
@@ -69,33 +71,31 @@ static enum {
#define LBR_FROM_FLAG_IN_TX (1ULL << 62)
#define LBR_FROM_FLAG_ABORT (1ULL << 61)
-#define for_each_branch_sample_type(x) \
- for ((x) = PERF_SAMPLE_BRANCH_USER; \
- (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
/*
* x86control flow change classification
* x86control flow changes include branches, interrupts, traps, faults
*/
enum {
- X86_BR_NONE = 0, /* unknown */
-
- X86_BR_USER = 1 << 0, /* branch target is user */
- X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
-
- X86_BR_CALL = 1 << 2, /* call */
- X86_BR_RET = 1 << 3, /* return */
- X86_BR_SYSCALL = 1 << 4, /* syscall */
- X86_BR_SYSRET = 1 << 5, /* syscall return */
- X86_BR_INT = 1 << 6, /* sw interrupt */
- X86_BR_IRET = 1 << 7, /* return from interrupt */
- X86_BR_JCC = 1 << 8, /* conditional */
- X86_BR_JMP = 1 << 9, /* jump */
- X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
- X86_BR_IND_CALL = 1 << 11,/* indirect calls */
- X86_BR_ABORT = 1 << 12,/* transaction abort */
- X86_BR_IN_TX = 1 << 13,/* in transaction */
- X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_NONE = 0, /* unknown */
+
+ X86_BR_USER = 1 << 0, /* branch target is user */
+ X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
+
+ X86_BR_CALL = 1 << 2, /* call */
+ X86_BR_RET = 1 << 3, /* return */
+ X86_BR_SYSCALL = 1 << 4, /* syscall */
+ X86_BR_SYSRET = 1 << 5, /* syscall return */
+ X86_BR_INT = 1 << 6, /* sw interrupt */
+ X86_BR_IRET = 1 << 7, /* return from interrupt */
+ X86_BR_JCC = 1 << 8, /* conditional */
+ X86_BR_JMP = 1 << 9, /* jump */
+ X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
+ X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+ X86_BR_ABORT = 1 << 12,/* transaction abort */
+ X86_BR_IN_TX = 1 << 13,/* in transaction */
+ X86_BR_NO_TX = 1 << 14,/* not in transaction */
+ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */
+ X86_BR_CALL_STACK = 1 << 16,/* call stack */
};
#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -112,13 +112,15 @@ enum {
X86_BR_JMP |\
X86_BR_IRQ |\
X86_BR_ABORT |\
- X86_BR_IND_CALL)
+ X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL)
#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
#define X86_BR_ANY_CALL \
(X86_BR_CALL |\
X86_BR_IND_CALL |\
+ X86_BR_ZERO_CALL |\
X86_BR_SYSCALL |\
X86_BR_IRQ |\
X86_BR_INT)
@@ -130,17 +132,32 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
* otherwise it becomes near impossible to get a reliable stack.
*/
-static void __intel_pmu_lbr_enable(void)
+static void __intel_pmu_lbr_enable(bool pmi)
{
- u64 debugctl;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ u64 debugctl, lbr_select = 0, orig_debugctl;
- if (cpuc->lbr_sel)
- wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
+ /*
+ * No need to reprogram LBR_SELECT in a PMI, as it
+ * did not change.
+ */
+ if (cpuc->lbr_sel && !pmi) {
+ lbr_select = cpuc->lbr_sel->config;
+ wrmsrl(MSR_LBR_SELECT, lbr_select);
+ }
rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
- debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
- wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ orig_debugctl = debugctl;
+ debugctl |= DEBUGCTLMSR_LBR;
+ /*
+ * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+ * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+ * may cause superfluous increase/decrease of LBR_TOS.
+ */
+ if (!(lbr_select & LBR_CALL_STACK))
+ debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+ if (orig_debugctl != debugctl)
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
static void __intel_pmu_lbr_disable(void)
@@ -181,9 +198,116 @@ void intel_pmu_lbr_reset(void)
intel_pmu_lbr_reset_64();
}
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+ u64 tos;
+
+ rdmsrl(x86_pmu.lbr_tos, tos);
+ return tos;
+}
+
+enum {
+ LBR_NONE,
+ LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0 ||
+ task_ctx->lbr_stack_state == LBR_NONE) {
+ intel_pmu_lbr_reset();
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+ int i;
+ unsigned lbr_idx, mask;
+ u64 tos;
+
+ if (task_ctx->lbr_callstack_users == 0) {
+ task_ctx->lbr_stack_state = LBR_NONE;
+ return;
+ }
+
+ mask = x86_pmu.lbr_nr - 1;
+ tos = intel_pmu_lbr_tos();
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ lbr_idx = (tos - i) & mask;
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+ rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+ }
+ task_ctx->lbr_stack_state = LBR_VALID;
+}
+
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ /*
+ * If LBR callstack feature is enabled and the stack was saved when
+ * the task was scheduled out, restore the stack. Otherwise flush
+ * the LBR stack.
+ */
+ task_ctx = ctx ? ctx->task_ctx_data : NULL;
+ if (task_ctx) {
+ if (sched_in) {
+ __intel_pmu_lbr_restore(task_ctx);
+ cpuc->lbr_context = ctx;
+ } else {
+ __intel_pmu_lbr_save(task_ctx);
+ }
+ return;
+ }
+
+ /*
+ * When sampling the branck stack in system-wide, it may be
+ * necessary to flush the stack on context switch. This happens
+ * when the branch stack does not tag its entries with the pid
+ * of the current task. Otherwise it becomes impossible to
+ * associate a branch entry with a task. This ambiguity is more
+ * likely to appear when the branch stack supports priv level
+ * filtering and the user sets it to monitor only at the user
+ * level (which could be a useful measurement in system-wide
+ * mode). In that case, the risk is high of having a branch
+ * stack with branch from multiple tasks.
+ */
+ if (sched_in) {
+ intel_pmu_lbr_reset();
+ cpuc->lbr_context = ctx;
+ }
+}
+
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+ return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
void intel_pmu_lbr_enable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
@@ -198,18 +322,33 @@ void intel_pmu_lbr_enable(struct perf_event *event)
}
cpuc->br_sel = event->hw.branch_reg.reg;
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users++;
+ }
+
cpuc->lbr_users++;
+ perf_sched_cb_inc(event->ctx->pmu);
}
void intel_pmu_lbr_disable(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+ struct x86_perf_task_context *task_ctx;
if (!x86_pmu.lbr_nr)
return;
+ if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+ event->ctx->task_ctx_data) {
+ task_ctx = event->ctx->task_ctx_data;
+ task_ctx->lbr_callstack_users--;
+ }
+
cpuc->lbr_users--;
WARN_ON_ONCE(cpuc->lbr_users < 0);
+ perf_sched_cb_dec(event->ctx->pmu);
if (cpuc->enabled && !cpuc->lbr_users) {
__intel_pmu_lbr_disable();
@@ -218,12 +357,12 @@ void intel_pmu_lbr_disable(struct perf_event *event)
}
}
-void intel_pmu_lbr_enable_all(void)
+void intel_pmu_lbr_enable_all(bool pmi)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
if (cpuc->lbr_users)
- __intel_pmu_lbr_enable();
+ __intel_pmu_lbr_enable(pmi);
}
void intel_pmu_lbr_disable_all(void)
@@ -234,18 +373,6 @@ void intel_pmu_lbr_disable_all(void)
__intel_pmu_lbr_disable();
}
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
- u64 tos;
-
- rdmsrl(x86_pmu.lbr_tos, tos);
-
- return tos;
-}
-
static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -350,7 +477,7 @@ void intel_pmu_lbr_read(void)
* - in case there is no HW filter
* - in case the HW filter has errata or limitations
*/
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
u64 br_type = event->attr.branch_sample_type;
int mask = 0;
@@ -387,11 +514,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
if (br_type & PERF_SAMPLE_BRANCH_COND)
mask |= X86_BR_JCC;
+ if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+ if (!x86_pmu_has_lbr_callstack())
+ return -EOPNOTSUPP;
+ if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+ return -EINVAL;
+ mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+ X86_BR_CALL_STACK;
+ }
+
/*
* stash actual user request into reg, it may
* be used by fixup code for some CPU
*/
event->hw.branch_reg.reg = mask;
+ return 0;
}
/*
@@ -403,14 +540,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
struct hw_perf_event_extra *reg;
u64 br_type = event->attr.branch_sample_type;
- u64 mask = 0, m;
- u64 v;
+ u64 mask = 0, v;
+ int i;
- for_each_branch_sample_type(m) {
- if (!(br_type & m))
+ for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
+ if (!(br_type & (1ULL << i)))
continue;
- v = x86_pmu.lbr_sel_map[m];
+ v = x86_pmu.lbr_sel_map[i];
if (v == LBR_NOT_SUPP)
return -EOPNOTSUPP;
@@ -420,8 +557,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
reg = &event->hw.branch_reg;
reg->idx = EXTRA_REG_LBR;
- /* LBR_SELECT operates in suppress mode so invert mask */
- reg->config = ~mask & x86_pmu.lbr_sel_mask;
+ /*
+ * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+ * in suppress mode. So LBR_SELECT should be set to
+ * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+ */
+ reg->config = mask ^ x86_pmu.lbr_sel_mask;
return 0;
}
@@ -439,7 +580,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
/*
* setup SW LBR filter
*/
- intel_pmu_setup_sw_lbr_filter(event);
+ ret = intel_pmu_setup_sw_lbr_filter(event);
+ if (ret)
+ return ret;
/*
* setup HW LBR filter, if any
@@ -568,6 +711,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
ret = X86_BR_INT;
break;
case 0xe8: /* call near rel */
+ insn_get_immediate(&insn);
+ if (insn.immediate1.value == 0) {
+ /* zero length call */
+ ret = X86_BR_ZERO_CALL;
+ break;
+ }
case 0x9a: /* call far absolute */
ret = X86_BR_CALL;
break;
@@ -678,35 +827,49 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
/*
* Map interface branch filters onto LBR filters
*/
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
- | LBR_IND_JMP | LBR_FAR,
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_REL_JMP
+ | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
*/
- [PERF_SAMPLE_BRANCH_ANY_CALL] =
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
/*
* NHM/WSM erratum: must include IND_JMP to capture IND_CALL
*/
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
- [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
};
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
- [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
- [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
- [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
- [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
- [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
- [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
- | LBR_FAR,
- [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
- [PERF_SAMPLE_BRANCH_COND] = LBR_JCC,
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+};
+
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+ [PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_ANY,
+ [PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_USER,
+ [PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_KERNEL,
+ [PERF_SAMPLE_BRANCH_HV_SHIFT] = LBR_IGN,
+ [PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_RETURN | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_FAR,
+ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL,
+ [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC,
+ [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL
+ | LBR_RETURN | LBR_CALL_STACK,
};
/* core */
@@ -765,6 +928,20 @@ void __init intel_pmu_lbr_init_snb(void)
pr_cont("16-deep LBR, ");
}
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = MSR_LBR_TOS;
+ x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+ x86_pmu.lbr_to = MSR_LBR_NHM_TO;
+
+ x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+ x86_pmu.lbr_sel_map = hsw_lbr_sel_map;
+
+ pr_cont("16-deep LBR, ");
+}
+
/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..ffe666c2c6b5
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1100 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+
+#include "perf_event.h"
+#include "intel_pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+ CR_EAX = 0,
+ CR_ECX,
+ CR_EDX,
+ CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m) \
+ [PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l, \
+ .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+ const char *name;
+ u32 leaf;
+ u8 reg;
+ u32 mask;
+} pt_caps[] = {
+ PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff),
+ PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)),
+ PT_CAP(topa_output, 0, CR_ECX, BIT(0)),
+ PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)),
+ PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+ struct pt_cap_desc *cd = &pt_caps[cap];
+ u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+ unsigned int shift = __ffs(cd->mask);
+
+ return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct dev_ext_attribute *ea =
+ container_of(attr, struct dev_ext_attribute, attr);
+ enum pt_capabilities cap = (long)ea->var;
+
+ return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+ .name = "caps",
+};
+
+PMU_FORMAT_ATTR(tsc, "config:10" );
+PMU_FORMAT_ATTR(noretcomp, "config:11" );
+
+static struct attribute *pt_formats_attr[] = {
+ &format_attr_tsc.attr,
+ &format_attr_noretcomp.attr,
+ NULL,
+};
+
+static struct attribute_group pt_format_group = {
+ .name = "format",
+ .attrs = pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+ &pt_cap_group,
+ &pt_format_group,
+ NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+ struct dev_ext_attribute *de_attrs;
+ struct attribute **attrs;
+ size_t size;
+ int ret;
+ long i;
+
+ attrs = NULL;
+ ret = -ENODEV;
+ if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+ goto fail;
+
+ for (i = 0; i < PT_CPUID_LEAVES; i++) {
+ cpuid_count(20, i,
+ &pt_pmu.caps[CR_EAX + i*4],
+ &pt_pmu.caps[CR_EBX + i*4],
+ &pt_pmu.caps[CR_ECX + i*4],
+ &pt_pmu.caps[CR_EDX + i*4]);
+ }
+
+ ret = -ENOMEM;
+ size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
+ attrs = kzalloc(size, GFP_KERNEL);
+ if (!attrs)
+ goto fail;
+
+ size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
+ de_attrs = kzalloc(size, GFP_KERNEL);
+ if (!de_attrs)
+ goto fail;
+
+ for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+ struct dev_ext_attribute *de_attr = de_attrs + i;
+
+ de_attr->attr.attr.name = pt_caps[i].name;
+
+ sysfs_attr_init(&de_attrs->attr.attr);
+
+ de_attr->attr.attr.mode = S_IRUGO;
+ de_attr->attr.show = pt_cap_show;
+ de_attr->var = (void *)i;
+
+ attrs[i] = &de_attr->attr.attr;
+ }
+
+ pt_cap_group.attrs = attrs;
+
+ return 0;
+
+fail:
+ kfree(attrs);
+
+ return ret;
+}
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+ u64 config = event->attr.config;
+
+ if ((config & PT_CONFIG_MASK) != config)
+ return false;
+
+ return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static bool pt_is_running(void)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ return !!(ctl & RTIT_CTL_TRACEEN);
+}
+
+static void pt_config(struct perf_event *event)
+{
+ u64 reg;
+
+ reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+ if (!event->attr.exclude_kernel)
+ reg |= RTIT_CTL_OS;
+ if (!event->attr.exclude_user)
+ reg |= RTIT_CTL_USR;
+
+ reg |= (event->attr.config & PT_CONFIG_MASK);
+
+ wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+ u64 ctl;
+
+ rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+ if (start)
+ ctl |= RTIT_CTL_TRACEEN;
+ else
+ ctl &= ~RTIT_CTL_TRACEEN;
+ wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+ /*
+ * A wrmsr that disables trace generation serializes other PT
+ * registers and causes all data packets to be written to memory,
+ * but a fence is required for the data to become globally visible.
+ *
+ * The below WMB, separating data store and aux_head store matches
+ * the consumer's RMB that separates aux_head load and data load.
+ */
+ if (!start)
+ wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+ unsigned int output_off)
+{
+ u64 reg;
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+ reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table: actual ToPA table entries, as understood by PT hardware
+ * @list: linkage to struct pt_buffer's list of tables
+ * @phys: physical address of this page
+ * @offset: offset of the first entry in this table in the buffer
+ * @size: total size of all entries in this table
+ * @last: index of the last initialized entry in this table
+ */
+struct topa {
+ struct topa_entry table[TENTS_PER_PAGE];
+ struct list_head list;
+ u64 phys;
+ u64 offset;
+ size_t size;
+ int last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu: CPU on which to allocate.
+ * @gfp: Allocation flags.
+ *
+ * Return: On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+ int node = cpu_to_node(cpu);
+ struct topa *topa;
+ struct page *p;
+
+ p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+ if (!p)
+ return NULL;
+
+ topa = page_address(p);
+ topa->last = 0;
+ topa->phys = page_to_phys(p);
+
+ /*
+ * In case of singe-entry ToPA, always put the self-referencing END
+ * link as the 2nd entry in the table
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, 1)->end = 1;
+ }
+
+ return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa: Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+ free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf: PT buffer that's being extended.
+ * @topa: New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+ struct topa *last = buf->last;
+
+ list_add_tail(&topa->list, &buf->tables);
+
+ if (!buf->first) {
+ buf->first = buf->last = buf->cur = topa;
+ return;
+ }
+
+ topa->offset = last->offset + last->size;
+ buf->last = topa;
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return;
+
+ BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+ TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa: ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+ /* single-entry ToPA is a special case */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return !!topa->last;
+
+ return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf: PT buffer being initialized.
+ * @gfp: Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return: 0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+ struct topa *topa = buf->last;
+ int order = 0;
+ struct page *p;
+
+ p = virt_to_page(buf->data_pages[buf->nr_pages]);
+ if (PagePrivate(p))
+ order = page_private(p);
+
+ if (topa_table_full(topa)) {
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+ }
+
+ TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+ TOPA_ENTRY(topa, -1)->size = order;
+ if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(topa, -1)->intr = 1;
+ TOPA_ENTRY(topa, -1)->stop = 1;
+ }
+
+ topa->last++;
+ topa->size += sizes(order);
+
+ buf->nr_pages += 1ul << order;
+
+ return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf: PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+ struct topa *topa;
+
+ list_for_each_entry(topa, &buf->tables, list) {
+ int i;
+
+ pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+ topa->phys, topa->offset, topa->size);
+ for (i = 0; i < TENTS_PER_PAGE; i++) {
+ pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+ &topa->table[i],
+ (unsigned long)topa->table[i].base << TOPA_SHIFT,
+ sizes(topa->table[i].size),
+ topa->table[i].end ? 'E' : ' ',
+ topa->table[i].intr ? 'I' : ' ',
+ topa->table[i].stop ? 'S' : ' ',
+ *(u64 *)&topa->table[i]);
+ if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+ topa->table[i].stop) ||
+ topa->table[i].end)
+ break;
+ }
+ }
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf: PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+ buf->output_off = 0;
+ buf->cur_idx++;
+
+ if (buf->cur_idx == buf->cur->last) {
+ if (buf->cur == buf->last)
+ buf->cur = buf->first;
+ else
+ buf->cur = list_entry(buf->cur->list.next, struct topa,
+ list);
+ buf->cur_idx = 0;
+ }
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt: Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ u64 topa_idx, base, old;
+
+ /* offset of the first region in this table from the beginning of buf */
+ base = buf->cur->offset + buf->output_off;
+
+ /* offset of the current output region within this table */
+ for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+ base += sizes(buf->cur->table[topa_idx].size);
+
+ if (buf->snapshot) {
+ local_set(&buf->data_size, base);
+ } else {
+ old = (local64_xchg(&buf->head, base) &
+ ((buf->nr_pages << PAGE_SHIFT) - 1));
+ if (base < old)
+ base += buf->nr_pages << PAGE_SHIFT;
+
+ local_add(base - old, &buf->data_size);
+ }
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf: PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+ return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf: PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+ return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt: Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+ int advance = 0;
+ u64 status;
+
+ rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+ if (status & RTIT_STATUS_ERROR) {
+ pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+ pt_topa_dump(buf);
+ status &= ~RTIT_STATUS_ERROR;
+ }
+
+ if (status & RTIT_STATUS_STOPPED) {
+ status &= ~RTIT_STATUS_STOPPED;
+
+ /*
+ * On systems that only do single-entry ToPA, hitting STOP
+ * means we are already losing data; need to let the decoder
+ * know.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+ buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+ local_inc(&buf->lost);
+ advance++;
+ }
+ }
+
+ /*
+ * Also on single-entry ToPA implementations, interrupt will come
+ * before the output reaches its output region's boundary.
+ */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+ pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+ void *head = pt_buffer_region(buf);
+
+ /* everything within this margin needs to be zeroed out */
+ memset(head + buf->output_off, 0,
+ pt_buffer_region_size(buf) -
+ buf->output_off);
+ advance++;
+ }
+
+ if (advance)
+ pt_buffer_advance(buf);
+
+ wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf: PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+ u64 offset, base_topa;
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+ buf->cur = phys_to_virt(base_topa);
+
+ rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+ /* offset within current output region */
+ buf->output_off = offset >> 32;
+ /* index of current output region within this table */
+ buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf: PT buffer.
+ * @pg: Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+ struct topa_entry *te = buf->topa_index[pg];
+
+ /* one region */
+ if (buf->first == buf->last && buf->first->last == 1)
+ return pg;
+
+ do {
+ pg++;
+ pg &= buf->nr_pages - 1;
+ } while (buf->topa_index[pg] == te);
+
+ return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf: PT buffer.
+ * @handle: Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+ struct perf_output_handle *handle)
+
+{
+ unsigned long idx, npages, end;
+
+ if (buf->snapshot)
+ return 0;
+
+ /* can't stop in the middle of an output region */
+ if (buf->output_off + handle->size + 1 <
+ sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+ return -EINVAL;
+
+
+ /* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ return 0;
+
+ /* clear STOP and INT from current entry */
+ buf->topa_index[buf->stop_pos]->stop = 0;
+ buf->topa_index[buf->intr_pos]->intr = 0;
+
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ npages = (handle->size + 1) >> PAGE_SHIFT;
+ end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
+ /*if (end > handle->wakeup >> PAGE_SHIFT)
+ end = handle->wakeup >> PAGE_SHIFT;*/
+ idx = end & (buf->nr_pages - 1);
+ buf->stop_pos = idx;
+ idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
+ idx &= buf->nr_pages - 1;
+ buf->intr_pos = idx;
+ }
+
+ buf->topa_index[buf->stop_pos]->stop = 1;
+ buf->topa_index[buf->intr_pos]->intr = 1;
+
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf: PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+ struct topa *cur = buf->first, *prev = buf->last;
+ struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+ *te_prev = TOPA_ENTRY(prev, prev->last - 1);
+ int pg = 0, idx = 0, ntopa = 0;
+
+ while (pg < buf->nr_pages) {
+ int tidx;
+
+ /* pages within one topa entry */
+ for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+ buf->topa_index[pg] = te_prev;
+
+ te_prev = te_cur;
+
+ if (idx == cur->last - 1) {
+ /* advance to next topa table */
+ idx = 0;
+ cur = list_entry(cur->list.next, struct topa, list);
+ ntopa++;
+ } else
+ idx++;
+ te_cur = TOPA_ENTRY(cur, idx);
+ }
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf: PT buffer.
+ * @head: Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly.
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+ int pg;
+
+ if (buf->snapshot)
+ head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+ pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+ pg = pt_topa_next_entry(buf, pg);
+
+ buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+ buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+ (unsigned long)buf->cur) / sizeof(struct topa_entry);
+ buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+ local64_set(&buf->head, head);
+ local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf: PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+ struct topa *topa, *iter;
+
+ list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+ /*
+ * right now, this is in free_aux() path only, so
+ * no need to unlink this table from the list
+ */
+ topa_free(topa);
+ }
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf: PT buffer.
+ * @size: Total size of all regions within this ToPA.
+ * @gfp: Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+ gfp_t gfp)
+{
+ struct topa *topa;
+ int err;
+
+ topa = topa_alloc(buf->cpu, gfp);
+ if (!topa)
+ return -ENOMEM;
+
+ topa_insert_table(buf, topa);
+
+ while (buf->nr_pages < nr_pages) {
+ err = topa_insert_pages(buf, gfp);
+ if (err) {
+ pt_buffer_fini_topa(buf);
+ return -ENOMEM;
+ }
+ }
+
+ pt_buffer_setup_topa_index(buf);
+
+ /* link last table to the first one, unless we're double buffering */
+ if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+ TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+ TOPA_ENTRY(buf->last, -1)->end = 1;
+ }
+
+ pt_topa_dump(buf);
+ return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu: Cpu on which to allocate, -1 means current.
+ * @pages: Array of pointers to buffer pages passed from perf core.
+ * @nr_pages: Number of pages in the buffer.
+ * @snapshot: If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return: Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+ struct pt_buffer *buf;
+ int node, ret;
+
+ if (!nr_pages)
+ return NULL;
+
+ if (cpu == -1)
+ cpu = raw_smp_processor_id();
+ node = cpu_to_node(cpu);
+
+ buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+ GFP_KERNEL, node);
+ if (!buf)
+ return NULL;
+
+ buf->cpu = cpu;
+ buf->snapshot = snapshot;
+ buf->data_pages = pages;
+
+ INIT_LIST_HEAD(&buf->tables);
+
+ ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+ if (ret) {
+ kfree(buf);
+ return NULL;
+ }
+
+ return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data: PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+ struct pt_buffer *buf = data;
+
+ pt_buffer_fini_topa(buf);
+ kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf: PT buffer.
+ * @pt: Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+ if (buf->snapshot)
+ return false;
+
+ if (local_read(&buf->data_size) >= pt->handle.size)
+ return true;
+
+ return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+ struct perf_event *event = pt->handle.event;
+
+ /*
+ * There may be a dangling PT bit in the interrupt status register
+ * after PT has been disabled by pt_event_stop(). Make sure we don't
+ * do anything (particularly, re-enable) for this event here.
+ */
+ if (!ACCESS_ONCE(pt->handle_nmi))
+ return;
+
+ pt_config_start(false);
+
+ if (!event)
+ return;
+
+ buf = perf_get_aux(&pt->handle);
+ if (!buf)
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+ local_xchg(&buf->lost, 0));
+
+ if (!event->hw.state) {
+ int ret;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ if (!buf) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret) {
+ perf_aux_output_end(&pt->handle, 0, true);
+ return;
+ }
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ pt_config(event);
+ }
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+ event->hw.state = PERF_HES_STOPPED;
+ return;
+ }
+
+ ACCESS_ONCE(pt->handle_nmi) = 1;
+ event->hw.state = 0;
+
+ pt_config_buffer(buf->cur->table, buf->cur_idx,
+ buf->output_off);
+ wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+ pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+ /*
+ * Protect against the PMI racing with disabling wrmsr,
+ * see comment in intel_pt_interrupt().
+ */
+ ACCESS_ONCE(pt->handle_nmi) = 0;
+ pt_config_start(false);
+
+ if (event->hw.state == PERF_HES_STOPPED)
+ return;
+
+ event->hw.state = PERF_HES_STOPPED;
+
+ if (mode & PERF_EF_UPDATE) {
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (!buf)
+ return;
+
+ if (WARN_ON_ONCE(pt->handle.event != event))
+ return;
+
+ pt_read_offset(buf);
+
+ pt_handle_status(pt);
+
+ pt_update_head(pt);
+ }
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct pt_buffer *buf;
+
+ pt_event_stop(event, PERF_EF_UPDATE);
+
+ buf = perf_get_aux(&pt->handle);
+
+ if (buf) {
+ if (buf->snapshot)
+ pt->handle.head =
+ local_xchg(&buf->data_size,
+ buf->nr_pages << PAGE_SHIFT);
+ perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+ local_xchg(&buf->lost, 0));
+ }
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+ struct pt_buffer *buf;
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct hw_perf_event *hwc = &event->hw;
+ int ret = -EBUSY;
+
+ if (pt->handle.event)
+ goto fail;
+
+ buf = perf_aux_output_begin(&pt->handle, event);
+ ret = -EINVAL;
+ if (!buf)
+ goto fail_stop;
+
+ pt_buffer_reset_offsets(buf, pt->handle.head);
+ if (!buf->snapshot) {
+ ret = pt_buffer_reset_markers(buf, &pt->handle);
+ if (ret)
+ goto fail_end_stop;
+ }
+
+ if (mode & PERF_EF_START) {
+ pt_event_start(event, 0);
+ ret = -EBUSY;
+ if (hwc->state == PERF_HES_STOPPED)
+ goto fail_end_stop;
+ } else {
+ hwc->state = PERF_HES_STOPPED;
+ }
+
+ return 0;
+
+fail_end_stop:
+ perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+ hwc->state = PERF_HES_STOPPED;
+fail:
+ return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+ x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+ if (event->attr.type != pt_pmu.pmu.type)
+ return -ENOENT;
+
+ if (!pt_event_valid(event))
+ return -EINVAL;
+
+ if (x86_add_exclusive(x86_lbr_exclusive_pt))
+ return -EBUSY;
+
+ event->destroy = pt_event_destroy;
+
+ return 0;
+}
+
+static __init int pt_init(void)
+{
+ int ret, cpu, prior_warn = 0;
+
+ BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ u64 ctl;
+
+ ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+ if (!ret && (ctl & RTIT_CTL_TRACEEN))
+ prior_warn++;
+ }
+ put_online_cpus();
+
+ if (prior_warn) {
+ x86_add_exclusive(x86_lbr_exclusive_pt);
+ pr_warn("PT is enabled at boot time, doing nothing\n");
+
+ return -EBUSY;
+ }
+
+ ret = pt_pmu_hw_init();
+ if (ret)
+ return ret;
+
+ if (!pt_cap_get(PT_CAP_topa_output)) {
+ pr_warn("ToPA output is not supported on this CPU\n");
+ return -ENODEV;
+ }
+
+ if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+ pt_pmu.pmu.capabilities =
+ PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+ pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+ pt_pmu.pmu.attr_groups = pt_attr_groups;
+ pt_pmu.pmu.task_ctx_nr = perf_sw_context;
+ pt_pmu.pmu.event_init = pt_event_init;
+ pt_pmu.pmu.add = pt_event_add;
+ pt_pmu.pmu.del = pt_event_del;
+ pt_pmu.pmu.start = pt_event_start;
+ pt_pmu.pmu.stop = pt_event_stop;
+ pt_pmu.pmu.read = pt_event_read;
+ pt_pmu.pmu.setup_aux = pt_buffer_setup_aux;
+ pt_pmu.pmu.free_aux = pt_buffer_free_aux;
+ ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+ return ret;
+}
+
+module_init(pt_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index c4bb8b8e5017..999289b94025 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -62,6 +62,14 @@
#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
+#define NR_RAPL_DOMAINS 0x4
+static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+ "pp0-core",
+ "package",
+ "dram",
+ "pp1-gpu",
+};
+
/* Clients have PP0, PKG */
#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
1<<RAPL_IDX_PKG_NRG_STAT|\
@@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v = { \
struct rapl_pmu {
spinlock_t lock;
- int hw_unit; /* 1/2^hw_unit Joule */
int n_active; /* number of active events */
struct list_head active_list;
struct pmu *pmu; /* pointer to rapl_pmu_class */
@@ -120,6 +127,7 @@ struct rapl_pmu {
struct hrtimer hrtimer;
};
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */
static struct pmu rapl_pmu_class;
static cpumask_t rapl_cpu_mask;
static int rapl_cntr_mask;
@@ -127,6 +135,7 @@ static int rapl_cntr_mask;
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
+static struct x86_pmu_quirk *rapl_quirks;
static inline u64 rapl_read_counter(struct perf_event *event)
{
u64 raw;
@@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct perf_event *event)
return raw;
}
-static inline u64 rapl_scale(u64 v)
+#define rapl_add_quirk(func_) \
+do { \
+ static struct x86_pmu_quirk __quirk __initdata = { \
+ .func = func_, \
+ }; \
+ __quirk.next = rapl_quirks; \
+ rapl_quirks = &__quirk; \
+} while (0)
+
+static inline u64 rapl_scale(u64 v, int cfg)
{
+ if (cfg > NR_RAPL_DOMAINS) {
+ pr_warn("invalid domain %d, failed to scale data\n", cfg);
+ return v;
+ }
/*
* scale delta to smallest unit (1/2^32)
* users must then scale back: count * 1/(1e9*2^32) to get Joules
* or use ldexp(count, -32).
* Watts = Joules/Time delta
*/
- return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
+ return v << (32 - rapl_hw_unit[cfg - 1]);
}
static u64 rapl_event_update(struct perf_event *event)
@@ -173,7 +195,7 @@ again:
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
- sdelta = rapl_scale(delta);
+ sdelta = rapl_scale(delta, event->hw.config);
local64_add(sdelta, &event->count);
@@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu)
cpumask_set_cpu(cpu, &rapl_cpu_mask);
}
+static __init void rapl_hsw_server_quirk(void)
+{
+ /*
+ * DRAM domain on HSW server has fixed energy unit which can be
+ * different than the unit from power unit MSR.
+ * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+ * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+ */
+ rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+}
+
static int rapl_cpu_prepare(int cpu)
{
struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
int phys_id = topology_physical_package_id(cpu);
u64 ms;
- u64 msr_rapl_power_unit_bits;
if (pmu)
return 0;
@@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu)
if (phys_id < 0)
return -1;
- /* protect rdmsrl() to handle virtualization */
- if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
- return -1;
-
pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
if (!pmu)
return -1;
-
spin_lock_init(&pmu->lock);
INIT_LIST_HEAD(&pmu->active_list);
- /*
- * grab power unit as: 1/2^unit Joules
- *
- * we cache in local PMU instance
- */
- pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
pmu->pmu = &rapl_pmu_class;
/*
@@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu)
* divide interval by 2 to avoid lockstep (2 * 100)
* if hw unit is 32, then we use 2 ms 1/200/2
*/
- if (pmu->hw_unit < 32)
- ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+ if (rapl_hw_unit[0] < 32)
+ ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
else
ms = 2;
@@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self,
return NOTIFY_OK;
}
+static int rapl_check_hw_unit(void)
+{
+ u64 msr_rapl_power_unit_bits;
+ int i;
+
+ /* protect rdmsrl() to handle virtualization */
+ if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+ return -1;
+ for (i = 0; i < NR_RAPL_DOMAINS; i++)
+ rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+ return 0;
+}
+
static const struct x86_cpu_id rapl_cpu_match[] = {
[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
[1] = {},
@@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void)
{
struct rapl_pmu *pmu;
int cpu, ret;
+ struct x86_pmu_quirk *quirk;
+ int i;
/*
* check for Intel processor family 6
@@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void)
rapl_cntr_mask = RAPL_IDX_CLN;
rapl_pmu_events_group.attrs = rapl_events_cln_attr;
break;
+ case 63: /* Haswell-Server */
+ rapl_add_quirk(rapl_hsw_server_quirk);
+ rapl_cntr_mask = RAPL_IDX_SRV;
+ rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+ break;
case 60: /* Haswell */
case 69: /* Haswell-Celeron */
rapl_cntr_mask = RAPL_IDX_HSW;
@@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void)
/* unsupported */
return 0;
}
+ ret = rapl_check_hw_unit();
+ if (ret)
+ return ret;
+ /* run cpu model quirks */
+ for (quirk = rapl_quirks; quirk; quirk = quirk->next)
+ quirk->func();
cpu_notifier_register_begin();
for_each_online_cpu(cpu) {
@@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void)
pmu = __this_cpu_read(rapl_pmu);
- pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+ pr_info("RAPL PMU detected,"
" API unit is 2^-32 Joules,"
" %d fixed counters"
" %llu ms ovfl timer\n",
- pmu->hw_unit,
hweight32(rapl_cntr_mask),
ktime_to_ms(pmu->timer_interval));
-
+ for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+ if (rapl_cntr_mask & (1 << i)) {
+ pr_info("hw unit of domain %s 2^-%d Joules\n",
+ rapl_domain_names[i], rapl_hw_unit[i]);
+ }
+ }
out:
cpu_notifier_register_done();
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
index 3001015b755c..4562e9e22c60 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -1,6 +1,13 @@
/* Nehalem/SandBridge/Haswell uncore support */
#include "perf_event_intel_uncore.h"
+/* Uncore IMC PCI IDs */
+#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100
+#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154
+#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150
+#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00
+#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04
+
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
@@ -472,6 +479,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ },
};
@@ -502,6 +513,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */
IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */
IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */
+ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */
{ /* end marker */ }
};
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 21af6149edf2..12d9548457e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid)
}
}
- if (ubox_dev)
- pci_dev_put(ubox_dev);
+ pci_dev_put(ubox_dev);
return err ? pcibios_err_to_errno(err) : 0;
}
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 60639093d536..3d423a101fae 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
{ X86_FEATURE_HWP_ACT_WINDOW, CR_EAX, 9, 0x00000006, 0 },
{ X86_FEATURE_HWP_EPP, CR_EAX,10, 0x00000006, 0 },
{ X86_FEATURE_HWP_PKG_REQ, CR_EAX,11, 0x00000006, 0 },
+ { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 },
{ X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
{ X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
{ X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7d46bb260334..e2ce85db2283 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
case E820_UNUSABLE:
printk(KERN_CONT "unusable");
break;
+ case E820_PRAM:
+ printk(KERN_CONT "persistent (type %u)", type);
+ break;
default:
printk(KERN_CONT "type %u", type);
break;
@@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
* continue building up new bios map based on this
* information
*/
- if (current_type != last_type) {
+ if (current_type != last_type || current_type == E820_PRAM) {
if (last_type != 0) {
new_bios[new_bios_entry].size =
change_point[chgidx]->addr - last_addr;
@@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
register_nosave_region(pfn, PFN_UP(ei->addr));
pfn = PFN_DOWN(ei->addr + ei->size);
+
if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
register_nosave_region(PFN_UP(ei->addr), pfn);
@@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
/*
* Find the highest page frame number we have available
*/
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
{
int i;
unsigned long last_pfn = 0;
@@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
unsigned long start_pfn;
unsigned long end_pfn;
- if (ei->type != type)
+ /*
+ * Persistent memory is accounted as ram for purposes of
+ * establishing max_pfn and mem_map.
+ */
+ if (ei->type != E820_RAM && ei->type != E820_PRAM)
continue;
start_pfn = ei->addr >> PAGE_SHIFT;
@@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
}
unsigned long __init e820_end_of_ram_pfn(void)
{
- return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+ return e820_end_pfn(MAX_ARCH_PFN);
}
unsigned long __init e820_end_of_low_ram_pfn(void)
{
- return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+ return e820_end_pfn(1UL << (32-PAGE_SHIFT));
}
static void early_panic(char *msg)
@@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p)
} else if (*p == '$') {
start_at = memparse(p+1, &p);
e820_add_region(start_at, mem_size, E820_RESERVED);
+ } else if (*p == '!') {
+ start_at = memparse(p+1, &p);
+ e820_add_region(start_at, mem_size, E820_PRAM);
} else
e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
@@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type)
case E820_ACPI: return "ACPI Tables";
case E820_NVS: return "ACPI Non-volatile Storage";
case E820_UNUSABLE: return "Unusable memory";
+ case E820_PRAM: return "Persistent RAM";
default: return "reserved";
}
}
@@ -940,7 +952,9 @@ void __init e820_reserve_resources(void)
* pci device BAR resource and insert them later in
* pcibios_resource_survey()
*/
- if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
+ if (((e820.map[i].type != E820_RESERVED) &&
+ (e820.map[i].type != E820_PRAM)) ||
+ res->start < (1ULL<<20)) {
res->flags |= IORESOURCE_BUSY;
insert_resource(&iomem_resource, res);
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 49ff55ef9b26..89427d8d4fc5 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -375,12 +375,6 @@ static int __init setup_early_printk(char *buf)
if (!strncmp(buf, "xen", 3))
early_console_register(&xenboot_console, keep);
#endif
-#ifdef CONFIG_EARLY_PRINTK_INTEL_MID
- if (!strncmp(buf, "hsu", 3)) {
- hsu_early_console_init(buf + 3);
- early_console_register(&early_hsu_console, keep);
- }
-#endif
#ifdef CONFIG_EARLY_PRINTK_EFI
if (!strncmp(buf, "efi", 3))
early_console_register(&early_efi_console, keep);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 60705b032521..7423e3e2f5c5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,6 +295,15 @@ system_call_fastpath:
* rflags from r11 (but RF and VM bits are forced to 0),
* cs and ss are loaded from MSRs.
* Restoration of rflags re-enables interrupts.
+ *
+ * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+ * descriptor is not reinitialized. This means that we should
+ * avoid SYSRET with SS == NULL, which could happen if we schedule,
+ * exit the kernel, and re-enter using an interrupt vector. (All
+ * interrupt entries on x86_64 set SS to NULL.) We prevent that
+ * from happening by reloading SS in __switch_to. (Actually
+ * detecting the failure in 64-bit userspace is tricky but can be
+ * done.)
*/
USERGS_SYSRET64
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 367f39d35e9c..009183276bb7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -341,7 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
+ struct xsave_struct *xsave;
int ret;
if (!cpu_has_xsave)
@@ -351,6 +351,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ xsave = &target->thread.fpu.state->xsave;
+
/*
* Copy the 48bytes defined by the software first into the xstate
* memory layout in the thread struct, so that we can copy the entire
@@ -369,7 +371,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
+ struct xsave_struct *xsave;
int ret;
if (!cpu_has_xsave)
@@ -379,6 +381,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ xsave = &target->thread.fpu.state->xsave;
+
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
/*
* mxcsr reserved bits must be masked to zero for security reasons.
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 24d079604fd5..1deffe6cc873 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src)
{
struct insn insn;
kprobe_opcode_t buf[MAX_INSN_SIZE];
+ int length;
unsigned long recovered_insn =
recover_probed_instruction(buf, (unsigned long)src);
@@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src)
return 0;
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
insn_get_length(&insn);
+ length = insn.length;
+
/* Another subsystem puts a breakpoint, failed to recover */
if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
return 0;
- memcpy(dest, insn.kaddr, insn.length);
+ memcpy(dest, insn.kaddr, length);
#ifdef CONFIG_X86_64
if (insn_rip_relative(&insn)) {
s64 newdisp;
u8 *disp;
- kernel_insn_init(&insn, dest, insn.length);
+ kernel_insn_init(&insn, dest, length);
insn_get_displacement(&insn);
/*
* The copied instruction uses the %rip-relative addressing
@@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src)
*(s32 *) disp = (s32) newdisp;
}
#endif
- return insn.length;
+ return length;
}
static int arch_copy_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e354cc6446ab..9435620062df 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void)
* can get false positives too easily, for example if the host is
* overcommitted.
*/
- watchdog_enable_hardlockup_detector(false);
+ hardlockup_detector_disable();
}
static noinline uint32_t __kvm_cpuid_base(void)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 7563114d9c3a..58bcfb67c01f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -445,7 +445,7 @@ struct pv_mmu_ops pv_mmu_ops = {
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
#ifdef CONFIG_X86_PAE
.set_pte_atomic = native_set_pte_atomic,
.pte_clear = native_pte_clear,
@@ -456,13 +456,13 @@ struct pv_mmu_ops pv_mmu_ops = {
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
.pud_val = PTE_IDENT,
.make_pud = PTE_IDENT,
.set_pgd = native_set_pgd,
#endif
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
.pte_val = PTE_IDENT,
.pgd_val = PTE_IDENT,
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 000000000000..3420c874ddc5
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ */
+#include <linux/memblock.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <asm/e820.h>
+#include <asm/page_types.h>
+#include <asm/setup.h>
+
+static __init void register_pmem_device(struct resource *res)
+{
+ struct platform_device *pdev;
+ int error;
+
+ pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
+ if (!pdev)
+ return;
+
+ error = platform_device_add_resources(pdev, res, 1);
+ if (error)
+ goto out_put_pdev;
+
+ error = platform_device_add(pdev);
+ if (error)
+ goto out_put_pdev;
+ return;
+
+out_put_pdev:
+ dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
+ platform_device_put(pdev);
+}
+
+static __init int register_pmem_devices(void)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+
+ if (ei->type == E820_PRAM) {
+ struct resource res = {
+ .flags = IORESOURCE_MEM,
+ .start = ei->addr,
+ .end = ei->addr + ei->size - 1,
+ };
+ register_pmem_device(&res);
+ }
+ }
+
+ return 0;
+}
+device_initcall(register_pmem_devices);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 8213da62b1b7..6e338e3b1dc0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -57,7 +57,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
};
-EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
+EXPORT_PER_CPU_SYMBOL(cpu_tss);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -156,11 +156,13 @@ void flush_thread(void)
/* FPU state will be reallocated lazily at the first use. */
drop_fpu(tsk);
free_thread_xstate(tsk);
- } else if (!used_math()) {
- /* kthread execs. TODO: cleanup this horror. */
- if (WARN_ON(init_fpu(tsk)))
- force_sig(SIGKILL, tsk);
- user_fpu_begin();
+ } else {
+ if (!tsk_used_math(tsk)) {
+ /* kthread execs. TODO: cleanup this horror. */
+ if (WARN_ON(init_fpu(tsk)))
+ force_sig(SIGKILL, tsk);
+ user_fpu_begin();
+ }
restore_init_xstate();
}
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 4baaa972f52a..ddfdbf74f174 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -419,6 +419,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
__switch_to_xtra(prev_p, next_p, tss);
+ if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
+ /*
+ * AMD CPUs have a misfeature: SYSRET sets the SS selector but
+ * does not update the cached descriptor. As a result, if we
+ * do SYSRET while SS is NULL, we'll end up in user mode with
+ * SS apparently equal to __USER_DS but actually unusable.
+ *
+ * The straightforward workaround would be to fix it up just
+ * before SYSRET, but that would slow down the system call
+ * fast paths. Instead, we ensure that SS is never NULL in
+ * system call context. We do this by replacing NULL SS
+ * selectors at every context switch. SYSCALL sets up a valid
+ * SS, so the only way to get NULL is to re-enter the kernel
+ * from CPL 3 through an interrupt. Since that can't happen
+ * in the same task as a running syscall, we are guaranteed to
+ * context switch between every interrupt vector entry and a
+ * subsequent SYSRET.
+ *
+ * We read SS first because SS reads are much faster than
+ * writes. Out of caution, we force SS to __KERNEL_DS even if
+ * it previously had a different non-NULL value.
+ */
+ unsigned short ss_sel;
+ savesegment(ss, ss_sel);
+ if (ss_sel != __KERNEL_DS)
+ loadsegment(ss, __KERNEL_DS);
+ }
+
return prev_p;
}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index e5ecd20e72dd..2f355d229a58 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
- if (!pvclock_vdso_info) {
- BUG();
- return NULL;
- }
-
- return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
- return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
#ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
- void *v)
-{
- struct task_migration_notifier *mn = v;
- struct pvclock_vsyscall_time_info *pvti;
-
- pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
- /* this is NULL when pvclock vsyscall is not initialized */
- if (unlikely(pvti == NULL))
- return NOTIFY_DONE;
-
- pvti->migrate_count++;
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
- .notifier_call = pvclock_task_migrate,
-};
-
/*
* Initialize the generic pvclock vsyscall state. This will allocate
* a/some page(s) for the per-vcpu pvclock information, set up a
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
- pvclock_vdso_info = i;
-
for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
__pa(i) + (idx*PAGE_SIZE),
PAGE_KERNEL_VVAR);
}
-
- register_task_migration_notifier(&pvclock_migrate);
-
return 0;
}
#endif
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 3e581865c8e2..1ea14fd53933 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -593,24 +593,10 @@ badframe:
return 0;
}
-/*
- * OK, we're invoking a handler:
- */
-static int signr_convert(int sig)
-{
-#ifdef CONFIG_X86_32
- struct thread_info *info = current_thread_info();
-
- if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
- return info->exec_domain->signal_invmap[sig];
-#endif /* CONFIG_X86_32 */
- return sig;
-}
-
static int
setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
{
- int usig = signr_convert(ksig->sig);
+ int usig = ksig->sig;
sigset_t *set = sigmask_to_save();
compat_sigset_t *cset = (compat_sigset_t *) set;
@@ -630,7 +616,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
static void
handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
- bool failed;
+ bool stepping, failed;
+
/* Are we from a system call? */
if (syscall_get_nr(current, regs) >= 0) {
/* If so, check system call restarting.. */
@@ -654,12 +641,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
}
/*
- * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
- * flag so that register information in the sigcontext is correct.
+ * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
+ * so that register information in the sigcontext is correct and
+ * then notify the tracer before entering the signal handler.
*/
- if (unlikely(regs->flags & X86_EFLAGS_TF) &&
- likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
- regs->flags &= ~X86_EFLAGS_TF;
+ stepping = test_thread_flag(TIF_SINGLESTEP);
+ if (stepping)
+ user_disable_single_step(current);
failed = (setup_rt_frame(ksig, regs) < 0);
if (!failed) {
@@ -670,10 +658,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* it might disable possible debug exception from the
* signal handler.
*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
+ * Clear TF for the case when it wasn't set by debugger to
+ * avoid the recursive send_sigtrap() in SIGTRAP handler.
*/
regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
/*
@@ -682,7 +668,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
if (used_math())
fpu_reset_state(current);
}
- signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
+ signal_setup_done(failed, ksig, stepping);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7035f6b21c3f..50e547eac8cd 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
#include <asm/realmode.h>
#include <asm/misc.h>
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
lock_vector_lock();
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
- per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ cpu_set_state_online(smp_processor_id());
x86_platform.nmi_init();
/* enable local interrupts */
@@ -954,7 +951,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
*/
mtrr_save_state();
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /* x86 CPUs take themselves offline, so delayed offline is OK. */
+ err = cpu_check_up_prepare(cpu);
+ if (err && err != -EBUSY)
+ return err;
/* the FPU context is blank, nobody can own it */
__cpu_disable_lazy_restore(cpu);
@@ -1197,7 +1197,7 @@ void __init native_smp_prepare_boot_cpu(void)
switch_to_new_gdt(me);
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
- per_cpu(cpu_state, me) = CPU_ONLINE;
+ cpu_set_state_online(me);
}
void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1324,14 +1324,10 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
}
-static DEFINE_PER_CPU(struct completion, die_complete);
-
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
- init_completion(&per_cpu(die_complete, smp_processor_id()));
-
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
@@ -1355,24 +1351,27 @@ int native_cpu_disable(void)
return 0;
}
-void cpu_die_common(unsigned int cpu)
+int common_cpu_die(unsigned int cpu)
{
- wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
+ int ret = 0;
-void native_cpu_die(unsigned int cpu)
-{
/* We don't do anything here: idle task is faking death itself. */
- cpu_die_common(cpu);
-
/* They ack this in play_dead() by setting CPU_DEAD */
- if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ if (cpu_wait_death(cpu, 5)) {
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
} else {
pr_err("CPU %u didn't die...\n", cpu);
+ ret = -1;
}
+
+ return ret;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+ common_cpu_die(cpu);
}
void play_dead_common(void)
@@ -1381,10 +1380,8 @@ void play_dead_common(void)
reset_lazy_tlbstate();
amd_e400_remove_cpu(raw_smp_processor_id());
- mb();
/* Ack it */
- __this_cpu_write(cpu_state, CPU_DEAD);
- complete(&per_cpu(die_complete, smp_processor_id()));
+ (void)cpu_report_death();
/*
* With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133abda48..5ecbfe5099da 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -57,7 +57,7 @@ int rodata_test(void)
/* test 3: check the value hasn't changed */
/* If this test fails, we managed to overwrite the data */
if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n");
+ printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
return -ENODEV;
}
/* test 4: check if the rodata section is 4Kb aligned */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f4fa991406cd..324ab5247687 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
* but we need to notify RCU.
*/
rcu_nmi_enter();
- prev_state = IN_KERNEL; /* the value is irrelevant. */
+ prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */
}
/*
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
index 6eb5c20ee373..d090ecf08809 100644
--- a/arch/x86/kvm/assigned-dev.c
+++ b/arch/x86/kvm/assigned-dev.c
@@ -666,7 +666,7 @@ static int probe_sysfs_permissions(struct pci_dev *dev)
if (r)
return r;
- inode = path.dentry->d_inode;
+ inode = d_backing_inode(path.dentry);
r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
path_put(&path);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d67206a7b99a..629af0f1c5c4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
unsigned long bitmap = 1;
struct kvm_lapic **dst;
int i;
- bool ret = false;
- bool x2apic_ipi = src && apic_x2apic_mode(src);
+ bool ret, x2apic_ipi;
*r = -1;
@@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
if (irq->shorthand)
return false;
+ x2apic_ipi = src && apic_x2apic_mode(src);
if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
return false;
+ ret = true;
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
- if (!map)
+ if (!map) {
+ ret = false;
goto out;
-
- ret = true;
+ }
if (irq->dest_mode == APIC_DEST_PHYSICAL) {
if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 146f295ee322..d43867c33bc4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4481,9 +4481,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
pfn = spte_to_pfn(*sptep);
/*
- * Only EPT supported for now; otherwise, one would need to
- * find out efficiently whether the guest page tables are
- * also using huge pages.
+ * We cannot do huge page mapping for indirect shadow pages,
+ * which are found on the last rmap (level = 1) when not using
+ * tdp; such shadow pages are synced with the page table in
+ * the guest, and the guest page table is using 4K page size
+ * mapping if the indirect sp has level = 1.
*/
if (sp->role.direct &&
!kvm_is_reserved_pfn(pfn) &&
@@ -4504,19 +4506,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
bool flush = false;
unsigned long *rmapp;
unsigned long last_index, index;
- gfn_t gfn_start, gfn_end;
spin_lock(&kvm->mmu_lock);
- gfn_start = memslot->base_gfn;
- gfn_end = memslot->base_gfn + memslot->npages - 1;
-
- if (gfn_start >= gfn_end)
- goto out;
-
rmapp = memslot->arch.rmap[0];
- last_index = gfn_to_index(gfn_end, memslot->base_gfn,
- PT_PAGE_TABLE_LEVEL);
+ last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
+ memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
for (index = 0; index <= last_index; ++index, ++rmapp) {
if (*rmapp)
@@ -4534,7 +4529,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
if (flush)
kvm_flush_remote_tlbs(kvm);
-out:
spin_unlock(&kvm->mmu_lock);
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5e8dce8046c..f7b61687bd79 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
- KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+ /*
+ * Pass through host's Machine Check Enable value to hw_cr4, which
+ * is in force while we are in guest mode. Do not let guests control
+ * this bit, even if host CR4.MCE == 0.
+ */
+ unsigned long hw_cr4 =
+ (cr4_read_shadow() & X86_CR4_MCE) |
+ (cr4 & ~X86_CR4_MCE) |
+ (to_vmx(vcpu)->rmode.vm86_active ?
+ KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
if (cr4 & X86_CR4_VMXE) {
/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1a81267f3f6..c73efcd03e29 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
&guest_hv_clock, sizeof(guest_hv_clock))))
return 0;
- /*
- * The interface expects us to write an even number signaling that the
- * update is finished. Since the guest won't see the intermediate
- * state, we just increase by 2 at the end.
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
*/
- vcpu->hv_clock.version = guest_hv_clock.version + 2;
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
return 0;
}
@@ -5799,7 +5822,6 @@ int kvm_arch_init(void *opaque)
kvm_set_mmio_spte_mask();
kvm_x86_ops = ops;
- kvm_init_msr_list();
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -7253,7 +7275,14 @@ void kvm_arch_hardware_disable(void)
int kvm_arch_hardware_setup(void)
{
- return kvm_x86_ops->hardware_setup();
+ int r;
+
+ r = kvm_x86_ops->hardware_setup();
+ if (r != 0)
+ return r;
+
+ kvm_init_msr_list();
+ return 0;
}
void kvm_arch_hardware_unsetup(void)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 717908b16037..8f9a133cc099 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,8 +87,7 @@
struct lguest_data lguest_data = {
.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
- .noirq_start = (u32)lguest_noirq_start,
- .noirq_end = (u32)lguest_noirq_end,
+ .noirq_iret = (u32)lguest_noirq_iret,
.kernel_address = PAGE_OFFSET,
.blocked_interrupts = { 1 }, /* Block timer interrupts */
.syscall_vec = SYSCALL_VECTOR,
@@ -262,7 +261,7 @@ PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl);
PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable);
/*:*/
-/* These are in i386_head.S */
+/* These are in head_32.S */
extern void lg_irq_enable(void);
extern void lg_restore_fl(unsigned long flags);
@@ -1368,7 +1367,7 @@ static void lguest_restart(char *reason)
* fit comfortably.
*
* First we need assembly templates of each of the patchable Guest operations,
- * and these are in i386_head.S.
+ * and these are in head_32.S.
*/
/*G:060 We construct a table from the assembler templates: */
diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S
index 6ddfe4fc23c3..d5ae63f5ec5d 100644
--- a/arch/x86/lguest/head_32.S
+++ b/arch/x86/lguest/head_32.S
@@ -84,7 +84,7 @@ ENTRY(lg_irq_enable)
* set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
* jump to send_interrupts, otherwise we're done.
*/
- testl $0, lguest_data+LGUEST_DATA_irq_pending
+ cmpl $0, lguest_data+LGUEST_DATA_irq_pending
jnz send_interrupts
/*
* One cool thing about x86 is that you can do many things without using
@@ -133,9 +133,8 @@ ENTRY(lg_restore_fl)
ret
/*:*/
-/* These demark the EIP range where host should never deliver interrupts. */
-.global lguest_noirq_start
-.global lguest_noirq_end
+/* These demark the EIP where host should never deliver interrupts. */
+.global lguest_noirq_iret
/*M:004
* When the Host reflects a trap or injects an interrupt into the Guest, it
@@ -168,29 +167,26 @@ ENTRY(lg_restore_fl)
* So we have to copy eflags from the stack to lguest_data.irq_enabled before
* we do the "iret".
*
- * There are two problems with this: firstly, we need to use a register to do
- * the copy and secondly, the whole thing needs to be atomic. The first
- * problem is easy to solve: push %eax on the stack so we can use it, and then
- * restore it at the end just before the real "iret".
+ * There are two problems with this: firstly, we can't clobber any registers
+ * and secondly, the whole thing needs to be atomic. The first problem
+ * is solved by using "push memory"/"pop memory" instruction pair for copying.
*
* The second is harder: copying eflags to lguest_data.irq_enabled will turn
* interrupts on before we're finished, so we could be interrupted before we
- * return to userspace or wherever. Our solution to this is to surround the
- * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the
+ * return to userspace or wherever. Our solution to this is to tell the
* Host that it is *never* to interrupt us there, even if interrupts seem to be
- * enabled.
+ * enabled. (It's not necessary to protect pop instruction, since
+ * data gets updated only after it completes, so we only need to protect
+ * one instruction, iret).
*/
ENTRY(lguest_iret)
- pushl %eax
- movl 12(%esp), %eax
-lguest_noirq_start:
+ pushl 2*4(%esp)
/*
* Note the %ss: segment prefix here. Normal data accesses use the
* "ds" segment, but that will have already been restored for whatever
* we're returning to (such as userspace): we can't trust it. The %ss:
* prefix makes sure we use the stack segment, which is still valid.
*/
- movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled
- popl %eax
+ popl %ss:lguest_data+LGUEST_DATA_irq_enabled
+lguest_noirq_iret:
iret
-lguest_noirq_end:
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 1f33b3d1fd68..0a42327a59d7 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -82,7 +82,7 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
clac();
/* If the destination is a kernel buffer, we always clear the end */
- if ((unsigned long)to >= TASK_SIZE_MAX)
+ if (!__addr_ok(to))
memset(to, 0, len);
return len;
}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c4cc74006c61..a482d105172b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o
obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
-obj-$(CONFIG_MEMTEST) += memtest.o
-
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..70e7444c6835 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
/*
* Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
+ * address space. It transparently creates kernel huge I/O mapping when
+ * the physical address is aligned by a huge page size (1GB or 2MB) and
+ * the requested size is at least the huge page size.
+ *
+ * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB
+ * when a mapping range is covered by non-WB type of MTRRs.
*
* NOTE! We need to allow non-page-aligned mappings too: we will obviously
* have to convert them into an offset in a page-aligned mapping, but the
@@ -326,24 +331,40 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
+int arch_ioremap_pud_supported(void)
+{
+#ifdef CONFIG_X86_64
+ return cpu_has_gbpages;
+#else
+ return 0;
+#endif
+}
+
+int arch_ioremap_pmd_supported(void)
+{
+ return cpu_has_pse;
+}
+
/*
* Convert a physical pointer to a virtual kernel pointer for /dev/mem
* access
*/
void *xlate_dev_mem_ptr(phys_addr_t phys)
{
- void *addr;
- unsigned long start = phys & PAGE_MASK;
+ unsigned long start = phys & PAGE_MASK;
+ unsigned long offset = phys & ~PAGE_MASK;
+ unsigned long vaddr;
/* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
if (page_is_ram(start >> PAGE_SHIFT))
return __va(phys);
- addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
- if (addr)
- addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+ vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE);
+ /* Only add the offset on success and return NULL if the ioremap() failed: */
+ if (vaddr)
+ vaddr += offset;
- return addr;
+ return (void *)vaddr;
}
void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
deleted file mode 100644
index 1e9da795767a..000000000000
--- a/arch/x86/mm/memtest.c
+++ /dev/null
@@ -1,118 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/pfn.h>
-#include <linux/memblock.h>
-
-static u64 patterns[] __initdata = {
- /* The first entry has to be 0 to leave memtest with zeroed memory */
- 0,
- 0xffffffffffffffffULL,
- 0x5555555555555555ULL,
- 0xaaaaaaaaaaaaaaaaULL,
- 0x1111111111111111ULL,
- 0x2222222222222222ULL,
- 0x4444444444444444ULL,
- 0x8888888888888888ULL,
- 0x3333333333333333ULL,
- 0x6666666666666666ULL,
- 0x9999999999999999ULL,
- 0xccccccccccccccccULL,
- 0x7777777777777777ULL,
- 0xbbbbbbbbbbbbbbbbULL,
- 0xddddddddddddddddULL,
- 0xeeeeeeeeeeeeeeeeULL,
- 0x7a6c7258554e494cULL, /* yeah ;-) */
-};
-
-static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
-{
- printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
- (unsigned long long) pattern,
- (unsigned long long) start_bad,
- (unsigned long long) end_bad);
- memblock_reserve(start_bad, end_bad - start_bad);
-}
-
-static void __init memtest(u64 pattern, u64 start_phys, u64 size)
-{
- u64 *p, *start, *end;
- u64 start_bad, last_bad;
- u64 start_phys_aligned;
- const size_t incr = sizeof(pattern);
-
- start_phys_aligned = ALIGN(start_phys, incr);
- start = __va(start_phys_aligned);
- end = start + (size - (start_phys_aligned - start_phys)) / incr;
- start_bad = 0;
- last_bad = 0;
-
- for (p = start; p < end; p++)
- *p = pattern;
-
- for (p = start; p < end; p++, start_phys_aligned += incr) {
- if (*p == pattern)
- continue;
- if (start_phys_aligned == last_bad + incr) {
- last_bad += incr;
- continue;
- }
- if (start_bad)
- reserve_bad_mem(pattern, start_bad, last_bad + incr);
- start_bad = last_bad = start_phys_aligned;
- }
- if (start_bad)
- reserve_bad_mem(pattern, start_bad, last_bad + incr);
-}
-
-static void __init do_one_pass(u64 pattern, u64 start, u64 end)
-{
- u64 i;
- phys_addr_t this_start, this_end;
-
- for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
- this_start = clamp_t(phys_addr_t, this_start, start, end);
- this_end = clamp_t(phys_addr_t, this_end, start, end);
- if (this_start < this_end) {
- printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
- (unsigned long long)this_start,
- (unsigned long long)this_end,
- (unsigned long long)cpu_to_be64(pattern));
- memtest(pattern, this_start, this_end - this_start);
- }
- }
-}
-
-/* default is disabled */
-static int memtest_pattern __initdata;
-
-static int __init parse_memtest(char *arg)
-{
- if (arg)
- memtest_pattern = simple_strtoul(arg, NULL, 0);
- else
- memtest_pattern = ARRAY_SIZE(patterns);
-
- return 0;
-}
-
-early_param("memtest", parse_memtest);
-
-void __init early_memtest(unsigned long start, unsigned long end)
-{
- unsigned int i;
- unsigned int idx = 0;
-
- if (!memtest_pattern)
- return;
-
- printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
- for (i = memtest_pattern-1; i < UINT_MAX; --i) {
- idx = i % ARRAY_SIZE(patterns);
- do_one_pass(patterns[idx], start, end);
- }
-}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..9d518d693b4b 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -65,24 +65,23 @@ static int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
{
- unsigned long rnd = 0;
+ unsigned long rnd;
/*
- * 8 bits of randomness in 32bit mmaps, 20 address space bits
- * 28 bits of randomness in 64bit mmaps, 40 address space bits
- */
- if (current->flags & PF_RANDOMIZE) {
- if (mmap_is_ia32())
- rnd = get_random_int() % (1<<8);
- else
- rnd = get_random_int() % (1<<28);
- }
+ * 8 bits of randomness in 32bit mmaps, 20 address space bits
+ * 28 bits of randomness in 64bit mmaps, 40 address space bits
+ */
+ if (mmap_is_ia32())
+ rnd = (unsigned long)get_random_int() % (1<<8);
+ else
+ rnd = (unsigned long)get_random_int() % (1<<28);
+
return rnd << PAGE_SHIFT;
}
-static unsigned long mmap_base(void)
+static unsigned long mmap_base(unsigned long rnd)
{
unsigned long gap = rlimit(RLIMIT_STACK);
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void)
else if (gap > MAX_GAP)
gap = MAX_GAP;
- return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+ return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}
/*
* Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
* does, but not when emulating X86_32
*/
-static unsigned long mmap_legacy_base(void)
+static unsigned long mmap_legacy_base(unsigned long rnd)
{
if (mmap_is_ia32())
return TASK_UNMAPPED_BASE;
else
- return TASK_UNMAPPED_BASE + mmap_rnd();
+ return TASK_UNMAPPED_BASE + rnd;
}
/*
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void)
*/
void arch_pick_mmap_layout(struct mm_struct *mm)
{
- mm->mmap_legacy_base = mmap_legacy_base();
- mm->mmap_base = mmap_base();
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ mm->mmap_legacy_base = mmap_legacy_base(random_factor);
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
+ mm->mmap_base = mmap_base(random_factor);
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5a7e5252c878..0b97d2c75df3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <asm/fixmap.h>
+#include <asm/mtrr.h>
#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
@@ -58,7 +59,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
tlb_remove_page(tlb, pte);
}
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
struct page *page = virt_to_page(pmd);
@@ -74,14 +75,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb_remove_page(tlb, page);
}
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
-#endif /* PAGETABLE_LEVELS > 3 */
-#endif /* PAGETABLE_LEVELS > 2 */
+#endif /* CONFIG_PGTABLE_LEVELS > 3 */
+#endif /* CONFIG_PGTABLE_LEVELS > 2 */
static inline void pgd_list_add(pgd_t *pgd)
{
@@ -117,9 +118,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
/* If the pgd points to a shared pagetable level (either the
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
- if (PAGETABLE_LEVELS == 2 ||
- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
- PAGETABLE_LEVELS == 4) {
+ if (CONFIG_PGTABLE_LEVELS == 2 ||
+ (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ CONFIG_PGTABLE_LEVELS == 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -560,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
{
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr;
+
+ /*
+ * Do not use a huge page when the range is covered by non-WB type
+ * of MTRRs.
+ */
+ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
+ if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pud, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+ u8 mtrr;
+
+ /*
+ * Do not use a huge page when the range is covered by non-WB type
+ * of MTRRs.
+ */
+ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
+ if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ return 0;
+
+ prot = pgprot_4k_2_large(prot);
+
+ set_pte((pte_t *)pmd, pfn_pte(
+ (u64)addr >> PAGE_SHIFT,
+ __pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+ return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+ if (pud_large(*pud)) {
+ pud_clear(pud);
+ return 1;
+ }
+
+ return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+ if (pmd_large(*pmd)) {
+ pmd_clear(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index e4695985f9de..d93963340c3c 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -325,6 +325,26 @@ static void release_pci_root_info(struct pci_host_bridge *bridge)
kfree(info);
}
+/*
+ * An IO port or MMIO resource assigned to a PCI host bridge may be
+ * consumed by the host bridge itself or available to its child
+ * bus/devices. The ACPI specification defines a bit (Producer/Consumer)
+ * to tell whether the resource is consumed by the host bridge itself,
+ * but firmware hasn't used that bit consistently, so we can't rely on it.
+ *
+ * On x86 and IA64 platforms, all IO port and MMIO resources are assumed
+ * to be available to child bus/devices except one special case:
+ * IO port [0xCF8-0xCFF] is consumed by the host bridge itself
+ * to access PCI configuration space.
+ *
+ * So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF].
+ */
+static bool resource_is_pcicfg_ioport(struct resource *res)
+{
+ return (res->flags & IORESOURCE_IO) &&
+ res->start == 0xCF8 && res->end == 0xCFF;
+}
+
static void probe_pci_root_info(struct pci_root_info *info,
struct acpi_device *device,
int busnum, int domain,
@@ -346,8 +366,8 @@ static void probe_pci_root_info(struct pci_root_info *info,
"no IO and memory resources present in _CRS\n");
else
resource_list_for_each_entry_safe(entry, tmp, list) {
- if ((entry->res->flags & IORESOURCE_WINDOW) == 0 ||
- (entry->res->flags & IORESOURCE_DISABLED))
+ if ((entry->res->flags & IORESOURCE_DISABLED) ||
+ resource_is_pcicfg_ioport(entry->res))
resource_list_destroy_entry(entry);
else
entry->res->name = info->name;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 2fb384724ebb..8fd6f44aee83 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum)
if (!bus) {
pci_free_resource_list(&resources);
kfree(sd);
+ return;
}
+ pci_bus_add_devices(bus);
}
void __init pcibios_set_cache_line_size(void)
diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile
index 0a8ee703b9fa..0ce1b1913673 100644
--- a/arch/x86/platform/intel-mid/Makefile
+++ b/arch/x86/platform/intel-mid/Makefile
@@ -1,5 +1,4 @@
obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o
-obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o
# SFI specific code
ifdef CONFIG_X86_INTEL_MID
diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c
deleted file mode 100644
index 4e720829ab90..000000000000
--- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * early_printk_intel_mid.c - early consoles for Intel MID platforms
- *
- * Copyright (c) 2008-2010, Intel Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
- */
-
-/*
- * This file implements early console named hsu.
- * hsu is based on a High Speed UART device which only exists in the Medfield
- * platform
- */
-
-#include <linux/serial_reg.h>
-#include <linux/serial_mfd.h>
-#include <linux/console.h>
-#include <linux/kernel.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-
-#include <asm/fixmap.h>
-#include <asm/pgtable.h>
-#include <asm/intel-mid.h>
-
-/*
- * Following is the early console based on Medfield HSU (High
- * Speed UART) device.
- */
-#define HSU_PORT_BASE 0xffa28080
-
-static void __iomem *phsu;
-
-void hsu_early_console_init(const char *s)
-{
- unsigned long paddr, port = 0;
- u8 lcr;
-
- /*
- * Select the early HSU console port if specified by user in the
- * kernel command line.
- */
- if (*s && !kstrtoul(s, 10, &port))
- port = clamp_val(port, 0, 2);
-
- paddr = HSU_PORT_BASE + port * 0x80;
- phsu = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr);
-
- /* Disable FIFO */
- writeb(0x0, phsu + UART_FCR);
-
- /* Set to default 115200 bps, 8n1 */
- lcr = readb(phsu + UART_LCR);
- writeb((0x80 | lcr), phsu + UART_LCR);
- writeb(0x18, phsu + UART_DLL);
- writeb(lcr, phsu + UART_LCR);
- writel(0x3600, phsu + UART_MUL*4);
-
- writeb(0x8, phsu + UART_MCR);
- writeb(0x7, phsu + UART_FCR);
- writeb(0x3, phsu + UART_LCR);
-
- /* Clear IRQ status */
- readb(phsu + UART_LSR);
- readb(phsu + UART_RX);
- readb(phsu + UART_IIR);
- readb(phsu + UART_MSR);
-
- /* Enable FIFO */
- writeb(0x7, phsu + UART_FCR);
-}
-
-#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
-
-static void early_hsu_putc(char ch)
-{
- unsigned int timeout = 10000; /* 10ms */
- u8 status;
-
- while (--timeout) {
- status = readb(phsu + UART_LSR);
- if (status & BOTH_EMPTY)
- break;
- udelay(1);
- }
-
- /* Only write the char when there was no timeout */
- if (timeout)
- writeb(ch, phsu + UART_TX);
-}
-
-static void early_hsu_write(struct console *con, const char *str, unsigned n)
-{
- int i;
-
- for (i = 0; i < n && *str; i++) {
- if (*str == '\n')
- early_hsu_putc('\r');
- early_hsu_putc(*str);
- str++;
- }
-}
-
-struct console early_hsu_console = {
- .name = "earlyhsu",
- .write = early_hsu_write,
- .flags = CON_PRINTBUFFER,
- .index = -1,
-};
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 9a2e590dd202..7fa8b3b53bc0 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -61,7 +61,7 @@ static void battery_status_changed(void)
if (psy) {
power_supply_changed(psy);
- put_device(psy->dev);
+ power_supply_put(psy);
}
}
@@ -71,7 +71,7 @@ static void ac_status_changed(void)
if (psy) {
power_supply_changed(psy);
- put_device(psy->dev);
+ power_supply_put(psy);
}
}
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 08e350e757dc..55130846ac87 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -83,7 +83,7 @@ static void battery_status_changed(void)
if (psy) {
power_supply_changed(psy);
- put_device(psy->dev);
+ power_supply_put(psy);
}
}
@@ -93,7 +93,7 @@ static void ac_status_changed(void)
if (psy) {
power_supply_changed(psy);
- put_device(psy->dev);
+ power_supply_put(psy);
}
}
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
index 3323c2745248..a55abb9f6c5e 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/syscalls/Makefile
@@ -19,6 +19,9 @@ quiet_cmd_syshdr = SYSHDR $@
quiet_cmd_systbl = SYSTBL $@
cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
+quiet_cmd_hypercalls = HYPERCALLS $@
+ cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^)
+
syshdr_abi_unistd_32 := i386
$(uapi)/unistd_32.h: $(syscall32) $(syshdr)
$(call if_changed,syshdr)
@@ -47,10 +50,16 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl)
$(out)/syscalls_64.h: $(syscall64) $(systbl)
$(call if_changed,systbl)
+$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh
+ $(call if_changed,hypercalls)
+
+$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
+
uapisyshdr-y += unistd_32.h unistd_64.h unistd_x32.h
syshdr-y += syscalls_32.h
syshdr-$(CONFIG_X86_64) += unistd_32_ia32.h unistd_64_x32.h
syshdr-$(CONFIG_X86_64) += syscalls_64.h
+syshdr-$(CONFIG_XEN) += xen-hypercalls.h
targets += $(uapisyshdr-y) $(syshdr-y)
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index eafa324eb7a5..acb384d24669 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_BINFMT_ELF) += elfcore.o
subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
-subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o
else
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 8ffd2146fa6a..7e8a1a650435 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -36,22 +36,11 @@
#endif /* CONFIG_X86_PPRO_FENCE */
#define dma_wmb() barrier()
-#ifdef CONFIG_SMP
-
-#define smp_mb() mb()
-#define smp_rmb() dma_rmb()
-#define smp_wmb() barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-
-#else /* CONFIG_SMP */
-
#define smp_mb() barrier()
#define smp_rmb() barrier()
#define smp_wmb() barrier()
#define set_mb(var, value) do { var = value; barrier(); } while (0)
-#endif /* CONFIG_SMP */
-
#define read_barrier_depends() do { } while (0)
#define smp_read_barrier_depends() do { } while (0)
diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 25a1022dd793..0a656b727b1a 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -210,7 +210,7 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
#define ELF_EXEC_PAGESIZE 4096
-#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3)
+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
extern long elf_aux_hwcap;
#define ELF_HWCAP (elf_aux_hwcap)
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 8e08176f0bcb..5c0b711d2433 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -8,9 +8,7 @@
#include <linux/slab.h>
#include <asm/unistd.h>
#include <os.h>
-#include <proc_mm.h>
#include <skas.h>
-#include <skas_ptrace.h>
#include <sysdep/tls.h>
extern int modify_ldt(int func, void *ptr, unsigned long bytecount);
@@ -19,105 +17,20 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func,
struct user_desc *desc, void **addr, int done)
{
long res;
-
- if (proc_mm) {
- /*
- * This is a special handling for the case, that the mm to
- * modify isn't current->active_mm.
- * If this is called directly by modify_ldt,
- * (current->active_mm->context.skas.u == mm_idp)
- * will be true. So no call to __switch_mm(mm_idp) is done.
- * If this is called in case of init_new_ldt or PTRACE_LDT,
- * mm_idp won't belong to current->active_mm, but child->mm.
- * So we need to switch child's mm into our userspace, then
- * later switch back.
- *
- * Note: I'm unsure: should interrupts be disabled here?
- */
- if (!current->active_mm || current->active_mm == &init_mm ||
- mm_idp != &current->active_mm->context.id)
- __switch_mm(mm_idp);
- }
-
- if (ptrace_ldt) {
- struct ptrace_ldt ldt_op = (struct ptrace_ldt) {
- .func = func,
- .ptr = desc,
- .bytecount = sizeof(*desc)};
- u32 cpu;
- int pid;
-
- if (!proc_mm)
- pid = mm_idp->u.pid;
- else {
- cpu = get_cpu();
- pid = userspace_pid[cpu];
- }
-
- res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op);
-
- if (proc_mm)
- put_cpu();
- }
- else {
- void *stub_addr;
- res = syscall_stub_data(mm_idp, (unsigned long *)desc,
- (sizeof(*desc) + sizeof(long) - 1) &
- ~(sizeof(long) - 1),
- addr, &stub_addr);
- if (!res) {
- unsigned long args[] = { func,
- (unsigned long)stub_addr,
- sizeof(*desc),
- 0, 0, 0 };
- res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
- 0, addr, done);
- }
+ void *stub_addr;
+ res = syscall_stub_data(mm_idp, (unsigned long *)desc,
+ (sizeof(*desc) + sizeof(long) - 1) &
+ ~(sizeof(long) - 1),
+ addr, &stub_addr);
+ if (!res) {
+ unsigned long args[] = { func,
+ (unsigned long)stub_addr,
+ sizeof(*desc),
+ 0, 0, 0 };
+ res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
+ 0, addr, done);
}
- if (proc_mm) {
- /*
- * This is the second part of special handling, that makes
- * PTRACE_LDT possible to implement.
- */
- if (current->active_mm && current->active_mm != &init_mm &&
- mm_idp != &current->active_mm->context.id)
- __switch_mm(&current->active_mm->context.id);
- }
-
- return res;
-}
-
-static long read_ldt_from_host(void __user * ptr, unsigned long bytecount)
-{
- int res, n;
- struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) {
- .func = 0,
- .bytecount = bytecount,
- .ptr = kmalloc(bytecount, GFP_KERNEL)};
- u32 cpu;
-
- if (ptrace_ldt.ptr == NULL)
- return -ENOMEM;
-
- /*
- * This is called from sys_modify_ldt only, so userspace_pid gives
- * us the right number
- */
-
- cpu = get_cpu();
- res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt);
- put_cpu();
- if (res < 0)
- goto out;
-
- n = copy_to_user(ptr, ptrace_ldt.ptr, res);
- if (n != 0)
- res = -EFAULT;
-
- out:
- kfree(ptrace_ldt.ptr);
-
return res;
}
@@ -145,9 +58,6 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
err = bytecount;
- if (ptrace_ldt)
- return read_ldt_from_host(ptr, bytecount);
-
mutex_lock(&ldt->lock);
if (ldt->entry_count <= LDT_DIRECT_ENTRIES) {
size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES;
@@ -229,17 +139,11 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
goto out;
}
- if (!ptrace_ldt)
- mutex_lock(&ldt->lock);
+ mutex_lock(&ldt->lock);
err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1);
if (err)
goto out_unlock;
- else if (ptrace_ldt) {
- /* With PTRACE_LDT available, this is used as a flag only */
- ldt->entry_count = 1;
- goto out;
- }
if (ldt_info.entry_number >= ldt->entry_count &&
ldt_info.entry_number >= LDT_DIRECT_ENTRIES) {
@@ -393,91 +297,56 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
int i;
long page, err=0;
void *addr = NULL;
- struct proc_mm_op copy;
- if (!ptrace_ldt)
- mutex_init(&new_mm->arch.ldt.lock);
+ mutex_init(&new_mm->arch.ldt.lock);
if (!from_mm) {
memset(&desc, 0, sizeof(desc));
/*
- * We have to initialize a clean ldt.
+ * Now we try to retrieve info about the ldt, we
+ * inherited from the host. All ldt-entries found
+ * will be reset in the following loop
*/
- if (proc_mm) {
- /*
- * If the new mm was created using proc_mm, host's
- * default-ldt currently is assigned, which normally
- * contains the call-gates for lcall7 and lcall27.
- * To remove these gates, we simply write an empty
- * entry as number 0 to the host.
- */
- err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1);
- }
- else{
- /*
- * Now we try to retrieve info about the ldt, we
- * inherited from the host. All ldt-entries found
- * will be reset in the following loop
- */
- ldt_get_host_info();
- for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
- desc.entry_number = *num_p;
- err = write_ldt_entry(&new_mm->id, 1, &desc,
- &addr, *(num_p + 1) == -1);
- if (err)
- break;
- }
+ ldt_get_host_info();
+ for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
+ desc.entry_number = *num_p;
+ err = write_ldt_entry(&new_mm->id, 1, &desc,
+ &addr, *(num_p + 1) == -1);
+ if (err)
+ break;
}
new_mm->arch.ldt.entry_count = 0;
goto out;
}
- if (proc_mm) {
- /*
- * We have a valid from_mm, so we now have to copy the LDT of
- * from_mm to new_mm, because using proc_mm an new mm with
- * an empty/default LDT was created in new_mm()
- */
- copy = ((struct proc_mm_op) { .op = MM_COPY_SEGMENTS,
- .u =
- { .copy_segments =
- from_mm->id.u.mm_fd } } );
- i = os_write_file(new_mm->id.u.mm_fd, &copy, sizeof(copy));
- if (i != sizeof(copy))
- printk(KERN_ERR "new_mm : /proc/mm copy_segments "
- "failed, err = %d\n", -i);
- }
-
- if (!ptrace_ldt) {
- /*
- * Our local LDT is used to supply the data for
- * modify_ldt(READLDT), if PTRACE_LDT isn't available,
- * i.e., we have to use the stub for modify_ldt, which
- * can't handle the big read buffer of up to 64kB.
- */
- mutex_lock(&from_mm->arch.ldt.lock);
- if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
- memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
- sizeof(new_mm->arch.ldt.u.entries));
- else {
- i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
- while (i-->0) {
- page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
- if (!page) {
- err = -ENOMEM;
- break;
- }
- new_mm->arch.ldt.u.pages[i] =
- (struct ldt_entry *) page;
- memcpy(new_mm->arch.ldt.u.pages[i],
- from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
+ /*
+ * Our local LDT is used to supply the data for
+ * modify_ldt(READLDT), if PTRACE_LDT isn't available,
+ * i.e., we have to use the stub for modify_ldt, which
+ * can't handle the big read buffer of up to 64kB.
+ */
+ mutex_lock(&from_mm->arch.ldt.lock);
+ if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
+ memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
+ sizeof(new_mm->arch.ldt.u.entries));
+ else {
+ i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
+ while (i-->0) {
+ page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+ if (!page) {
+ err = -ENOMEM;
+ break;
}
+ new_mm->arch.ldt.u.pages[i] =
+ (struct ldt_entry *) page;
+ memcpy(new_mm->arch.ldt.u.pages[i],
+ from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
}
- new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
- mutex_unlock(&from_mm->arch.ldt.lock);
}
+ new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
+ mutex_unlock(&from_mm->arch.ldt.lock);
out:
return err;
@@ -488,7 +357,7 @@ void free_ldt(struct mm_context *mm)
{
int i;
- if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
+ if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
while (i-- > 0)
free_page((long) mm->arch.ldt.u.pages[i]);
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
index a26086b8a800..b6f2437ec29c 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_32.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -27,9 +27,6 @@ struct faultinfo {
/* This is Page Fault */
#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
-/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */
-#define SEGV_MAYBE_FIXABLE(fi) ((fi)->trap_no == 0 && ptrace_faultinfo)
-
#define PTRACE_FULL_FAULTINFO 0
#endif
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h
index f811cbe15d62..ee88f88974ea 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_64.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_64.h
@@ -27,9 +27,6 @@ struct faultinfo {
/* This is Page Fault */
#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
-/* No broken SKAS API, which doesn't pass trap_no, here. */
-#define SEGV_MAYBE_FIXABLE(fi) 0
-
#define PTRACE_FULL_FAULTINFO 1
#endif
diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h
deleted file mode 100644
index 453febe98993..000000000000
--- a/arch/x86/um/shared/sysdep/skas_ptrace.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
- * Licensed under the GPL
- */
-
-#ifndef __SYSDEP_X86_SKAS_PTRACE_H
-#define __SYSDEP_X86_SKAS_PTRACE_H
-
-struct ptrace_faultinfo {
- int is_write;
- unsigned long addr;
-};
-
-struct ptrace_ldt {
- int func;
- void *ptr;
- unsigned long bytecount;
-};
-
-#define PTRACE_LDT 54
-
-#endif
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 0c8c32bfd792..592491d1d70d 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -549,13 +549,6 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
if (err)
return err;
- /* Set up registers for signal handler */
- {
- struct exec_domain *ed = current_thread_info()->exec_domain;
- if (unlikely(ed && ed->signal_invmap && sig < 32))
- sig = ed->signal_invmap[sig];
- }
-
PT_REGS_SP(regs) = (unsigned long) frame;
PT_REGS_DI(regs) = sig;
/* In case the signal handler was declared without prototypes */
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 40d2473836c9..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
cycle_t ret;
u64 last;
u32 version;
- u32 migrate_count;
u8 flags;
unsigned cpu, cpu1;
/*
- * When looping to get a consistent (time-info, tsc) pair, we
- * also need to deal with the possibility we can switch vcpus,
- * so make sure we always re-fetch time-info for the current vcpu.
+ * Note: hypervisor must guarantee that:
+ * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+ * 2. that per-CPU pvclock time info is updated if the
+ * underlying CPU changes.
+ * 3. that version is increased whenever underlying CPU
+ * changes.
+ *
*/
do {
cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
* __getcpu() calls (Gleb).
*/
- /* Make sure migrate_count will change if we leave the VCPU. */
- do {
- pvti = get_pvti(cpu);
- migrate_count = pvti->migrate_count;
-
- cpu1 = cpu;
- cpu = __getcpu() & VGETCPU_CPU_MASK;
- } while (unlikely(cpu != cpu1));
+ pvti = get_pvti(cpu);
version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
/*
* Test we're still on the cpu as well as the version.
- * - We must read TSC of pvti's VCPU.
- * - KVM doesn't follow the versioning protocol, so data could
- * change before version if we left the VCPU.
+ * We could have been migrated just after the first
+ * vgetcpu but before fetching the version, so we
+ * wouldn't notice a version change.
*/
- smp_rmb();
- } while (unlikely((pvti->pvti.version & 1) ||
- pvti->pvti.version != version ||
- pvti->migrate_count != migrate_count));
+ cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+ } while (unlikely(cpu != cpu1 ||
+ (pvti->pvti.version & 1) ||
+ pvti->pvti.version != version));
if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
*mode = VCLOCK_NONE;
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 7005ced5d1ad..70e060ad879a 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
#include <xen/xen.h>
#include <xen/interface/physdev.h>
#include "xen-ops.h"
+#include "smp.h"
static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
{
@@ -28,7 +29,186 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
return 0xfd;
}
+static unsigned long xen_set_apic_id(unsigned int x)
+{
+ WARN_ON(1);
+ return x;
+}
+
+static unsigned int xen_get_apic_id(unsigned long x)
+{
+ return ((x)>>24) & 0xFFu;
+}
+
+static u32 xen_apic_read(u32 reg)
+{
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.pcpu_info.xen_cpuid = 0,
+ };
+ int ret = 0;
+
+ /* Shouldn't need this as APIC is turned off for PV, and we only
+ * get called on the bootup processor. But just in case. */
+ if (!xen_initial_domain() || smp_processor_id())
+ return 0;
+
+ if (reg == APIC_LVR)
+ return 0x10;
+#ifdef CONFIG_X86_32
+ if (reg == APIC_LDR)
+ return SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+#endif
+ if (reg != APIC_ID)
+ return 0;
+
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ return 0;
+
+ return op.u.pcpu_info.apic_id << 24;
+}
+
+static void xen_apic_write(u32 reg, u32 val)
+{
+ /* Warn to see if there's any stray references */
+ WARN(1,"register: %x, value: %x\n", reg, val);
+}
+
+static u64 xen_apic_icr_read(void)
+{
+ return 0;
+}
+
+static void xen_apic_icr_write(u32 low, u32 id)
+{
+ /* Warn to see if there's any stray references */
+ WARN_ON(1);
+}
+
+static u32 xen_safe_apic_wait_icr_idle(void)
+{
+ return 0;
+}
+
+static int xen_apic_probe_pv(void)
+{
+ if (xen_pv_domain())
+ return 1;
+
+ return 0;
+}
+
+static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ return xen_pv_domain();
+}
+
+static int xen_id_always_valid(int apicid)
+{
+ return 1;
+}
+
+static int xen_id_always_registered(void)
+{
+ return 1;
+}
+
+static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+ return initial_apic_id >> index_msb;
+}
+
+#ifdef CONFIG_X86_32
+static int xen_x86_32_early_logical_apicid(int cpu)
+{
+ /* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */
+ return 1 << cpu;
+}
+#endif
+
+static void xen_noop(void)
+{
+}
+
+static void xen_silent_inquire(int apicid)
+{
+}
+
+static struct apic xen_pv_apic = {
+ .name = "Xen PV",
+ .probe = xen_apic_probe_pv,
+ .acpi_madt_oem_check = xen_madt_oem_check,
+ .apic_id_valid = xen_id_always_valid,
+ .apic_id_registered = xen_id_always_registered,
+
+ /* .irq_delivery_mode - used in native_compose_msi_msg only */
+ /* .irq_dest_mode - used in native_compose_msi_msg only */
+
+ .target_cpus = default_target_cpus,
+ .disable_esr = 0,
+ /* .dest_logical - default_send_IPI_ use it but we use our own. */
+ .check_apicid_used = default_check_apicid_used, /* Used on 32-bit */
+
+ .vector_allocation_domain = flat_vector_allocation_domain,
+ .init_apic_ldr = xen_noop, /* setup_local_APIC calls it */
+
+ .ioapic_phys_id_map = default_ioapic_phys_id_map, /* Used on 32-bit */
+ .setup_apic_routing = NULL,
+ .cpu_present_to_apicid = default_cpu_present_to_apicid,
+ .apicid_to_cpu_present = physid_set_mask_of_physid, /* Used on 32-bit */
+ .check_phys_apicid_present = default_check_phys_apicid_present, /* smp_sanity_check needs it */
+ .phys_pkg_id = xen_phys_pkg_id, /* detect_ht */
+
+ .get_apic_id = xen_get_apic_id,
+ .set_apic_id = xen_set_apic_id, /* Can be NULL on 32-bit. */
+ .apic_id_mask = 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */
+
+ .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and,
+
+#ifdef CONFIG_SMP
+ .send_IPI_mask = xen_send_IPI_mask,
+ .send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself,
+ .send_IPI_allbutself = xen_send_IPI_allbutself,
+ .send_IPI_all = xen_send_IPI_all,
+ .send_IPI_self = xen_send_IPI_self,
+#endif
+ /* .wait_for_init_deassert- used by AP bootup - smp_callin which we don't use */
+ .inquire_remote_apic = xen_silent_inquire,
+
+ .read = xen_apic_read,
+ .write = xen_apic_write,
+ .eoi_write = xen_apic_write,
+
+ .icr_read = xen_apic_icr_read,
+ .icr_write = xen_apic_icr_write,
+ .wait_icr_idle = xen_noop,
+ .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+ /* generic_processor_info and setup_local_APIC. */
+ .x86_32_early_logical_apicid = xen_x86_32_early_logical_apicid,
+#endif
+};
+
+static void __init xen_apic_check(void)
+{
+ if (apic == &xen_pv_apic)
+ return;
+
+ pr_info("Switched APIC routing from %s to %s.\n", apic->name,
+ xen_pv_apic.name);
+ apic = &xen_pv_apic;
+}
void __init xen_init_apic(void)
{
x86_io_apic_ops.read = xen_io_apic_read;
+ /* On PV guests the APIC CPUID bit is disabled so none of the
+ * routines end up executing. */
+ if (!xen_initial_domain())
+ apic = &xen_pv_apic;
+
+ x86_platform.apic_post_init = xen_apic_check;
}
+apic_driver(xen_pv_apic);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3797b6b31f95..fe969ac1c65e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -928,92 +928,6 @@ static void xen_io_delay(void)
{
}
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long xen_set_apic_id(unsigned int x)
-{
- WARN_ON(1);
- return x;
-}
-static unsigned int xen_get_apic_id(unsigned long x)
-{
- return ((x)>>24) & 0xFFu;
-}
-static u32 xen_apic_read(u32 reg)
-{
- struct xen_platform_op op = {
- .cmd = XENPF_get_cpuinfo,
- .interface_version = XENPF_INTERFACE_VERSION,
- .u.pcpu_info.xen_cpuid = 0,
- };
- int ret = 0;
-
- /* Shouldn't need this as APIC is turned off for PV, and we only
- * get called on the bootup processor. But just in case. */
- if (!xen_initial_domain() || smp_processor_id())
- return 0;
-
- if (reg == APIC_LVR)
- return 0x10;
-
- if (reg != APIC_ID)
- return 0;
-
- ret = HYPERVISOR_dom0_op(&op);
- if (ret)
- return 0;
-
- return op.u.pcpu_info.apic_id << 24;
-}
-
-static void xen_apic_write(u32 reg, u32 val)
-{
- /* Warn to see if there's any stray references */
- WARN_ON(1);
-}
-
-static u64 xen_apic_icr_read(void)
-{
- return 0;
-}
-
-static void xen_apic_icr_write(u32 low, u32 id)
-{
- /* Warn to see if there's any stray references */
- WARN_ON(1);
-}
-
-static void xen_apic_wait_icr_idle(void)
-{
- return;
-}
-
-static u32 xen_safe_apic_wait_icr_idle(void)
-{
- return 0;
-}
-
-static void set_xen_basic_apic_ops(void)
-{
- apic->read = xen_apic_read;
- apic->write = xen_apic_write;
- apic->icr_read = xen_apic_icr_read;
- apic->icr_write = xen_apic_icr_write;
- apic->wait_icr_idle = xen_apic_wait_icr_idle;
- apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
- apic->set_apic_id = xen_set_apic_id;
- apic->get_apic_id = xen_get_apic_id;
-
-#ifdef CONFIG_SMP
- apic->send_IPI_allbutself = xen_send_IPI_allbutself;
- apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
- apic->send_IPI_mask = xen_send_IPI_mask;
- apic->send_IPI_all = xen_send_IPI_all;
- apic->send_IPI_self = xen_send_IPI_self;
-#endif
-}
-
-#endif
-
static void xen_clts(void)
{
struct multicall_space mcs;
@@ -1620,7 +1534,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
/*
* set up the basic apic ops.
*/
- set_xen_basic_apic_ops();
+ xen_init_apic();
#endif
if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1733,8 +1647,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
if (HYPERVISOR_dom0_op(&op) == 0)
boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
- xen_init_apic();
-
/* Make sure ACS will be enabled */
pci_request_acs();
@@ -1849,6 +1761,9 @@ static struct notifier_block xen_hvm_cpu_notifier = {
static void __init xen_hvm_guest_init(void)
{
+ if (xen_pv_domain())
+ return;
+
init_hvm_pv_info();
xen_hvm_init_shared_info();
@@ -1864,6 +1779,7 @@ static void __init xen_hvm_guest_init(void)
xen_hvm_init_time_ops();
xen_hvm_init_mmu_ops();
}
+#endif
static bool xen_nopv = false;
static __init int xen_parse_nopv(char *arg)
@@ -1873,14 +1789,11 @@ static __init int xen_parse_nopv(char *arg)
}
early_param("xen_nopv", xen_parse_nopv);
-static uint32_t __init xen_hvm_platform(void)
+static uint32_t __init xen_platform(void)
{
if (xen_nopv)
return 0;
- if (xen_pv_domain())
- return 0;
-
return xen_cpuid_base();
}
@@ -1898,11 +1811,19 @@ bool xen_hvm_need_lapic(void)
}
EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
-const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
- .name = "Xen HVM",
- .detect = xen_hvm_platform,
+static void xen_set_cpu_features(struct cpuinfo_x86 *c)
+{
+ if (xen_pv_domain())
+ clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+}
+
+const struct hypervisor_x86 x86_hyper_xen = {
+ .name = "Xen",
+ .detect = xen_platform,
+#ifdef CONFIG_XEN_PVHVM
.init_platform = xen_hvm_guest_init,
+#endif
.x2apic_available = xen_x2apic_para_available,
+ .set_cpu_features = xen_set_cpu_features,
};
-EXPORT_SYMBOL(x86_hyper_xen_hvm);
-#endif
+EXPORT_SYMBOL(x86_hyper_xen);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..dd151b2045b0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
__visible pudval_t xen_pud_val(pud_t pud)
{
return pte_mfn_to_pfn(pud.pud);
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
/*
* (Yet another) pagetable walker. This one is intended for pinning a
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
pv_mmu_ops.set_pgd = xen_set_pgd;
#endif
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_pgd = xen_set_pgd_hyper,
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
-#endif /* PAGETABLE_LEVELS == 4 */
+#endif /* CONFIG_PGTABLE_LEVELS == 4 */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
@@ -2436,99 +2436,11 @@ void __init xen_hvm_init_mmu_ops(void)
}
#endif
-#ifdef CONFIG_XEN_PVH
-/*
- * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
- * space creating new guest on pvh dom0 and needing to map domU pages.
- */
-static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
- unsigned int domid)
-{
- int rc, err = 0;
- xen_pfn_t gpfn = lpfn;
- xen_ulong_t idx = fgfn;
-
- struct xen_add_to_physmap_range xatp = {
- .domid = DOMID_SELF,
- .foreign_domid = domid,
- .size = 1,
- .space = XENMAPSPACE_gmfn_foreign,
- };
- set_xen_guest_handle(xatp.idxs, &idx);
- set_xen_guest_handle(xatp.gpfns, &gpfn);
- set_xen_guest_handle(xatp.errs, &err);
-
- rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
- if (rc < 0)
- return rc;
- return err;
-}
-
-static int xlate_remove_from_p2m(unsigned long spfn, int count)
-{
- struct xen_remove_from_physmap xrp;
- int i, rc;
-
- for (i = 0; i < count; i++) {
- xrp.domid = DOMID_SELF;
- xrp.gpfn = spfn+i;
- rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
- if (rc)
- break;
- }
- return rc;
-}
-
-struct xlate_remap_data {
- unsigned long fgfn; /* foreign domain's gfn */
- pgprot_t prot;
- domid_t domid;
- int index;
- struct page **pages;
-};
-
-static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
- void *data)
-{
- int rc;
- struct xlate_remap_data *remap = data;
- unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
- pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
-
- rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
- if (rc)
- return rc;
- native_set_pte(ptep, pteval);
-
- return 0;
-}
-
-static int xlate_remap_gfn_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long mfn,
- int nr, pgprot_t prot, unsigned domid,
- struct page **pages)
-{
- int err;
- struct xlate_remap_data pvhdata;
-
- BUG_ON(!pages);
-
- pvhdata.fgfn = mfn;
- pvhdata.prot = prot;
- pvhdata.domid = domid;
- pvhdata.index = 0;
- pvhdata.pages = pages;
- err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
- xlate_map_pte_fn, &pvhdata);
- flush_tlb_all();
- return err;
-}
-#endif
-
#define REMAP_BATCH_SIZE 16
struct remap_data {
- unsigned long mfn;
+ xen_pfn_t *mfn;
+ bool contiguous;
pgprot_t prot;
struct mmu_update *mmu_update;
};
@@ -2537,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
unsigned long addr, void *data)
{
struct remap_data *rmd = data;
- pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
+ pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
+
+ /* If we have a contigious range, just update the mfn itself,
+ else update pointer to be "next mfn". */
+ if (rmd->contiguous)
+ (*rmd->mfn)++;
+ else
+ rmd->mfn++;
rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
rmd->mmu_update->val = pte_val_ma(pte);
@@ -2546,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
return 0;
}
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
- unsigned long addr,
- xen_pfn_t mfn, int nr,
- pgprot_t prot, unsigned domid,
- struct page **pages)
-
+static int do_remap_mfn(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t *mfn, int nr,
+ int *err_ptr, pgprot_t prot,
+ unsigned domid,
+ struct page **pages)
{
+ int err = 0;
struct remap_data rmd;
struct mmu_update mmu_update[REMAP_BATCH_SIZE];
- int batch;
unsigned long range;
- int err = 0;
+ int mapped = 0;
BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
if (xen_feature(XENFEAT_auto_translated_physmap)) {
#ifdef CONFIG_XEN_PVH
/* We need to update the local page tables and the xen HAP */
- return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
- domid, pages);
+ return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+ prot, domid, pages);
#else
return -EINVAL;
#endif
@@ -2573,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
rmd.mfn = mfn;
rmd.prot = prot;
+ /* We use the err_ptr to indicate if there we are doing a contigious
+ * mapping or a discontigious mapping. */
+ rmd.contiguous = !err_ptr;
while (nr) {
- batch = min(REMAP_BATCH_SIZE, nr);
+ int index = 0;
+ int done = 0;
+ int batch = min(REMAP_BATCH_SIZE, nr);
+ int batch_left = batch;
range = (unsigned long)batch << PAGE_SHIFT;
rmd.mmu_update = mmu_update;
@@ -2584,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
if (err)
goto out;
- err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
- if (err < 0)
- goto out;
+ /* We record the error for each page that gives an error, but
+ * continue mapping until the whole set is done */
+ do {
+ int i;
+
+ err = HYPERVISOR_mmu_update(&mmu_update[index],
+ batch_left, &done, domid);
+
+ /*
+ * @err_ptr may be the same buffer as @mfn, so
+ * only clear it after each chunk of @mfn is
+ * used.
+ */
+ if (err_ptr) {
+ for (i = index; i < index + done; i++)
+ err_ptr[i] = 0;
+ }
+ if (err < 0) {
+ if (!err_ptr)
+ goto out;
+ err_ptr[i] = err;
+ done++; /* Skip failed frame. */
+ } else
+ mapped += done;
+ batch_left -= done;
+ index += done;
+ } while (batch_left);
nr -= batch;
addr += range;
+ if (err_ptr)
+ err_ptr += batch;
}
-
- err = 0;
out:
xen_flush_tlb_all();
- return err;
+ return err < 0 ? err : mapped;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t mfn, int nr,
+ pgprot_t prot, unsigned domid,
+ struct page **pages)
+{
+ return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
}
EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
+int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t *mfn, int nr,
+ int *err_ptr, pgprot_t prot,
+ unsigned domid, struct page **pages)
+{
+ /* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
+ * and the consequences later is quite hard to detect what the actual
+ * cause of "wrong memory was mapped in".
+ */
+ BUG_ON(err_ptr == NULL);
+ return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+
+
/* Returns: 0 success */
int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
int numpgs, struct page **pages)
@@ -2609,22 +2583,7 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
return 0;
#ifdef CONFIG_XEN_PVH
- while (numpgs--) {
- /*
- * The mmu has already cleaned up the process mmu
- * resources at this point (lookup_address will return
- * NULL).
- */
- unsigned long pfn = page_to_pfn(pages[numpgs]);
-
- xlate_remove_from_p2m(pfn, 1);
- }
- /*
- * We don't need to flush tlbs because as part of
- * xlate_remove_from_p2m, the hypervisor will do tlb flushes
- * after removing the p2m entries from the EPT/NPT
- */
- return 0;
+ return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
#else
return -EINVAL;
#endif
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7413ee3706d0..86484384492e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
set_cpu_online(cpu, true);
- this_cpu_write(cpu_state, CPU_ONLINE);
-
- wmb();
+ cpu_set_state_online(cpu); /* Implies full memory barrier. */
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
-
- wmb(); /* make sure everything is out */
}
/*
@@ -451,7 +447,13 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
xen_setup_timer(cpu);
xen_init_lock_cpu(cpu);
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /*
+ * PV VCPUs are always successfully taken down (see 'while' loop
+ * in xen_cpu_die()), so -EBUSY is an error.
+ */
+ rc = cpu_check_up_prepare(cpu);
+ if (rc)
+ return rc;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -467,10 +469,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
- while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+ while (cpu_report_state(cpu) != CPU_ONLINE)
HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
- barrier();
- }
return 0;
}
@@ -499,11 +499,11 @@ static void xen_cpu_die(unsigned int cpu)
schedule_timeout(HZ/10);
}
- cpu_die_common(cpu);
-
- xen_smp_intr_free(cpu);
- xen_uninit_lock_cpu(cpu);
- xen_teardown_timer(cpu);
+ if (common_cpu_die(cpu) == 0) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ xen_teardown_timer(cpu);
+ }
}
static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -735,6 +735,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int rc;
+
+ /*
+ * This can happen if CPU was offlined earlier and
+ * offlining timed out in common_cpu_die().
+ */
+ if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+ xen_smp_intr_free(cpu);
+ xen_uninit_lock_cpu(cpu);
+ }
+
/*
* xen_smp_intr_init() needs to run before native_cpu_up()
* so that IPI vectors are set up on the booting CPU before
@@ -756,12 +766,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
return rc;
}
-static void xen_hvm_cpu_die(unsigned int cpu)
-{
- xen_cpu_die(cpu);
- native_cpu_die(cpu);
-}
-
void __init xen_hvm_smp_init(void)
{
if (!xen_have_vector_callback)
@@ -769,7 +773,7 @@ void __init xen_hvm_smp_init(void)
smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
smp_ops.cpu_up = xen_hvm_cpu_up;
- smp_ops.cpu_die = xen_hvm_cpu_die;
+ smp_ops.cpu_die = xen_cpu_die;
smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index d9497698645a..53b4c0811f4f 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -88,7 +88,17 @@ static void xen_vcpu_notify_restore(void *data)
tick_resume_local();
}
+static void xen_vcpu_notify_suspend(void *data)
+{
+ tick_suspend_local();
+}
+
void xen_arch_resume(void)
{
on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
}
+
+void xen_arch_suspend(void)
+{
+ on_each_cpu(xen_vcpu_notify_suspend, NULL, 1);
+}
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
index 520022d1a181..a702ec2f5931 100644
--- a/arch/x86/xen/trace.c
+++ b/arch/x86/xen/trace.c
@@ -1,54 +1,12 @@
#include <linux/ftrace.h>
#include <xen/interface/xen.h>
+#include <xen/interface/xen-mca.h>
-#define N(x) [__HYPERVISOR_##x] = "("#x")"
+#define HYPERCALL(x) [__HYPERVISOR_##x] = "("#x")",
static const char *xen_hypercall_names[] = {
- N(set_trap_table),
- N(mmu_update),
- N(set_gdt),
- N(stack_switch),
- N(set_callbacks),
- N(fpu_taskswitch),
- N(sched_op_compat),
- N(dom0_op),
- N(set_debugreg),
- N(get_debugreg),
- N(update_descriptor),
- N(memory_op),
- N(multicall),
- N(update_va_mapping),
- N(set_timer_op),
- N(event_channel_op_compat),
- N(xen_version),
- N(console_io),
- N(physdev_op_compat),
- N(grant_table_op),
- N(vm_assist),
- N(update_va_mapping_otherdomain),
- N(iret),
- N(vcpu_op),
- N(set_segment_base),
- N(mmuext_op),
- N(acm_op),
- N(nmi_op),
- N(sched_op),
- N(callback_op),
- N(xenoprof_op),
- N(event_channel_op),
- N(physdev_op),
- N(hvm_op),
-
-/* Architecture-specific hypercall definitions. */
- N(arch_0),
- N(arch_1),
- N(arch_2),
- N(arch_3),
- N(arch_4),
- N(arch_5),
- N(arch_6),
- N(arch_7),
+#include <asm/xen-hypercalls.h>
};
-#undef N
+#undef HYPERCALL
static const char *xen_hypercall_name(unsigned op)
{
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 674b222544b7..8afdfccf6086 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -12,6 +12,8 @@
#include <xen/interface/elfnote.h>
#include <xen/interface/features.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/xen-mca.h>
#include <asm/xen/interface.h>
#ifdef CONFIG_XEN_PVH
@@ -85,59 +87,14 @@ ENTRY(xen_pvh_early_cpu_init)
.pushsection .text
.balign PAGE_SIZE
ENTRY(hypercall_page)
-#define NEXT_HYPERCALL(x) \
- ENTRY(xen_hypercall_##x) \
- .skip 32
-
-NEXT_HYPERCALL(set_trap_table)
-NEXT_HYPERCALL(mmu_update)
-NEXT_HYPERCALL(set_gdt)
-NEXT_HYPERCALL(stack_switch)
-NEXT_HYPERCALL(set_callbacks)
-NEXT_HYPERCALL(fpu_taskswitch)
-NEXT_HYPERCALL(sched_op_compat)
-NEXT_HYPERCALL(platform_op)
-NEXT_HYPERCALL(set_debugreg)
-NEXT_HYPERCALL(get_debugreg)
-NEXT_HYPERCALL(update_descriptor)
-NEXT_HYPERCALL(ni)
-NEXT_HYPERCALL(memory_op)
-NEXT_HYPERCALL(multicall)
-NEXT_HYPERCALL(update_va_mapping)
-NEXT_HYPERCALL(set_timer_op)
-NEXT_HYPERCALL(event_channel_op_compat)
-NEXT_HYPERCALL(xen_version)
-NEXT_HYPERCALL(console_io)
-NEXT_HYPERCALL(physdev_op_compat)
-NEXT_HYPERCALL(grant_table_op)
-NEXT_HYPERCALL(vm_assist)
-NEXT_HYPERCALL(update_va_mapping_otherdomain)
-NEXT_HYPERCALL(iret)
-NEXT_HYPERCALL(vcpu_op)
-NEXT_HYPERCALL(set_segment_base)
-NEXT_HYPERCALL(mmuext_op)
-NEXT_HYPERCALL(xsm_op)
-NEXT_HYPERCALL(nmi_op)
-NEXT_HYPERCALL(sched_op)
-NEXT_HYPERCALL(callback_op)
-NEXT_HYPERCALL(xenoprof_op)
-NEXT_HYPERCALL(event_channel_op)
-NEXT_HYPERCALL(physdev_op)
-NEXT_HYPERCALL(hvm_op)
-NEXT_HYPERCALL(sysctl)
-NEXT_HYPERCALL(domctl)
-NEXT_HYPERCALL(kexec_op)
-NEXT_HYPERCALL(tmem_op) /* 38 */
-ENTRY(xen_hypercall_rsvr)
- .skip 320
-NEXT_HYPERCALL(mca) /* 48 */
-NEXT_HYPERCALL(arch_1)
-NEXT_HYPERCALL(arch_2)
-NEXT_HYPERCALL(arch_3)
-NEXT_HYPERCALL(arch_4)
-NEXT_HYPERCALL(arch_5)
-NEXT_HYPERCALL(arch_6)
- .balign PAGE_SIZE
+ .skip PAGE_SIZE
+
+#define HYPERCALL(n) \
+ .equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
+ .type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
+#include <asm/xen-hypercalls.h>
+#undef HYPERCALL
+
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")