Diffstat (limited to 'arch/powerpc')
-rw-r--r--  arch/powerpc/include/asm/archrandom.h      |  11
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s.h      |   3
-rw-r--r--  arch/powerpc/include/asm/kvm_book3s_64.h   |  35
-rw-r--r--  arch/powerpc/include/asm/kvm_host.h        |  47
-rw-r--r--  arch/powerpc/include/asm/kvm_ppc.h         |   2
-rw-r--r--  arch/powerpc/include/asm/pgtable.h         |  28
-rw-r--r--  arch/powerpc/include/asm/time.h            |   3
-rw-r--r--  arch/powerpc/include/uapi/asm/tm.h         |   2
-rw-r--r--  arch/powerpc/kernel/asm-offsets.c          |  20
-rw-r--r--  arch/powerpc/kernel/eeh.c                  |  17
-rw-r--r--  arch/powerpc/kernel/entry_64.S             |  19
-rw-r--r--  arch/powerpc/kernel/idle_power7.S          |   2
-rw-r--r--  arch/powerpc/kernel/io-workarounds.c       |  10
-rw-r--r--  arch/powerpc/kernel/time.c                 |   6
-rw-r--r--  arch/powerpc/kvm/Kconfig                   |  16
-rw-r--r--  arch/powerpc/kvm/book3s.c                  |  76
-rw-r--r--  arch/powerpc/kvm/book3s_64_mmu_hv.c        | 203
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c               | 435
-rw-r--r--  arch/powerpc/kvm/book3s_hv_builtin.c       | 100
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_mmu.c        | 111
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rm_xics.c       | 238
-rw-r--r--  arch/powerpc/kvm/book3s_hv_rmhandlers.S    | 559
-rw-r--r--  arch/powerpc/kvm/book3s_pr_papr.c          |  28
-rw-r--r--  arch/powerpc/kvm/book3s_xics.c             | 105
-rw-r--r--  arch/powerpc/kvm/book3s_xics.h             |  13
-rw-r--r--  arch/powerpc/kvm/e500_mmu_host.c           |  32
-rw-r--r--  arch/powerpc/kvm/powerpc.c                 |   3
-rw-r--r--  arch/powerpc/mm/hash_utils_64.c            |   3
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c              |  32
-rw-r--r--  arch/powerpc/perf/callchain.c              |  24
-rw-r--r--  arch/powerpc/platforms/cell/spufs/inode.c  |  22
-rw-r--r--  arch/powerpc/platforms/powernv/pci-ioda.c  |   2
-rw-r--r--  arch/powerpc/platforms/powernv/rng.c       |  29
-rw-r--r--  arch/powerpc/platforms/pseries/dlpar.c     |  10
34 files changed, 1726 insertions(+), 520 deletions(-)
diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h
index bde531103638..0cc6eedc4780 100644
--- a/arch/powerpc/include/asm/archrandom.h
+++ b/arch/powerpc/include/asm/archrandom.h
@@ -30,8 +30,6 @@ static inline int arch_has_random(void)
return !!ppc_md.get_random_long;
}
-int powernv_get_random_long(unsigned long *v);
-
static inline int arch_get_random_seed_long(unsigned long *v)
{
return 0;
@@ -47,4 +45,13 @@ static inline int arch_has_random_seed(void)
#endif /* CONFIG_ARCH_RANDOM */
+#ifdef CONFIG_PPC_POWERNV
+int powernv_hwrng_present(void);
+int powernv_get_random_long(unsigned long *v);
+int powernv_get_random_real_mode(unsigned long *v);
+#else
+static inline int powernv_hwrng_present(void) { return 0; }
+static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; }
+#endif
+
#endif /* _ASM_POWERPC_ARCHRANDOM_H */
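
These PowerNV hooks back the real-mode H_RANDOM handling added to book3s_hv_builtin.c later in this patch. For context, a pseries guest consumes that hypercall roughly the way the existing pseries rng driver does; the sketch below is illustrative only and example_get_random_long() is not part of the patch:

    /* Illustrative guest-side consumer of H_RANDOM. */
    static int example_get_random_long(unsigned long *v)
    {
            unsigned long retbuf[PLPAR_HCALL_BUFSIZE];

            /* H_RANDOM returns one random number in R4 (retbuf[0]) */
            if (plpar_hcall(H_RANDOM, retbuf) == H_SUCCESS) {
                    *v = retbuf[0];
                    return 1;
            }
            return 0;
    }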
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 993090422690..b91e74a817d8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -288,6 +288,9 @@ static inline bool kvmppc_supports_magic_page(struct kvm_vcpu *vcpu)
return !is_kvmppc_hv_enabled(vcpu->kvm);
}
+extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu);
+extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
+
/* Magic register values loaded into r3 and r4 before the 'sc' assembly
* instruction for the OSI hypercalls */
#define OSI_SC_MAGIC_R3 0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 14619a59ec09..3536d12eb798 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -85,6 +85,20 @@ static inline long try_lock_hpte(__be64 *hpte, unsigned long bits)
return old == 0;
}
+static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
+{
+ hpte_v &= ~HPTE_V_HVLOCK;
+ asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
+ hpte[0] = cpu_to_be64(hpte_v);
+}
+
+/* Without the release barrier; the caller supplies any ordering needed */
+static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
+{
+ hpte_v &= ~HPTE_V_HVLOCK;
+ hpte[0] = cpu_to_be64(hpte_v);
+}
+
static inline int __hpte_actual_psize(unsigned int lp, int psize)
{
int i, shift;
@@ -281,16 +295,17 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type)
/*
* If it's present and writable, atomically set dirty and referenced bits and
- * return the PTE, otherwise return 0. If we find a transparent hugepage
- * and if it is marked splitting we return 0;
+ * return the PTE, otherwise return 0.
*/
-static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
- unsigned int hugepage)
+static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing)
{
pte_t old_pte, new_pte = __pte(0);
while (1) {
- old_pte = *ptep;
+ /*
+ * Make sure we don't reload from ptep
+ */
+ old_pte = READ_ONCE(*ptep);
/*
* wait until _PAGE_BUSY is clear then set it atomically
*/
@@ -298,12 +313,6 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *ptep, int writing,
cpu_relax();
continue;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /* If hugepage and is trans splitting return None */
- if (unlikely(hugepage &&
- pmd_trans_splitting(pte_pmd(old_pte))))
- return __pte(0);
-#endif
/* If pte is not present return None */
if (unlikely(!(pte_val(old_pte) & _PAGE_PRESENT)))
return __pte(0);
@@ -424,6 +433,10 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
return rcu_dereference_raw_notrace(kvm->memslots);
}
+extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
+
+extern void kvmhv_rm_send_ipi(int cpu);
+
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
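
unlock_hpte() pairs with try_lock_hpte() above; the lock/read/unlock pattern it supports is the same one the new debugfs HPT dumper in book3s_64_mmu_hv.c uses, while __unlock_hpte() is for callers that already provide their own ordering (for example an eieio/ptesync sequence around an HPTE update). A minimal sketch of the pattern, with example_read_hpte() being illustrative only:

    /* Illustrative only: read one HPTE consistently under HPTE_V_HVLOCK. */
    static void example_read_hpte(__be64 *hptep, unsigned long *v, unsigned long *r)
    {
            preempt_disable();
            while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
                    cpu_relax();
            *v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
            *r = be64_to_cpu(hptep[1]);
            unlock_hpte(hptep, *v);         /* release barrier, then drop the lock bit */
            preempt_enable();
    }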
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c610961720c7..a193a13cf08b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -227,10 +227,8 @@ struct kvm_arch {
unsigned long host_sdr1;
int tlbie_lock;
unsigned long lpcr;
- unsigned long rmor;
- struct kvm_rma_info *rma;
unsigned long vrma_slb_v;
- int rma_setup_done;
+ int hpte_setup_done;
u32 hpt_order;
atomic_t vcpus_running;
u32 online_vcores;
@@ -239,6 +237,8 @@ struct kvm_arch {
atomic_t hpte_mod_interest;
cpumask_t need_tlb_flush;
int hpt_cma_alloc;
+ struct dentry *debugfs_dir;
+ struct dentry *htab_dentry;
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
struct mutex hpt_mutex;
@@ -263,18 +263,15 @@ struct kvm_arch {
/*
* Struct for a virtual core.
- * Note: entry_exit_count combines an entry count in the bottom 8 bits
- * and an exit count in the next 8 bits. This is so that we can
- * atomically increment the entry count iff the exit count is 0
- * without taking the lock.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits. This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
*/
struct kvmppc_vcore {
int n_runnable;
- int n_busy;
int num_threads;
- int entry_exit_count;
- int n_woken;
- int nap_count;
+ int entry_exit_map;
int napping_threads;
int first_vcpuid;
u16 pcpu;
@@ -299,13 +296,14 @@ struct kvmppc_vcore {
ulong conferring_threads;
};
-#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff)
-#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8)
+#define VCORE_ENTRY_MAP(vc) ((vc)->entry_exit_map & 0xff)
+#define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8)
+#define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0)
/* Values for vcore_state */
#define VCORE_INACTIVE 0
#define VCORE_SLEEPING 1
-#define VCORE_STARTING 2
+#define VCORE_PREEMPT 2
#define VCORE_RUNNING 3
#define VCORE_EXITING 4
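
The exit half of this map is set by kvmhv_commence_exit() in book3s_hv_builtin.c below; the entry side lives in the real-mode assembly in book3s_hv_rmhandlers.S. As a rough C sketch of the lock-free entry check (example_try_enter_guest() is illustrative, not the actual implementation):

    static bool example_try_enter_guest(struct kvmppc_vcore *vc, int ptid)
    {
            int old, new;

            do {
                    old = vc->entry_exit_map;
                    if (old >> 8)                   /* a thread is already exiting */
                            return false;
                    new = old | (1 << ptid);        /* set our bit in the entry bitmap */
            } while (cmpxchg(&vc->entry_exit_map, old, new) != old);
            return true;
    }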
@@ -368,6 +366,14 @@ struct kvmppc_slb {
u8 base_page_size; /* MMU_PAGE_xxx */
};
+/* Struct used to accumulate timing information in HV real mode code */
+struct kvmhv_tb_accumulator {
+ u64 seqcount; /* used to synchronize access, also count * 2 */
+ u64 tb_total; /* total time in timebase ticks */
+ u64 tb_min; /* min time */
+ u64 tb_max; /* max time */
+};
+
# ifdef CONFIG_PPC_FSL_BOOK3E
#define KVMPPC_BOOKE_IAC_NUM 2
#define KVMPPC_BOOKE_DAC_NUM 2
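
The accumulators are updated from the real-mode assembly in book3s_hv_rmhandlers.S and read by debugfs_timings_read() in book3s_hv.c below. A C sketch of one accumulate step, illustrative only (example_accumulate() is not part of the patch):

    static void example_accumulate(struct kvmhv_tb_accumulator *acc, u64 tb_delta)
    {
            acc->seqcount++;                /* odd: update in progress */
            smp_wmb();
            acc->tb_total += tb_delta;
            if (!acc->tb_min || tb_delta < acc->tb_min)
                    acc->tb_min = tb_delta;
            if (tb_delta > acc->tb_max)
                    acc->tb_max = tb_delta;
            smp_wmb();
            acc->seqcount++;                /* even again: entry is consistent */
    }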
@@ -656,6 +662,19 @@ struct kvm_vcpu_arch {
u32 emul_inst;
#endif
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ struct kvmhv_tb_accumulator *cur_activity; /* What we're timing */
+ u64 cur_tb_start; /* when it started */
+ struct kvmhv_tb_accumulator rm_entry; /* real-mode entry code */
+ struct kvmhv_tb_accumulator rm_intr; /* real-mode intr handling */
+ struct kvmhv_tb_accumulator rm_exit; /* real-mode exit code */
+ struct kvmhv_tb_accumulator guest_time; /* guest execution */
+ struct kvmhv_tb_accumulator cede_time; /* time napping inside guest */
+
+ struct dentry *debugfs_dir;
+ struct dentry *debugfs_timings;
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
};
#define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET]
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 46bf652c9169..b8475daad884 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -302,6 +302,8 @@ static inline bool is_kvmppc_hv_enabled(struct kvm *kvm)
return kvm->arch.kvm_ops == kvmppc_hv_ops;
}
+extern int kvmppc_hwrng_present(void);
+
/*
* Cuts out inst bits with ordering according to spec.
* That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 9835ac4173b7..11a38635dd65 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -247,28 +247,16 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
#define pmd_large(pmd) 0
#define has_transparent_hugepage() 0
#endif
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
unsigned *shift);
-
-static inline pte_t *lookup_linux_ptep(pgd_t *pgdir, unsigned long hva,
- unsigned long *pte_sizep)
+static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+ unsigned *shift)
{
- pte_t *ptep;
- unsigned long ps = *pte_sizep;
- unsigned int shift;
-
- ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
- if (!ptep)
- return NULL;
- if (shift)
- *pte_sizep = 1ul << shift;
- else
- *pte_sizep = PAGE_SIZE;
-
- if (ps > *pte_sizep)
- return NULL;
-
- return ptep;
+ if (!arch_irqs_disabled()) {
+ pr_info("%s called with irq enabled\n", __func__);
+ dump_stack();
+ }
+ return __find_linux_pte_or_hugepte(pgdir, ea, shift);
}
#endif /* __ASSEMBLY__ */
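
Callers of find_linux_pte_or_hugepte() must now run with interrupts disabled, which is what keeps the walk safe against THP split/collapse and against the page tables being freed; real-mode and init_mm users call __find_linux_pte_or_hugepte() directly. A minimal caller following the new convention, mirroring the book3s_64_mmu_hv.c change below (example_hva_writable() is illustrative only):

    /* Illustrative only: test whether an hva is mapped writable. */
    static int example_hva_writable(unsigned long hva)
    {
            unsigned long flags;
            pte_t *ptep;
            int ret = 0;

            local_irq_save(flags);
            ptep = find_linux_pte_or_hugepte(current->mm->pgd, hva, NULL);
            if (ptep && pte_present(*ptep) && pte_write(*ptep))
                    ret = 1;
            local_irq_restore(flags);
            return ret;
    }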
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 03cbada59d3a..10fc784a2ad4 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -211,5 +211,8 @@ extern void secondary_cpu_time_init(void);
DECLARE_PER_CPU(u64, decrementers_next_tb);
+/* Convert timebase ticks to nanoseconds */
+unsigned long long tb_to_ns(unsigned long long tb_ticks);
+
#endif /* __KERNEL__ */
#endif /* __POWERPC_TIME_H */
diff --git a/arch/powerpc/include/uapi/asm/tm.h b/arch/powerpc/include/uapi/asm/tm.h
index 5047659815a5..5d836b7c1176 100644
--- a/arch/powerpc/include/uapi/asm/tm.h
+++ b/arch/powerpc/include/uapi/asm/tm.h
@@ -11,7 +11,7 @@
#define TM_CAUSE_RESCHED 0xde
#define TM_CAUSE_TLBI 0xdc
#define TM_CAUSE_FAC_UNAV 0xda
-#define TM_CAUSE_SYSCALL 0xd8
+#define TM_CAUSE_SYSCALL 0xd8 /* future use */
#define TM_CAUSE_MISC 0xd6 /* future use */
#define TM_CAUSE_SIGNAL 0xd4
#define TM_CAUSE_ALIGNMENT 0xd2
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4717859fdd04..0034b6b3556a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -37,6 +37,7 @@
#include <asm/thread_info.h>
#include <asm/rtas.h>
#include <asm/vdso_datapage.h>
+#include <asm/dbell.h>
#ifdef CONFIG_PPC64
#include <asm/paca.h>
#include <asm/lppaca.h>
@@ -459,6 +460,19 @@ int main(void)
DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
#endif
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry));
+ DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr));
+ DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit));
+ DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time));
+ DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time));
+ DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity));
+ DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start));
+ DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount));
+ DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total));
+ DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min));
+ DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max));
+#endif
DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
@@ -492,7 +506,6 @@ int main(void)
DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls));
DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
- DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
@@ -550,8 +563,7 @@ int main(void)
DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
- DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
- DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+ DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map));
DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
@@ -748,5 +760,7 @@ int main(void)
offsetof(struct paca_struct, subcore_sibling_mask));
#endif
+ DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
+
return 0;
}
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index a4c62eb0ee48..9ee61d15653d 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -334,9 +334,11 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
int hugepage_shift;
/*
- * We won't find hugepages here, iomem
+ * We won't find hugepages here (this is iomem). Hence we are not
+ * worried about _PAGE_SPLITTING/collapse. Also we will not hit
+ * a page table free, because this is init_mm.
*/
- ptep = find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
+ ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, &hugepage_shift);
if (!ptep)
return token;
WARN_ON(hugepage_shift);
@@ -747,21 +749,24 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
eeh_unfreeze_pe(pe, false);
eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev);
+ eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
break;
case pcie_hot_reset:
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
eeh_ops->reset(pe, EEH_RESET_HOT);
break;
case pcie_warm_reset:
+ eeh_pe_state_mark(pe, EEH_PE_ISOLATED);
eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
break;
default:
- eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+ eeh_pe_state_clear(pe, EEH_PE_ISOLATED | EEH_PE_CFG_BLOCKED);
return -EINVAL;
};
@@ -1056,6 +1061,9 @@ void eeh_add_device_early(struct pci_dn *pdn)
if (!edev || !eeh_enabled())
return;
+ if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE))
+ return;
+
/* USB Bus children of PCI devices will not have BUID's */
phb = edev->phb;
if (NULL == phb ||
@@ -1110,6 +1118,9 @@ void eeh_add_device_late(struct pci_dev *dev)
return;
}
+ if (eeh_has_flag(EEH_PROBE_MODE_DEV))
+ eeh_ops->probe(pdn, NULL);
+
/*
* The EEH cache might not be removed correctly because of
* unbalanced kref to the device during unplug time, which
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 8ca9434c40e6..afbc20019c2e 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -34,7 +34,6 @@
#include <asm/ftrace.h>
#include <asm/hw_irq.h>
#include <asm/context_tracking.h>
-#include <asm/tm.h>
/*
* System calls.
@@ -146,24 +145,6 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
andi. r11,r10,_TIF_SYSCALL_DOTRACE
bne syscall_dotrace
.Lsyscall_dotrace_cont:
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-BEGIN_FTR_SECTION
- b 1f
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
- extrdi. r11, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
- beq+ 1f
-
- /* Doom the transaction and don't perform the syscall: */
- mfmsr r11
- li r12, 1
- rldimi r11, r12, MSR_TM_LG, 63-MSR_TM_LG
- mtmsrd r11, 0
- li r11, (TM_CAUSE_SYSCALL|TM_CAUSE_PERSISTENT)
- TABORT(R11)
-
- b .Lsyscall_exit
-1:
-#endif
cmpldi 0,r0,NR_syscalls
bge- syscall_enosys
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index eeaa0d5f69d5..ccde8f084ce4 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -501,9 +501,11 @@ BEGIN_FTR_SECTION
CHECK_HMI_INTERRUPT
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
ld r1,PACAR1(r13)
+ ld r6,_CCR(r1)
ld r4,_MSR(r1)
ld r5,_NIP(r1)
addi r1,r1,INT_FRAME_SIZE
+ mtcr r6
mtspr SPRN_SRR1,r4
mtspr SPRN_SRR0,r5
rfid
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index 24b968f8e4d8..63d9cc4d7366 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -71,15 +71,15 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
vaddr = (unsigned long)PCI_FIX_ADDR(addr);
if (vaddr < PHB_IO_BASE || vaddr >= PHB_IO_END)
return NULL;
-
- ptep = find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
+ /*
+ * We won't find huge pages here (iomem). Also can't hit
+ * a page table free due to init_mm
+ */
+ ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
&hugepage_shift);
if (ptep == NULL)
paddr = 0;
else {
- /*
- * we don't have hugepages backing iomem
- */
WARN_ON(hugepage_shift);
paddr = pte_pfn(*ptep) << PAGE_SHIFT;
}
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2d7b33fab953..56f44848b044 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -608,6 +608,12 @@ void arch_suspend_enable_irqs(void)
}
#endif
+unsigned long long tb_to_ns(unsigned long long ticks)
+{
+ return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
+}
+EXPORT_SYMBOL_GPL(tb_to_ns);
+
/*
* Scheduler clock - returns current time in nanosec units.
*
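
tb_to_ns() is used by the new exit-timing debugfs code to convert accumulated timebase deltas for display; on current POWER server CPUs the timebase runs at 512 MHz, so one tick is roughly 2 ns. A trivial illustrative use (example_time_section() and do_work() are hypothetical):

    static void example_time_section(void)
    {
            u64 start = get_tb();

            do_work();      /* hypothetical work being measured */
            pr_info("section took %llu ns\n", tb_to_ns(get_tb() - start));
    }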
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 11850f310fb4..3caec2c42105 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -75,7 +75,7 @@ config KVM_BOOK3S_64
config KVM_BOOK3S_64_HV
tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
- depends on KVM_BOOK3S_64
+ depends on KVM_BOOK3S_64 && PPC_POWERNV
select KVM_BOOK3S_HV_POSSIBLE
select MMU_NOTIFIER
select CMA
@@ -110,6 +110,20 @@ config KVM_BOOK3S_64_PR
processor, including emulating 32-bit processors on a 64-bit
host.
+config KVM_BOOK3S_HV_EXIT_TIMING
+ bool "Detailed timing for hypervisor real-mode code"
+ depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS
+ ---help---
+ Calculate time taken for each vcpu in the real-mode guest entry,
+ exit, and interrupt handling code, plus time spent in the guest
+ and in nap mode due to idle (cede) while other threads are still
+ in the guest. The total, minimum and maximum times in nanoseconds
+ together with the number of executions are reported in debugfs in
+ kvm/vm#/vcpu#/timings. The overhead is of the order of 30 - 40
+ ns per exit on POWER8.
+
+ If unsure, say N.
+
config KVM_BOOKE_HV
bool
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index cfbcdc654201..453a8a47a467 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -821,6 +821,82 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
#endif
}
+int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+ unsigned long size = kvmppc_get_gpr(vcpu, 4);
+ unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+ u64 buf;
+ int ret;
+
+ if (!is_power_of_2(size) || (size > sizeof(buf)))
+ return H_TOO_HARD;
+
+ ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+ if (ret != 0)
+ return H_TOO_HARD;
+
+ switch (size) {
+ case 1:
+ kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf);
+ break;
+
+ case 2:
+ kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf));
+ break;
+
+ case 4:
+ kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf));
+ break;
+
+ case 8:
+ kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf));
+ break;
+
+ default:
+ BUG();
+ }
+
+ return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load);
+
+int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+ unsigned long size = kvmppc_get_gpr(vcpu, 4);
+ unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+ unsigned long val = kvmppc_get_gpr(vcpu, 6);
+ u64 buf;
+ int ret;
+
+ switch (size) {
+ case 1:
+ *(u8 *)&buf = val;
+ break;
+
+ case 2:
+ *(__be16 *)&buf = cpu_to_be16(val);
+ break;
+
+ case 4:
+ *(__be32 *)&buf = cpu_to_be32(val);
+ break;
+
+ case 8:
+ *(__be64 *)&buf = cpu_to_be64(val);
+ break;
+
+ default:
+ return H_TOO_HARD;
+ }
+
+ ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+ if (ret != 0)
+ return H_TOO_HARD;
+
+ return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store);
+
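
A guest reaches these handlers through the normal hcall path with the size in r4 and the logical real address in r5, the loaded value coming back in r4. A hypothetical pseries-side wrapper, shown for illustration only (example_logical_ci_read64() is not part of the patch):

    static u64 example_logical_ci_read64(unsigned long addr)
    {
            unsigned long retbuf[PLPAR_HCALL_BUFSIZE];

            /* args: size in R4, logical real address in R5; value returned in R4 */
            if (plpar_hcall(H_LOGICAL_CI_LOAD, retbuf, 8UL, addr) != H_SUCCESS)
                    return 0;
            return retbuf[0];
    }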
int kvmppc_core_check_processor_compat(void)
{
/*
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 534acb3c6c3d..1a4acf8bf4f4 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -27,6 +27,7 @@
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
@@ -116,12 +117,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
long order;
mutex_lock(&kvm->lock);
- if (kvm->arch.rma_setup_done) {
- kvm->arch.rma_setup_done = 0;
- /* order rma_setup_done vs. vcpus_running */
+ if (kvm->arch.hpte_setup_done) {
+ kvm->arch.hpte_setup_done = 0;
+ /* order hpte_setup_done vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
- kvm->arch.rma_setup_done = 1;
+ kvm->arch.hpte_setup_done = 1;
goto out;
}
}
@@ -338,9 +339,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
gr = kvm->arch.revmap[index].guest_rpte;
- /* Unlock the HPTE */
- asm volatile("lwsync" : : : "memory");
- hptep[0] = cpu_to_be64(v);
+ unlock_hpte(hptep, v);
preempt_enable();
gpte->eaddr = eaddr;
@@ -469,8 +468,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
hpte[1] = be64_to_cpu(hptep[1]);
hpte[2] = r = rev->guest_rpte;
- asm volatile("lwsync" : : : "memory");
- hptep[0] = cpu_to_be64(hpte[0]);
+ unlock_hpte(hptep, hpte[0]);
preempt_enable();
if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
@@ -537,23 +535,21 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
/* if the guest wants write access, see if that is OK */
if (!writing && hpte_is_writable(r)) {
- unsigned int hugepage_shift;
pte_t *ptep, pte;
-
+ unsigned long flags;
/*
* We need to protect against page table destruction
- * while looking up and updating the pte.
+ * and against hugepage split and collapse.
*/
- rcu_read_lock_sched();
+ local_irq_save(flags);
ptep = find_linux_pte_or_hugepte(current->mm->pgd,
- hva, &hugepage_shift);
+ hva, NULL);
if (ptep) {
- pte = kvmppc_read_update_linux_pte(ptep, 1,
- hugepage_shift);
+ pte = kvmppc_read_update_linux_pte(ptep, 1);
if (pte_write(pte))
write_ok = 1;
}
- rcu_read_unlock_sched();
+ local_irq_restore(flags);
}
}
@@ -621,7 +617,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
hptep[1] = cpu_to_be64(r);
eieio();
- hptep[0] = cpu_to_be64(hpte[0]);
+ __unlock_hpte(hptep, hpte[0]);
asm volatile("ptesync" : : : "memory");
preempt_enable();
if (page && hpte_is_writable(r))
@@ -642,7 +638,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return ret;
out_unlock:
- hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
preempt_enable();
goto out_put;
}
@@ -771,7 +767,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
}
}
unlock_rmap(rmapp);
- hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
return 0;
}
@@ -857,7 +853,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
}
ret = 1;
}
- hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
} while ((i = j) != head);
unlock_rmap(rmapp);
@@ -974,8 +970,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
/* Now check and modify the HPTE */
if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
- /* unlock and continue */
- hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
continue;
}
@@ -996,9 +991,9 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
npages_dirty = n;
eieio();
}
- v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+ v &= ~HPTE_V_ABSENT;
v |= HPTE_V_VALID;
- hptep[0] = cpu_to_be64(v);
+ __unlock_hpte(hptep, v);
} while ((i = j) != head);
unlock_rmap(rmapp);
@@ -1218,8 +1213,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
r &= ~HPTE_GR_MODIFIED;
revp->guest_rpte = r;
}
- asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
- hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ unlock_hpte(hptp, be64_to_cpu(hptp[0]));
preempt_enable();
if (!(valid == want_valid && (first_pass || dirty)))
ok = 0;
@@ -1339,20 +1333,20 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
unsigned long tmp[2];
ssize_t nb;
long int err, ret;
- int rma_setup;
+ int hpte_setup;
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
/* lock out vcpus from running while we're doing this */
mutex_lock(&kvm->lock);
- rma_setup = kvm->arch.rma_setup_done;
- if (rma_setup) {
- kvm->arch.rma_setup_done = 0; /* temporarily */
- /* order rma_setup_done vs. vcpus_running */
+ hpte_setup = kvm->arch.hpte_setup_done;
+ if (hpte_setup) {
+ kvm->arch.hpte_setup_done = 0; /* temporarily */
+ /* order hpte_setup_done vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
- kvm->arch.rma_setup_done = 1;
+ kvm->arch.hpte_setup_done = 1;
mutex_unlock(&kvm->lock);
return -EBUSY;
}
@@ -1405,7 +1399,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
"r=%lx\n", ret, i, v, r);
goto out;
}
- if (!rma_setup && is_vrma_hpte(v)) {
+ if (!hpte_setup && is_vrma_hpte(v)) {
unsigned long psize = hpte_base_page_size(v, r);
unsigned long senc = slb_pgsize_encoding(psize);
unsigned long lpcr;
@@ -1414,7 +1408,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
(VRMA_VSID << SLB_VSID_SHIFT_1T);
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
- rma_setup = 1;
+ hpte_setup = 1;
}
++i;
hptp += 2;
@@ -1430,9 +1424,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
}
out:
- /* Order HPTE updates vs. rma_setup_done */
+ /* Order HPTE updates vs. hpte_setup_done */
smp_wmb();
- kvm->arch.rma_setup_done = rma_setup;
+ kvm->arch.hpte_setup_done = hpte_setup;
mutex_unlock(&kvm->lock);
if (err)
@@ -1495,6 +1489,141 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
return ret;
}
+struct debugfs_htab_state {
+ struct kvm *kvm;
+ struct mutex mutex;
+ unsigned long hpt_index;
+ int chars_left;
+ int buf_index;
+ char buf[64];
+};
+
+static int debugfs_htab_open(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+ struct debugfs_htab_state *p;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ kvm_get_kvm(kvm);
+ p->kvm = kvm;
+ mutex_init(&p->mutex);
+ file->private_data = p;
+
+ return nonseekable_open(inode, file);
+}
+
+static int debugfs_htab_release(struct inode *inode, struct file *file)
+{
+ struct debugfs_htab_state *p = file->private_data;
+
+ kvm_put_kvm(p->kvm);
+ kfree(p);
+ return 0;
+}
+
+static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct debugfs_htab_state *p = file->private_data;
+ ssize_t ret, r;
+ unsigned long i, n;
+ unsigned long v, hr, gr;
+ struct kvm *kvm;
+ __be64 *hptp;
+
+ ret = mutex_lock_interruptible(&p->mutex);
+ if (ret)
+ return ret;
+
+ if (p->chars_left) {
+ n = p->chars_left;
+ if (n > len)
+ n = len;
+ r = copy_to_user(buf, p->buf + p->buf_index, n);
+ n -= r;
+ p->chars_left -= n;
+ p->buf_index += n;
+ buf += n;
+ len -= n;
+ ret = n;
+ if (r) {
+ if (!n)
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ kvm = p->kvm;
+ i = p->hpt_index;
+ hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+ if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
+ continue;
+
+ /* lock the HPTE so it's stable and read it */
+ preempt_disable();
+ while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+ cpu_relax();
+ v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
+ hr = be64_to_cpu(hptp[1]);
+ gr = kvm->arch.revmap[i].guest_rpte;
+ unlock_hpte(hptp, v);
+ preempt_enable();
+
+ if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+ continue;
+
+ n = scnprintf(p->buf, sizeof(p->buf),
+ "%6lx %.16lx %.16lx %.16lx\n",
+ i, v, hr, gr);
+ p->chars_left = n;
+ if (n > len)
+ n = len;
+ r = copy_to_user(buf, p->buf, n);
+ n -= r;
+ p->chars_left -= n;
+ p->buf_index = n;
+ buf += n;
+ len -= n;
+ ret += n;
+ if (r) {
+ if (!ret)
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+ p->hpt_index = i;
+
+ out:
+ mutex_unlock(&p->mutex);
+ return ret;
+}
+
+static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return -EACCES;
+}
+
+static const struct file_operations debugfs_htab_fops = {
+ .owner = THIS_MODULE,
+ .open = debugfs_htab_open,
+ .release = debugfs_htab_release,
+ .read = debugfs_htab_read,
+ .write = debugfs_htab_write,
+ .llseek = generic_file_llseek,
+};
+
+void kvmppc_mmu_debugfs_init(struct kvm *kvm)
+{
+ kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
+ kvm->arch.debugfs_dir, kvm,
+ &debugfs_htab_fops);
+}
+
void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index de747563d29d..48d3c5d2ecc9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -32,6 +32,7 @@
#include <linux/page-flags.h>
#include <linux/srcu.h>
#include <linux/miscdevice.h>
+#include <linux/debugfs.h>
#include <asm/reg.h>
#include <asm/cputable.h>
@@ -50,6 +51,7 @@
#include <asm/hvcall.h>
#include <asm/switch_to.h>
#include <asm/smp.h>
+#include <asm/dbell.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
@@ -83,9 +85,35 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+static bool kvmppc_ipi_thread(int cpu)
+{
+ /* On POWER8 for IPIs to threads in the same core, use msgsnd */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ preempt_disable();
+ if (cpu_first_thread_sibling(cpu) ==
+ cpu_first_thread_sibling(smp_processor_id())) {
+ unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+ msg |= cpu_thread_in_core(cpu);
+ smp_mb();
+ __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+ preempt_enable();
+ return true;
+ }
+ preempt_enable();
+ }
+
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
+ if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
+ xics_wake_cpu(cpu);
+ return true;
+ }
+#endif
+
+ return false;
+}
+
static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
{
- int me;
int cpu = vcpu->cpu;
wait_queue_head_t *wqp;
@@ -95,20 +123,12 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
++vcpu->stat.halt_wakeup;
}
- me = get_cpu();
+ if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+ return;
/* CPU points to the first thread of the core */
- if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
-#ifdef CONFIG_PPC_ICP_NATIVE
- int real_cpu = cpu + vcpu->arch.ptid;
- if (paca[real_cpu].kvm_hstate.xics_phys)
- xics_wake_cpu(real_cpu);
- else
-#endif
- if (cpu_online(cpu))
- smp_send_reschedule(cpu);
- }
- put_cpu();
+ if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
+ smp_send_reschedule(cpu);
}
/*
@@ -706,6 +726,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
/* Send the error out to userspace via KVM_RUN */
return rc;
+ case H_LOGICAL_CI_LOAD:
+ ret = kvmppc_h_logical_ci_load(vcpu);
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
+ case H_LOGICAL_CI_STORE:
+ ret = kvmppc_h_logical_ci_store(vcpu);
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
case H_SET_MODE:
ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
@@ -740,6 +770,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
case H_CONFER:
case H_REGISTER_VPA:
case H_SET_MODE:
+ case H_LOGICAL_CI_LOAD:
+ case H_LOGICAL_CI_STORE:
#ifdef CONFIG_KVM_XICS
case H_XIRR:
case H_CPPR:
@@ -1410,6 +1442,154 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
return vcore;
}
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+static struct debugfs_timings_element {
+ const char *name;
+ size_t offset;
+} timings[] = {
+ {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)},
+ {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)},
+ {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)},
+ {"guest", offsetof(struct kvm_vcpu, arch.guest_time)},
+ {"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
+};
+
+#define N_TIMINGS (sizeof(timings) / sizeof(timings[0]))
+
+struct debugfs_timings_state {
+ struct kvm_vcpu *vcpu;
+ unsigned int buflen;
+ char buf[N_TIMINGS * 100];
+};
+
+static int debugfs_timings_open(struct inode *inode, struct file *file)
+{
+ struct kvm_vcpu *vcpu = inode->i_private;
+ struct debugfs_timings_state *p;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ kvm_get_kvm(vcpu->kvm);
+ p->vcpu = vcpu;
+ file->private_data = p;
+
+ return nonseekable_open(inode, file);
+}
+
+static int debugfs_timings_release(struct inode *inode, struct file *file)
+{
+ struct debugfs_timings_state *p = file->private_data;
+
+ kvm_put_kvm(p->vcpu->kvm);
+ kfree(p);
+ return 0;
+}
+
+static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct debugfs_timings_state *p = file->private_data;
+ struct kvm_vcpu *vcpu = p->vcpu;
+ char *s, *buf_end;
+ struct kvmhv_tb_accumulator tb;
+ u64 count;
+ loff_t pos;
+ ssize_t n;
+ int i, loops;
+ bool ok;
+
+ if (!p->buflen) {
+ s = p->buf;
+ buf_end = s + sizeof(p->buf);
+ for (i = 0; i < N_TIMINGS; ++i) {
+ struct kvmhv_tb_accumulator *acc;
+
+ acc = (struct kvmhv_tb_accumulator *)
+ ((unsigned long)vcpu + timings[i].offset);
+ ok = false;
+ for (loops = 0; loops < 1000; ++loops) {
+ count = acc->seqcount;
+ if (!(count & 1)) {
+ smp_rmb();
+ tb = *acc;
+ smp_rmb();
+ if (count == acc->seqcount) {
+ ok = true;
+ break;
+ }
+ }
+ udelay(1);
+ }
+ if (!ok)
+ snprintf(s, buf_end - s, "%s: stuck\n",
+ timings[i].name);
+ else
+ snprintf(s, buf_end - s,
+ "%s: %llu %llu %llu %llu\n",
+ timings[i].name, count / 2,
+ tb_to_ns(tb.tb_total),
+ tb_to_ns(tb.tb_min),
+ tb_to_ns(tb.tb_max));
+ s += strlen(s);
+ }
+ p->buflen = s - p->buf;
+ }
+
+ pos = *ppos;
+ if (pos >= p->buflen)
+ return 0;
+ if (len > p->buflen - pos)
+ len = p->buflen - pos;
+ n = copy_to_user(buf, p->buf + pos, len);
+ if (n) {
+ if (n == len)
+ return -EFAULT;
+ len -= n;
+ }
+ *ppos = pos + len;
+ return len;
+}
+
+static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return -EACCES;
+}
+
+static const struct file_operations debugfs_timings_ops = {
+ .owner = THIS_MODULE,
+ .open = debugfs_timings_open,
+ .release = debugfs_timings_release,
+ .read = debugfs_timings_read,
+ .write = debugfs_timings_write,
+ .llseek = generic_file_llseek,
+};
+
+/* Create a debugfs directory for the vcpu */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+ char buf[16];
+ struct kvm *kvm = vcpu->kvm;
+
+ snprintf(buf, sizeof(buf), "vcpu%u", id);
+ if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+ return;
+ vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
+ if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
+ return;
+ vcpu->arch.debugfs_timings =
+ debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
+ vcpu, &debugfs_timings_ops);
+}
+
+#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+}
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+
static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
unsigned int id)
{
@@ -1479,6 +1659,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
vcpu->arch.cpu_type = KVM_CPU_3S_64;
kvmppc_sanity_check(vcpu);
+ debugfs_vcpu_init(vcpu, id);
+
return vcpu;
free_vcpu:
@@ -1566,8 +1748,10 @@ static int kvmppc_grab_hwthread(int cpu)
tpaca = &paca[cpu];
/* Ensure the thread won't go into the kernel if it wakes */
- tpaca->kvm_hstate.hwthread_req = 1;
tpaca->kvm_hstate.kvm_vcpu = NULL;
+ tpaca->kvm_hstate.napping = 0;
+ smp_wmb();
+ tpaca->kvm_hstate.hwthread_req = 1;
/*
* If the thread is already executing in the kernel (e.g. handling
@@ -1610,35 +1794,41 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
}
cpu = vc->pcpu + vcpu->arch.ptid;
tpaca = &paca[cpu];
- tpaca->kvm_hstate.kvm_vcpu = vcpu;
tpaca->kvm_hstate.kvm_vcore = vc;
tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
vcpu->cpu = vc->pcpu;
+ /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
smp_wmb();
-#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
- if (cpu != smp_processor_id()) {
- xics_wake_cpu(cpu);
- if (vcpu->arch.ptid)
- ++vc->n_woken;
- }
-#endif
+ tpaca->kvm_hstate.kvm_vcpu = vcpu;
+ if (cpu != smp_processor_id())
+ kvmppc_ipi_thread(cpu);
}
-static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+static void kvmppc_wait_for_nap(void)
{
- int i;
+ int cpu = smp_processor_id();
+ int i, loops;
- HMT_low();
- i = 0;
- while (vc->nap_count < vc->n_woken) {
- if (++i >= 1000000) {
- pr_err("kvmppc_wait_for_nap timeout %d %d\n",
- vc->nap_count, vc->n_woken);
- break;
+ for (loops = 0; loops < 1000000; ++loops) {
+ /*
+ * Check if all threads are finished.
+ * We set the vcpu pointer when starting a thread
+ * and the thread clears it when finished, so we look
+ * for any threads that still have a non-NULL vcpu ptr.
+ */
+ for (i = 1; i < threads_per_subcore; ++i)
+ if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+ break;
+ if (i == threads_per_subcore) {
+ HMT_medium();
+ return;
}
- cpu_relax();
+ HMT_low();
}
HMT_medium();
+ for (i = 1; i < threads_per_subcore; ++i)
+ if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+ pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
}
/*
@@ -1700,63 +1890,103 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
}
+static void prepare_threads(struct kvmppc_vcore *vc)
+{
+ struct kvm_vcpu *vcpu, *vnext;
+
+ list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+ arch.run_list) {
+ if (signal_pending(vcpu->arch.run_task))
+ vcpu->arch.ret = -EINTR;
+ else if (vcpu->arch.vpa.update_pending ||
+ vcpu->arch.slb_shadow.update_pending ||
+ vcpu->arch.dtl.update_pending)
+ vcpu->arch.ret = RESUME_GUEST;
+ else
+ continue;
+ kvmppc_remove_runnable(vc, vcpu);
+ wake_up(&vcpu->arch.cpu_run);
+ }
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc)
+{
+ u64 now;
+ long ret;
+ struct kvm_vcpu *vcpu, *vnext;
+
+ now = get_tb();
+ list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+ arch.run_list) {
+ /* cancel pending dec exception if dec is positive */
+ if (now < vcpu->arch.dec_expires &&
+ kvmppc_core_pending_dec(vcpu))
+ kvmppc_core_dequeue_dec(vcpu);
+
+ trace_kvm_guest_exit(vcpu);
+
+ ret = RESUME_GUEST;
+ if (vcpu->arch.trap)
+ ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
+ vcpu->arch.run_task);
+
+ vcpu->arch.ret = ret;
+ vcpu->arch.trap = 0;
+
+ if (vcpu->arch.ceded) {
+ if (!is_kvmppc_resume_guest(ret))
+ kvmppc_end_cede(vcpu);
+ else
+ kvmppc_set_timer(vcpu);
+ }
+ if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+ kvmppc_remove_runnable(vc, vcpu);
+ wake_up(&vcpu->arch.cpu_run);
+ }
+ }
+}
+
/*
* Run a set of guest threads on a physical core.
* Called with vc->lock held.
*/
-static void kvmppc_run_core(struct kvmppc_vcore *vc)
+static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
{
- struct kvm_vcpu *vcpu, *vnext;
- long ret;
- u64 now;
- int i, need_vpa_update;
+ struct kvm_vcpu *vcpu;
+ int i;
int srcu_idx;
- struct kvm_vcpu *vcpus_to_update[threads_per_core];
- /* don't start if any threads have a signal pending */
- need_vpa_update = 0;
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
- if (signal_pending(vcpu->arch.run_task))
- return;
- if (vcpu->arch.vpa.update_pending ||
- vcpu->arch.slb_shadow.update_pending ||
- vcpu->arch.dtl.update_pending)
- vcpus_to_update[need_vpa_update++] = vcpu;
- }
+ /*
+ * Remove from the list any threads that have a signal pending
+ * or need a VPA update done
+ */
+ prepare_threads(vc);
+
+ /* if the runner is no longer runnable, let the caller pick a new one */
+ if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
+ return;
/*
- * Initialize *vc, in particular vc->vcore_state, so we can
- * drop the vcore lock if necessary.
+ * Initialize *vc.
*/
- vc->n_woken = 0;
- vc->nap_count = 0;
- vc->entry_exit_count = 0;
+ vc->entry_exit_map = 0;
vc->preempt_tb = TB_NIL;
- vc->vcore_state = VCORE_STARTING;
vc->in_guest = 0;
vc->napping_threads = 0;
vc->conferring_threads = 0;
/*
- * Updating any of the vpas requires calling kvmppc_pin_guest_page,
- * which can't be called with any spinlocks held.
- */
- if (need_vpa_update) {
- spin_unlock(&vc->lock);
- for (i = 0; i < need_vpa_update; ++i)
- kvmppc_update_vpas(vcpus_to_update[i]);
- spin_lock(&vc->lock);
- }
-
- /*
* Make sure we are running on primary threads, and that secondary
* threads are offline. Also check if the number of threads in this
* guest are greater than the current system threads per guest.
*/
if ((threads_per_core > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
vcpu->arch.ret = -EBUSY;
+ kvmppc_remove_runnable(vc, vcpu);
+ wake_up(&vcpu->arch.cpu_run);
+ }
goto out;
}
@@ -1797,8 +2027,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
vcpu->cpu = -1;
/* wait for secondary threads to finish writing their state to memory */
- if (vc->nap_count < vc->n_woken)
- kvmppc_wait_for_nap(vc);
+ kvmppc_wait_for_nap();
for (i = 0; i < threads_per_subcore; ++i)
kvmppc_release_hwthread(vc->pcpu + i);
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
@@ -1812,44 +2041,12 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
kvm_guest_exit();
preempt_enable();
- cond_resched();
spin_lock(&vc->lock);
- now = get_tb();
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
- /* cancel pending dec exception if dec is positive */
- if (now < vcpu->arch.dec_expires &&
- kvmppc_core_pending_dec(vcpu))
- kvmppc_core_dequeue_dec(vcpu);
-
- trace_kvm_guest_exit(vcpu);
-
- ret = RESUME_GUEST;
- if (vcpu->arch.trap)
- ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
- vcpu->arch.run_task);
-
- vcpu->arch.ret = ret;
- vcpu->arch.trap = 0;
-
- if (vcpu->arch.ceded) {
- if (!is_kvmppc_resume_guest(ret))
- kvmppc_end_cede(vcpu);
- else
- kvmppc_set_timer(vcpu);
- }
- }
+ post_guest_process(vc);
out:
vc->vcore_state = VCORE_INACTIVE;
- list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
- arch.run_list) {
- if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
- kvmppc_remove_runnable(vc, vcpu);
- wake_up(&vcpu->arch.cpu_run);
- }
- }
-
trace_kvmppc_run_core(vc, 1);
}
@@ -1939,8 +2136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
* this thread straight away and have it join in.
*/
if (!signal_pending(current)) {
- if (vc->vcore_state == VCORE_RUNNING &&
- VCORE_EXIT_COUNT(vc) == 0) {
+ if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu);
trace_kvm_guest_enter(vcpu);
@@ -1971,7 +2167,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
break;
- vc->runner = vcpu;
n_ceded = 0;
list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
if (!v->arch.pending_exceptions)
@@ -1979,10 +2174,17 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
else
v->arch.ceded = 0;
}
- if (n_ceded == vc->n_runnable)
+ vc->runner = vcpu;
+ if (n_ceded == vc->n_runnable) {
kvmppc_vcore_blocked(vc);
- else
+ } else if (should_resched()) {
+ vc->vcore_state = VCORE_PREEMPT;
+ /* Let something else run */
+ cond_resched_lock(&vc->lock);
+ vc->vcore_state = VCORE_INACTIVE;
+ } else {
kvmppc_run_core(vc);
+ }
vc->runner = NULL;
}
@@ -2032,11 +2234,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
atomic_inc(&vcpu->kvm->arch.vcpus_running);
- /* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+ /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
smp_mb();
/* On the first time here, set up HTAB and VRMA */
- if (!vcpu->kvm->arch.rma_setup_done) {
+ if (!vcpu->kvm->arch.hpte_setup_done) {
r = kvmppc_hv_setup_htab_rma(vcpu);
if (r)
goto out;
@@ -2238,7 +2440,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
int srcu_idx;
mutex_lock(&kvm->lock);
- if (kvm->arch.rma_setup_done)
+ if (kvm->arch.hpte_setup_done)
goto out; /* another vcpu beat us to it */
/* Allocate hashed page table (if not done already) and reset it */
@@ -2289,9 +2491,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
- /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+ /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
smp_wmb();
- kvm->arch.rma_setup_done = 1;
+ kvm->arch.hpte_setup_done = 1;
err = 0;
out_srcu:
srcu_read_unlock(&kvm->srcu, srcu_idx);
@@ -2307,6 +2509,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
static int kvmppc_core_init_vm_hv(struct kvm *kvm)
{
unsigned long lpcr, lpid;
+ char buf[32];
/* Allocate the guest's logical partition ID */
@@ -2347,6 +2550,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
*/
kvm_hv_vm_activated();
+ /*
+ * Create a debugfs directory for the VM
+ */
+ snprintf(buf, sizeof(buf), "vm%d", current->pid);
+ kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
+ if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+ kvmppc_mmu_debugfs_init(kvm);
+
return 0;
}
@@ -2367,6 +2578,8 @@ static void kvmppc_free_vcores(struct kvm *kvm)
static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
+ debugfs_remove_recursive(kvm->arch.debugfs_dir);
+
kvm_hv_vm_deactivated();
kvmppc_free_vcores(kvm);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 1f083ff8a61a..ed2589d4593f 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -21,6 +21,10 @@
#include <asm/cputable.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
+#include <asm/archrandom.h>
+#include <asm/xics.h>
+#include <asm/dbell.h>
+#include <asm/cputhreads.h>
#define KVM_CMA_CHUNK_ORDER 18
@@ -114,11 +118,11 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
int rv = H_SUCCESS; /* => don't yield */
set_bit(vcpu->arch.ptid, &vc->conferring_threads);
- while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) {
- threads_running = VCORE_ENTRY_COUNT(vc);
- threads_ceded = hweight32(vc->napping_threads);
- threads_conferring = hweight32(vc->conferring_threads);
- if (threads_ceded + threads_conferring >= threads_running) {
+ while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
+ threads_running = VCORE_ENTRY_MAP(vc);
+ threads_ceded = vc->napping_threads;
+ threads_conferring = vc->conferring_threads;
+ if ((threads_ceded | threads_conferring) == threads_running) {
rv = H_TOO_HARD; /* => do yield */
break;
}
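
napping_threads and conferring_threads are per-thread bitmaps indexed by ptid, just like VCORE_ENTRY_MAP, so the OR-and-compare above asks whether every thread that entered the guest has either ceded or is conferring. A worked example with illustrative values:

    /* Worked example (illustrative values): threads 0-2 entered the guest. */
    unsigned long threads_running    = 0x07;   /* VCORE_ENTRY_MAP(vc) */
    unsigned long threads_ceded      = 0x02;   /* thread 1 napping after H_CEDE */
    unsigned long threads_conferring = 0x05;   /* threads 0 and 2 in H_CONFER */

    /* (0x02 | 0x05) == 0x07: every thread that entered has ceded or is
     * conferring, so the check returns H_TOO_HARD and the core is yielded. */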
@@ -169,3 +173,89 @@ int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
return 0;
}
EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+int kvmppc_hwrng_present(void)
+{
+ return powernv_hwrng_present();
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_h_random(struct kvm_vcpu *vcpu)
+{
+ if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
+ return H_SUCCESS;
+
+ return H_HARDWARE;
+}
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+ __asm__ __volatile__("stbcix %0,0,%1"
+ : : "r" (val), "r" (paddr) : "memory");
+}
+
+/*
+ * Send an interrupt or message to another CPU.
+ * This can only be called in real mode.
+ * The caller needs to include any barrier needed to order writes
+ * to memory vs. the IPI/message.
+ */
+void kvmhv_rm_send_ipi(int cpu)
+{
+ unsigned long xics_phys;
+
+ /* On POWER8 for IPIs to threads in the same core, use msgsnd */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ cpu_first_thread_sibling(cpu) ==
+ cpu_first_thread_sibling(raw_smp_processor_id())) {
+ unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+ msg |= cpu_thread_in_core(cpu);
+ __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+ return;
+ }
+
+ /* Else poke the target with an IPI */
+ xics_phys = paca[cpu].kvm_hstate.xics_phys;
+ rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+/*
+ * The following functions are called from the assembly code
+ * in book3s_hv_rmhandlers.S.
+ */
+static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
+{
+ int cpu = vc->pcpu;
+
+ /* Order setting of exit map vs. msgsnd/IPI */
+ smp_mb();
+ for (; active; active >>= 1, ++cpu)
+ if (active & 1)
+ kvmhv_rm_send_ipi(cpu);
+}
+
+void kvmhv_commence_exit(int trap)
+{
+ struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+ int ptid = local_paca->kvm_hstate.ptid;
+ int me, ee;
+
+ /* Set our bit in the threads-exiting-guest map in the 0xff00
+ bits of vcore->entry_exit_map */
+ me = 0x100 << ptid;
+ do {
+ ee = vc->entry_exit_map;
+ } while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);
+
+ /* Are we the first here? */
+ if ((ee >> 8) != 0)
+ return;
+
+ /*
+ * Trigger the other threads in this vcore to exit the guest.
+ * If this is a hypervisor decrementer interrupt then they
+ * will be already on their way out of the guest.
+ */
+ if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
+ kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e4d3b0..b027a89737b6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -26,11 +26,14 @@ static void *real_vmalloc_addr(void *x)
{
unsigned long addr = (unsigned long) x;
pte_t *p;
-
- p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
+ /*
+ * Assume we don't have huge pages in vmalloc space, so we don't
+ * need to worry about THP collapse/split. This is called only in
+ * real mode, hence there is no need for irq_save/restore.
+ */
+ p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
if (!p || !pte_present(*p))
return NULL;
- /* assume we don't have huge pages in vmalloc space... */
addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
return __va(addr);
}
@@ -131,31 +134,6 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
unlock_rmap(rmap);
}
-static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
- int writing, unsigned long *pte_sizep)
-{
- pte_t *ptep;
- unsigned long ps = *pte_sizep;
- unsigned int hugepage_shift;
-
- ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
- if (!ptep)
- return __pte(0);
- if (hugepage_shift)
- *pte_sizep = 1ul << hugepage_shift;
- else
- *pte_sizep = PAGE_SIZE;
- if (ps > *pte_sizep)
- return __pte(0);
- return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
-}
-
-static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
-{
- asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
- hpte[0] = cpu_to_be64(hpte_v);
-}
-
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel,
pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
@@ -166,13 +144,13 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
struct revmap_entry *rev;
unsigned long g_ptel;
struct kvm_memory_slot *memslot;
- unsigned long pte_size;
+ unsigned hpage_shift;
unsigned long is_io;
unsigned long *rmap;
- pte_t pte;
+ pte_t *ptep;
unsigned int writing;
unsigned long mmu_seq;
- unsigned long rcbits;
+ unsigned long rcbits, irq_flags = 0;
psize = hpte_page_size(pteh, ptel);
if (!psize)
@@ -208,22 +186,46 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* Translate to host virtual address */
hva = __gfn_to_hva_memslot(memslot, gfn);
-
- /* Look up the Linux PTE for the backing page */
- pte_size = psize;
- pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
- if (pte_present(pte) && !pte_protnone(pte)) {
- if (writing && !pte_write(pte))
- /* make the actual HPTE be read-only */
- ptel = hpte_make_readonly(ptel);
- is_io = hpte_cache_bits(pte_val(pte));
- pa = pte_pfn(pte) << PAGE_SHIFT;
- pa |= hva & (pte_size - 1);
- pa |= gpa & ~PAGE_MASK;
+ /*
+ * If we had a page table change after lookup, we would
+ * retry via mmu_notifier_retry.
+ */
+ if (realmode)
+ ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
+ else {
+ local_irq_save(irq_flags);
+ ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift);
}
+ if (ptep) {
+ pte_t pte;
+ unsigned int host_pte_size;
- if (pte_size < psize)
- return H_PARAMETER;
+ if (hpage_shift)
+ host_pte_size = 1ul << hpage_shift;
+ else
+ host_pte_size = PAGE_SIZE;
+ /*
+ * The guest page size should never be larger than the size
+ * of the host page backing the memory.
+ */
+ if (host_pte_size < psize) {
+ if (!realmode)
+ local_irq_restore(irq_flags);
+ return H_PARAMETER;
+ }
+ pte = kvmppc_read_update_linux_pte(ptep, writing);
+ if (pte_present(pte) && !pte_protnone(pte)) {
+ if (writing && !pte_write(pte))
+ /* make the actual HPTE be read-only */
+ ptel = hpte_make_readonly(ptel);
+ is_io = hpte_cache_bits(pte_val(pte));
+ pa = pte_pfn(pte) << PAGE_SHIFT;
+ pa |= hva & (host_pte_size - 1);
+ pa |= gpa & ~PAGE_MASK;
+ }
+ }
+ if (!realmode)
+ local_irq_restore(irq_flags);
ptel &= ~(HPTE_R_PP0 - psize);
ptel |= pa;
@@ -271,10 +273,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
u64 pte;
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
- pte = be64_to_cpu(*hpte);
+ pte = be64_to_cpu(hpte[0]);
if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
break;
- *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ __unlock_hpte(hpte, pte);
hpte += 2;
}
if (i == 8)
@@ -290,9 +292,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
- pte = be64_to_cpu(*hpte);
+ pte = be64_to_cpu(hpte[0]);
if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
- *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ __unlock_hpte(hpte, pte);
return H_PTEG_FULL;
}
}
@@ -331,7 +333,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* Write the first HPTE dword, unlocking the HPTE and making it valid */
eieio();
- hpte[0] = cpu_to_be64(pteh);
+ __unlock_hpte(hpte, pteh);
asm volatile("ptesync" : : : "memory");
*pte_idx_ret = pte_index;
@@ -412,7 +414,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
((flags & H_ANDCOND) && (pte & avpn) != 0)) {
- hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ __unlock_hpte(hpte, pte);
return H_NOT_FOUND;
}
@@ -548,7 +550,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
args[j] |= rcbits << (56 - 5);
- hp[0] = 0;
+ __unlock_hpte(hp, 0);
}
}
@@ -574,7 +576,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
pte = be64_to_cpu(hpte[0]);
if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
- hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+ __unlock_hpte(hpte, pte);
return H_NOT_FOUND;
}
@@ -755,8 +757,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
/* Return with the HPTE still locked */
return (hash << 3) + (i >> 1);
- /* Unlock and move on */
- hpte[i] = cpu_to_be64(v);
+ __unlock_hpte(&hpte[i], v);
}
if (val & HPTE_V_SECONDARY)
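The H_ENTER changes above replace the old fixed-size Linux PTE lookup with a walk that may return a hugepage PTE, derive the host mapping size from hpage_shift, and refuse the insert when the guest page would not fit inside the host mapping. The stand-alone sketch below models only that size check; the constants and the H_PARAMETER value are stand-ins for illustration, not taken from the kernel headers.

#include <stdio.h>

#define PAGE_SIZE	(1ul << 12)	/* stand-in for the host base page size */
#define H_PARAMETER	(-4L)		/* stand-in for the PAPR error code */

static long check_host_mapping(unsigned int hpage_shift, unsigned long psize)
{
	unsigned long host_pte_size;

	/* hpage_shift == 0 means the host maps this address with a base page */
	host_pte_size = hpage_shift ? (1ul << hpage_shift) : PAGE_SIZE;

	/* The guest page must fit inside the host-side mapping */
	if (host_pte_size < psize)
		return H_PARAMETER;
	return 0;
}

int main(void)
{
	/* 64K guest page backed by a 16M host hugepage: accepted */
	printf("%ld\n", check_host_mapping(24, 1ul << 16));
	/* 16M guest page backed by a 64K host page: rejected */
	printf("%ld\n", check_host_mapping(16, 1ul << 24));
	return 0;
}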
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 7c22997de906..00e45b6d4f24 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -23,17 +23,37 @@
#define DEBUG_PASSUP
-static inline void rm_writeb(unsigned long paddr, u8 val)
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+ u32 new_irq);
+
+/* -- ICS routines -- */
+static void ics_rm_check_resend(struct kvmppc_xics *xics,
+ struct kvmppc_ics *ics, struct kvmppc_icp *icp)
{
- __asm__ __volatile__("sync; stbcix %0,0,%1"
- : : "r" (val), "r" (paddr) : "memory");
+ int i;
+
+ arch_spin_lock(&ics->lock);
+
+ for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+ struct ics_irq_state *state = &ics->irq_state[i];
+
+ if (!state->resend)
+ continue;
+
+ arch_spin_unlock(&ics->lock);
+ icp_rm_deliver_irq(xics, icp, state->number);
+ arch_spin_lock(&ics->lock);
+ }
+
+ arch_spin_unlock(&ics->lock);
}
+/* -- ICP routines -- */
+
static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
struct kvm_vcpu *this_vcpu)
{
struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
- unsigned long xics_phys;
int cpu;
/* Mark the target VCPU as having an interrupt pending */
@@ -56,9 +76,8 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
/* In SMT cpu will always point to thread 0, we adjust it */
cpu += vcpu->arch.ptid;
- /* Not too hard, then poke the target */
- xics_phys = paca[cpu].kvm_hstate.xics_phys;
- rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+ smp_mb();
+ kvmhv_rm_send_ipi(cpu);
}
static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
@@ -116,6 +135,180 @@ static inline int check_too_hard(struct kvmppc_xics *xics,
return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
}
+static void icp_rm_check_resend(struct kvmppc_xics *xics,
+ struct kvmppc_icp *icp)
+{
+ u32 icsid;
+
+ /* Order this load with the test for need_resend in the caller */
+ smp_rmb();
+ for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+ struct kvmppc_ics *ics = xics->ics[icsid];
+
+ if (!test_and_clear_bit(icsid, icp->resend_map))
+ continue;
+ if (!ics)
+ continue;
+ ics_rm_check_resend(xics, ics, icp);
+ }
+}
+
+static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+ u32 *reject)
+{
+ union kvmppc_icp_state old_state, new_state;
+ bool success;
+
+ do {
+ old_state = new_state = READ_ONCE(icp->state);
+
+ *reject = 0;
+
+ /* See if we can deliver */
+ success = new_state.cppr > priority &&
+ new_state.mfrr > priority &&
+ new_state.pending_pri > priority;
+
+ /*
+ * If we can, check for a rejection and perform the
+ * delivery
+ */
+ if (success) {
+ *reject = new_state.xisr;
+ new_state.xisr = irq;
+ new_state.pending_pri = priority;
+ } else {
+ /*
+ * If we failed to deliver we set need_resend
+ * so a subsequent CPPR state change causes us
+ * to try a new delivery.
+ */
+ new_state.need_resend = true;
+ }
+
+ } while (!icp_rm_try_update(icp, old_state, new_state));
+
+ return success;
+}
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+ u32 new_irq)
+{
+ struct ics_irq_state *state;
+ struct kvmppc_ics *ics;
+ u32 reject;
+ u16 src;
+
+ /*
+ * This is used both for initial delivery of an interrupt and
+ * for subsequent rejection.
+ *
+ * Rejection can be racy vs. resends. We have evaluated the
+ * rejection in an atomic ICP transaction which is now complete,
+ * so potentially the ICP can already accept the interrupt again.
+ *
+ * So we need to retry the delivery. Essentially the reject path
+ * boils down to a failed delivery. Always.
+ *
+ * Now the interrupt could also have moved to a different target,
+ * thus we may need to re-do the ICP lookup as well
+ */
+
+ again:
+ /* Get the ICS state and lock it */
+ ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+ if (!ics) {
+ /* Unsafe increment, but this does not need to be accurate */
+ xics->err_noics++;
+ return;
+ }
+ state = &ics->irq_state[src];
+
+ /* Get a lock on the ICS */
+ arch_spin_lock(&ics->lock);
+
+ /* Get our server */
+ if (!icp || state->server != icp->server_num) {
+ icp = kvmppc_xics_find_server(xics->kvm, state->server);
+ if (!icp) {
+ /* Unsafe increment again */
+ xics->err_noicp++;
+ goto out;
+ }
+ }
+
+ /* Clear the resend bit of that interrupt */
+ state->resend = 0;
+
+ /*
+ * If masked, bail out
+ *
+ * Note: PAPR doesn't mention anything about masked pending
+ * when doing a resend, only when doing a delivery.
+ *
+ * However that would have the effect of losing a masked
+ * interrupt that was rejected and isn't consistent with
+ * the whole masked_pending business which is about not
+ * losing interrupts that occur while masked.
+ *
+ * We don't differentiate between normal deliveries and resends, so this
+ * implementation will differ from PAPR and not lose such
+ * interrupts.
+ */
+ if (state->priority == MASKED) {
+ state->masked_pending = 1;
+ goto out;
+ }
+
+ /*
+ * Try the delivery, this will set the need_resend flag
+ * in the ICP as part of the atomic transaction if the
+ * delivery is not possible.
+ *
+ * Note that if successful, the new delivery might have itself
+ * rejected an interrupt that was "delivered" before we took the
+ * ics spin lock.
+ *
+ * In this case we do the whole sequence all over again for the
+ * new guy. We cannot assume that the rejected interrupt is less
+ * favored than the new one, and thus doesn't need to be delivered,
+ * because by the time we exit icp_rm_try_to_deliver() the target
+ * processor may well have already consumed & completed it, and thus
+ * the rejected interrupt might actually be already acceptable.
+ */
+ if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+ /*
+ * Delivery was successful, did we reject somebody else ?
+ */
+ if (reject && reject != XICS_IPI) {
+ arch_spin_unlock(&ics->lock);
+ new_irq = reject;
+ goto again;
+ }
+ } else {
+ /*
+ * We failed to deliver the interrupt, so we need to set the
+ * resend map bit and mark the ICS state as needing a resend.
+ */
+ set_bit(ics->icsid, icp->resend_map);
+ state->resend = 1;
+
+ /*
+ * If the need_resend flag got cleared in the ICP some time
+ * between the icp_rm_try_to_deliver() atomic update and now,
+ * then we know it might have missed the resend_map bit, so
+ * we retry.
+ */
+ smp_mb();
+ if (!icp->state.need_resend) {
+ arch_spin_unlock(&ics->lock);
+ goto again;
+ }
+ }
+ out:
+ arch_spin_unlock(&ics->lock);
+}
+
static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
u8 new_cppr)
{
@@ -184,8 +377,8 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
* separately here as well.
*/
if (resend) {
- icp->rm_action |= XICS_RM_CHECK_RESEND;
- icp->rm_resend_icp = icp;
+ icp->n_check_resend++;
+ icp_rm_check_resend(xics, icp);
}
}
@@ -300,16 +493,16 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
}
} while (!icp_rm_try_update(icp, old_state, new_state));
- /* Pass rejects to virtual mode */
+ /* Handle reject in real mode */
if (reject && reject != XICS_IPI) {
- this_icp->rm_action |= XICS_RM_REJECT;
- this_icp->rm_reject = reject;
+ this_icp->n_reject++;
+ icp_rm_deliver_irq(xics, icp, reject);
}
- /* Pass resends to virtual mode */
+ /* Handle resends in real mode */
if (resend) {
- this_icp->rm_action |= XICS_RM_CHECK_RESEND;
- this_icp->rm_resend_icp = icp;
+ this_icp->n_check_resend++;
+ icp_rm_check_resend(xics, icp);
}
return check_too_hard(xics, this_icp);
@@ -365,10 +558,13 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
} while (!icp_rm_try_update(icp, old_state, new_state));
- /* Pass rejects to virtual mode */
+ /*
+ * Check for rejects. They are handled by doing a new delivery
+ * attempt (see comments in icp_rm_deliver_irq).
+ */
if (reject && reject != XICS_IPI) {
- icp->rm_action |= XICS_RM_REJECT;
- icp->rm_reject = reject;
+ icp->n_reject++;
+ icp_rm_deliver_irq(xics, icp, reject);
}
bail:
return check_too_hard(xics, icp);
@@ -416,10 +612,10 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
goto bail;
state = &ics->irq_state[src];
- /* Still asserted, resend it, we make it look like a reject */
+ /* Still asserted, resend it */
if (state->asserted) {
- icp->rm_action |= XICS_RM_REJECT;
- icp->rm_reject = irq;
+ icp->n_reject++;
+ icp_rm_deliver_irq(xics, icp, irq);
}
if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
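icp_rm_deliver_irq() and icp_rm_try_to_deliver() above follow the same lock-free pattern as the virtual-mode XICS code: the whole ICP state lives in one 64-bit word updated with a compare-and-swap loop, and a rejected interrupt simply restarts the delivery. The user-space model below sketches that update loop with C11 atomics; the packed field layout is invented for the example and does not match the kernel's union kvmppc_icp_state or icp_rm_try_update().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented packing: the whole state must fit in one 64-bit word so a
 * single CAS covers it, which is the property the real code relies on. */
struct icp_fields {
	uint8_t  cppr;
	uint8_t  mfrr;
	uint8_t  pending_pri;
	bool     need_resend;
	uint32_t xisr;
};

union icp_word {
	struct icp_fields s;
	uint64_t raw;
};

static _Atomic uint64_t icp;	/* one ICP, shared between CPUs */

static bool try_to_deliver(uint32_t irq, uint8_t priority, uint32_t *reject)
{
	union icp_word old, new;
	bool success;

	old.raw = atomic_load(&icp);
	do {
		new = old;
		*reject = 0;

		/* Deliver only if the new interrupt is more favored than
		 * the current priority, MFRR and pending interrupt. */
		success = new.s.cppr > priority && new.s.mfrr > priority &&
			  new.s.pending_pri > priority;
		if (success) {
			*reject = new.s.xisr;	/* previous xisr gets bumped */
			new.s.xisr = irq;
			new.s.pending_pri = priority;
		} else {
			/* Ask for a resend on the next CPPR change */
			new.s.need_resend = true;
		}
		/* Retry if another CPU changed the ICP word under us;
		 * on failure old.raw is refreshed with the current value. */
	} while (!atomic_compare_exchange_weak(&icp, &old.raw, new.raw));

	return success;
}

int main(void)
{
	union icp_word init = { .s = { .cppr = 0xff, .mfrr = 0xff,
				       .pending_pri = 0xff, .xisr = 0 } };
	uint32_t reject;

	atomic_store(&icp, init.raw);
	printf("delivered: %d, rejected: %#x\n",
	       try_to_deliver(0x1001, 5, &reject), reject);
	return 0;
}

Because the reject/resend decision is made inside the loop, a failed compare-and-swap simply recomputes the proposed state from the fresh snapshot, which is why a rejection always boils down to retrying the delivery.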
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 6cbf1630cb70..4d70df26c402 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -172,6 +172,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */
+ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+ mfspr r3, SPRN_HDEC
+ mtspr SPRN_DEC, r3
+ /*
+ * Make sure the primary has finished the MMU switch.
+ * We should never get here on a secondary thread, but
+ * check it for robustness' sake.
+ */
+ ld r5, HSTATE_KVM_VCORE(r13)
+65: lbz r0, VCORE_IN_GUEST(r5)
+ cmpwi r0, 0
+ beq 65b
+ /* Set LPCR. */
+ ld r8,VCORE_LPCR(r5)
+ mtspr SPRN_LPCR,r8
+ isync
/* set our bit in napping_threads */
ld r5, HSTATE_KVM_VCORE(r13)
lbz r7, HSTATE_PTID(r13)
@@ -182,7 +198,7 @@ kvmppc_primary_no_guest:
or r3, r3, r0
stwcx. r3, 0, r6
bne 1b
- /* order napping_threads update vs testing entry_exit_count */
+ /* order napping_threads update vs testing entry_exit_map */
isync
li r12, 0
lwz r7, VCORE_ENTRY_EXIT(r5)
@@ -191,6 +207,7 @@ kvmppc_primary_no_guest:
li r3, NAPPING_NOVCPU
stb r3, HSTATE_NAPPING(r13)
+ li r3, 0 /* Don't wake on privileged (OS) doorbell */
b kvm_do_nap
kvm_novcpu_wakeup:
@@ -202,7 +219,7 @@ kvm_novcpu_wakeup:
/* check the wake reason */
bl kvmppc_check_wake_reason
-
+
/* see if any other thread is already exiting */
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
@@ -222,13 +239,37 @@ kvm_novcpu_wakeup:
cmpdi r3, 0
bge kvm_novcpu_exit
+ /* See if our timeslice has expired (HDEC is negative) */
+ mfspr r0, SPRN_HDEC
+ li r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+ cmpwi r0, 0
+ blt kvm_novcpu_exit
+
/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
ld r4, HSTATE_KVM_VCPU(r13)
cmpdi r4, 0
- bne kvmppc_got_guest
+ beq kvmppc_primary_no_guest
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ addi r3, r4, VCPU_TB_RMENTRY
+ bl kvmhv_start_timing
+#endif
+ b kvmppc_got_guest
kvm_novcpu_exit:
- b hdec_soon
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ ld r4, HSTATE_KVM_VCPU(r13)
+ cmpdi r4, 0
+ beq 13f
+ addi r3, r4, VCPU_TB_RMEXIT
+ bl kvmhv_accumulate_time
+#endif
+13: mr r3, r12
+ stw r12, 112-4(r1)
+ bl kvmhv_commence_exit
+ nop
+ lwz r12, 112-4(r1)
+ b kvmhv_switch_to_host
/*
* We come in here when wakened from nap mode.
@@ -239,9 +280,9 @@ kvm_novcpu_exit:
kvm_start_guest:
/* Set runlatch bit the minute you wake up from nap */
- mfspr r1, SPRN_CTRLF
- ori r1, r1, 1
- mtspr SPRN_CTRLT, r1
+ mfspr r0, SPRN_CTRLF
+ ori r0, r0, 1
+ mtspr SPRN_CTRLT, r0
ld r2,PACATOC(r13)
@@ -286,26 +327,21 @@ kvm_secondary_got_guest:
ld r6, PACA_DSCR(r13)
std r6, HSTATE_DSCR(r13)
+ /* Order load of vcore, ptid etc. after load of vcpu */
+ lwsync
bl kvmppc_hv_entry
/* Back from the guest, go back to nap */
/* Clear our vcpu pointer so we don't come back in early */
li r0, 0
- std r0, HSTATE_KVM_VCPU(r13)
/*
- * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing
- * the nap_count, because once the increment to nap_count is
- * visible we could be given another vcpu.
+ * Once we clear HSTATE_KVM_VCPU(r13), the code in
+ * kvmppc_run_core() is going to assume that all our vcpu
+ * state is visible in memory. This lwsync makes sure
+ * that that is true.
*/
lwsync
-
- /* increment the nap count and then go to nap mode */
- ld r4, HSTATE_KVM_VCORE(r13)
- addi r4, r4, VCORE_NAP_COUNT
-51: lwarx r3, 0, r4
- addi r3, r3, 1
- stwcx. r3, 0, r4
- bne 51b
+ std r0, HSTATE_KVM_VCPU(r13)
/*
* At this point we have finished executing in the guest.
@@ -376,6 +412,14 @@ kvmppc_hv_entry:
li r6, KVM_GUEST_MODE_HOST_HV
stb r6, HSTATE_IN_GUEST(r13)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ /* Store initial timestamp */
+ cmpdi r4, 0
+ beq 1f
+ addi r3, r4, VCPU_TB_RMENTRY
+ bl kvmhv_start_timing
+1:
+#endif
/* Clear out SLB */
li r6,0
slbmte r6,r6
@@ -387,21 +431,23 @@ kvmppc_hv_entry:
* We don't have to lock against concurrent tlbies,
* but we do have to coordinate across hardware threads.
*/
- /* Increment entry count iff exit count is zero. */
- ld r5,HSTATE_KVM_VCORE(r13)
- addi r9,r5,VCORE_ENTRY_EXIT
-21: lwarx r3,0,r9
- cmpwi r3,0x100 /* any threads starting to exit? */
+ /* Set bit in entry map iff exit map is zero. */
+ ld r5, HSTATE_KVM_VCORE(r13)
+ li r7, 1
+ lbz r6, HSTATE_PTID(r13)
+ sld r7, r7, r6
+ addi r9, r5, VCORE_ENTRY_EXIT
+21: lwarx r3, 0, r9
+ cmpwi r3, 0x100 /* any threads starting to exit? */
bge secondary_too_late /* if so we're too late to the party */
- addi r3,r3,1
- stwcx. r3,0,r9
+ or r3, r3, r7
+ stwcx. r3, 0, r9
bne 21b
/* Primary thread switches to guest partition. */
ld r9,VCORE_KVM(r5) /* pointer to struct kvm */
- lbz r6,HSTATE_PTID(r13)
cmpwi r6,0
- bne 20f
+ bne 10f
ld r6,KVM_SDR1(r9)
lwz r7,KVM_LPID(r9)
li r0,LPID_RSVD /* switch to reserved LPID */
@@ -472,28 +518,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r0,1
stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */
- b 10f
-
- /* Secondary threads wait for primary to have done partition switch */
-20: lbz r0,VCORE_IN_GUEST(r5)
- cmpwi r0,0
- beq 20b
-
- /* Set LPCR and RMOR. */
-10: ld r8,VCORE_LPCR(r5)
- mtspr SPRN_LPCR,r8
- ld r8,KVM_RMOR(r9)
- mtspr SPRN_RMOR,r8
- isync
-
- /* Check if HDEC expires soon */
- mfspr r3,SPRN_HDEC
- cmpwi r3,512 /* 1 microsecond */
- li r12,BOOK3S_INTERRUPT_HV_DECREMENTER
- blt hdec_soon
/* Do we have a guest vcpu to run? */
- cmpdi r4, 0
+10: cmpdi r4, 0
beq kvmppc_primary_no_guest
kvmppc_got_guest:
@@ -818,6 +845,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
clrrdi r6,r6,1
mtspr SPRN_CTRLT,r6
4:
+ /* Secondary threads wait for primary to have done partition switch */
+ ld r5, HSTATE_KVM_VCORE(r13)
+ lbz r6, HSTATE_PTID(r13)
+ cmpwi r6, 0
+ beq 21f
+ lbz r0, VCORE_IN_GUEST(r5)
+ cmpwi r0, 0
+ bne 21f
+ HMT_LOW
+20: lbz r0, VCORE_IN_GUEST(r5)
+ cmpwi r0, 0
+ beq 20b
+ HMT_MEDIUM
+21:
+ /* Set LPCR. */
+ ld r8,VCORE_LPCR(r5)
+ mtspr SPRN_LPCR,r8
+ isync
+
+ /* Check if HDEC expires soon */
+ mfspr r3, SPRN_HDEC
+ cmpwi r3, 512 /* 1 microsecond */
+ blt hdec_soon
+
ld r6, VCPU_CTR(r4)
lwz r7, VCPU_XER(r4)
@@ -880,6 +931,12 @@ fast_guest_return:
li r9, KVM_GUEST_MODE_GUEST_HV
stb r9, HSTATE_IN_GUEST(r13)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ /* Accumulate timing */
+ addi r3, r4, VCPU_TB_GUEST
+ bl kvmhv_accumulate_time
+#endif
+
/* Enter guest */
BEGIN_FTR_SECTION
@@ -917,6 +974,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
hrfid
b .
+secondary_too_late:
+ li r12, 0
+ cmpdi r4, 0
+ beq 11f
+ stw r12, VCPU_TRAP(r4)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ addi r3, r4, VCPU_TB_RMEXIT
+ bl kvmhv_accumulate_time
+#endif
+11: b kvmhv_switch_to_host
+
+hdec_soon:
+ li r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+ stw r12, VCPU_TRAP(r4)
+ mr r9, r4
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ addi r3, r4, VCPU_TB_RMEXIT
+ bl kvmhv_accumulate_time
+#endif
+ b guest_exit_cont
+
/******************************************************************************
* *
* Exit code *
@@ -1002,6 +1080,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
stw r12,VCPU_TRAP(r9)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ addi r3, r9, VCPU_TB_RMINTR
+ mr r4, r9
+ bl kvmhv_accumulate_time
+ ld r5, VCPU_GPR(R5)(r9)
+ ld r6, VCPU_GPR(R6)(r9)
+ ld r7, VCPU_GPR(R7)(r9)
+ ld r8, VCPU_GPR(R8)(r9)
+#endif
+
/* Save HEIR (HV emulation assist reg) in emul_inst
if this is an HEI (HV emulation interrupt, e40) */
li r3,KVM_INST_FETCH_FAILED
@@ -1028,34 +1116,37 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
bne 2f
mfspr r3,SPRN_HDEC
cmpwi r3,0
- bge ignore_hdec
+ mr r4,r9
+ bge fast_guest_return
2:
/* See if this is an hcall we can handle in real mode */
cmpwi r12,BOOK3S_INTERRUPT_SYSCALL
beq hcall_try_real_mode
+ /* Hypervisor doorbell - exit only if host IPI flag set */
+ cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
+ bne 3f
+ lbz r0, HSTATE_HOST_IPI(r13)
+ cmpwi r0, 0
+ beq 4f
+ b guest_exit_cont
+3:
/* External interrupt ? */
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
- bne+ ext_interrupt_to_host
+ bne+ guest_exit_cont
/* External interrupt, first check for host_ipi. If this is
* set, we know the host wants us out so let's do it now
*/
bl kvmppc_read_intr
cmpdi r3, 0
- bgt ext_interrupt_to_host
+ bgt guest_exit_cont
/* Check if any CPU is heading out to the host, if so head out too */
- ld r5, HSTATE_KVM_VCORE(r13)
+4: ld r5, HSTATE_KVM_VCORE(r13)
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
- bge ext_interrupt_to_host
-
- /* Return to guest after delivering any pending interrupt */
mr r4, r9
- b deliver_guest_interrupt
-
-ext_interrupt_to_host:
+ blt deliver_guest_interrupt
guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
/* Save more register state */
@@ -1065,7 +1156,7 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
stw r7, VCPU_DSISR(r9)
/* don't overwrite fault_dar/fault_dsisr if HDSI */
cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
- beq 6f
+ beq mc_cont
std r6, VCPU_FAULT_DAR(r9)
stw r7, VCPU_FAULT_DSISR(r9)
@@ -1073,9 +1164,20 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
beq machine_check_realmode
mc_cont:
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ addi r3, r9, VCPU_TB_RMEXIT
+ mr r4, r9
+ bl kvmhv_accumulate_time
+#endif
+
+ /* Increment exit count, poke other threads to exit */
+ bl kvmhv_commence_exit
+ nop
+ ld r9, HSTATE_KVM_VCPU(r13)
+ lwz r12, VCPU_TRAP(r9)
/* Save guest CTRL register, set runlatch to 1 */
-6: mfspr r6,SPRN_CTRLF
+ mfspr r6,SPRN_CTRLF
stw r6,VCPU_CTRL(r9)
andi. r0,r6,1
bne 4f
@@ -1417,68 +1519,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
slbia
ptesync
-hdec_soon: /* r12 = trap, r13 = paca */
/*
* POWER7/POWER8 guest -> host partition switch code.
* We don't have to lock against tlbies but we do
* have to coordinate the hardware threads.
*/
- /* Increment the threads-exiting-guest count in the 0xff00
- bits of vcore->entry_exit_count */
- ld r5,HSTATE_KVM_VCORE(r13)
- addi r6,r5,VCORE_ENTRY_EXIT
-41: lwarx r3,0,r6
- addi r0,r3,0x100
- stwcx. r0,0,r6
- bne 41b
- isync /* order stwcx. vs. reading napping_threads */
-
- /*
- * At this point we have an interrupt that we have to pass
- * up to the kernel or qemu; we can't handle it in real mode.
- * Thus we have to do a partition switch, so we have to
- * collect the other threads, if we are the first thread
- * to take an interrupt. To do this, we set the HDEC to 0,
- * which causes an HDEC interrupt in all threads within 2ns
- * because the HDEC register is shared between all 4 threads.
- * However, we don't need to bother if this is an HDEC
- * interrupt, since the other threads will already be on their
- * way here in that case.
- */
- cmpwi r3,0x100 /* Are we the first here? */
- bge 43f
- cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
- beq 40f
- li r0,0
- mtspr SPRN_HDEC,r0
-40:
- /*
- * Send an IPI to any napping threads, since an HDEC interrupt
- * doesn't wake CPUs up from nap.
- */
- lwz r3,VCORE_NAPPING_THREADS(r5)
- lbz r4,HSTATE_PTID(r13)
- li r0,1
- sld r0,r0,r4
- andc. r3,r3,r0 /* no sense IPI'ing ourselves */
- beq 43f
- /* Order entry/exit update vs. IPIs */
- sync
- mulli r4,r4,PACA_SIZE /* get paca for thread 0 */
- subf r6,r4,r13
-42: andi. r0,r3,1
- beq 44f
- ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */
- li r0,IPI_PRIORITY
- li r7,XICS_MFRR
- stbcix r0,r7,r8 /* trigger the IPI */
-44: srdi. r3,r3,1
- addi r6,r6,PACA_SIZE
- bne 42b
-
-secondary_too_late:
+kvmhv_switch_to_host:
/* Secondary threads wait for primary to do partition switch */
-43: ld r5,HSTATE_KVM_VCORE(r13)
+ ld r5,HSTATE_KVM_VCORE(r13)
ld r4,VCORE_KVM(r5) /* pointer to struct kvm */
lbz r3,HSTATE_PTID(r13)
cmpwi r3,0
@@ -1562,6 +1610,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
1: addi r8,r8,16
.endr
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ /* Finish timing, if we have a vcpu */
+ ld r4, HSTATE_KVM_VCPU(r13)
+ cmpdi r4, 0
+ li r3, 0
+ beq 2f
+ bl kvmhv_accumulate_time
+2:
+#endif
/* Unset guest mode */
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
@@ -1696,8 +1753,10 @@ kvmppc_hisi:
* Returns to the guest if we handle it, or continues on up to
* the kernel if we can't (i.e. if we don't have a handler for
* it, or if the handler returns H_TOO_HARD).
+ *
+ * r5 - r8 contain hcall args,
+ * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca
*/
- .globl hcall_try_real_mode
hcall_try_real_mode:
ld r3,VCPU_GPR(R3)(r9)
andi. r0,r11,MSR_PR
@@ -1839,13 +1898,124 @@ hcall_real_table:
.long 0 /* 0x12c */
.long 0 /* 0x130 */
.long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
+ .long 0 /* 0x138 */
+ .long 0 /* 0x13c */
+ .long 0 /* 0x140 */
+ .long 0 /* 0x144 */
+ .long 0 /* 0x148 */
+ .long 0 /* 0x14c */
+ .long 0 /* 0x150 */
+ .long 0 /* 0x154 */
+ .long 0 /* 0x158 */
+ .long 0 /* 0x15c */
+ .long 0 /* 0x160 */
+ .long 0 /* 0x164 */
+ .long 0 /* 0x168 */
+ .long 0 /* 0x16c */
+ .long 0 /* 0x170 */
+ .long 0 /* 0x174 */
+ .long 0 /* 0x178 */
+ .long 0 /* 0x17c */
+ .long 0 /* 0x180 */
+ .long 0 /* 0x184 */
+ .long 0 /* 0x188 */
+ .long 0 /* 0x18c */
+ .long 0 /* 0x190 */
+ .long 0 /* 0x194 */
+ .long 0 /* 0x198 */
+ .long 0 /* 0x19c */
+ .long 0 /* 0x1a0 */
+ .long 0 /* 0x1a4 */
+ .long 0 /* 0x1a8 */
+ .long 0 /* 0x1ac */
+ .long 0 /* 0x1b0 */
+ .long 0 /* 0x1b4 */
+ .long 0 /* 0x1b8 */
+ .long 0 /* 0x1bc */
+ .long 0 /* 0x1c0 */
+ .long 0 /* 0x1c4 */
+ .long 0 /* 0x1c8 */
+ .long 0 /* 0x1cc */
+ .long 0 /* 0x1d0 */
+ .long 0 /* 0x1d4 */
+ .long 0 /* 0x1d8 */
+ .long 0 /* 0x1dc */
+ .long 0 /* 0x1e0 */
+ .long 0 /* 0x1e4 */
+ .long 0 /* 0x1e8 */
+ .long 0 /* 0x1ec */
+ .long 0 /* 0x1f0 */
+ .long 0 /* 0x1f4 */
+ .long 0 /* 0x1f8 */
+ .long 0 /* 0x1fc */
+ .long 0 /* 0x200 */
+ .long 0 /* 0x204 */
+ .long 0 /* 0x208 */
+ .long 0 /* 0x20c */
+ .long 0 /* 0x210 */
+ .long 0 /* 0x214 */
+ .long 0 /* 0x218 */
+ .long 0 /* 0x21c */
+ .long 0 /* 0x220 */
+ .long 0 /* 0x224 */
+ .long 0 /* 0x228 */
+ .long 0 /* 0x22c */
+ .long 0 /* 0x230 */
+ .long 0 /* 0x234 */
+ .long 0 /* 0x238 */
+ .long 0 /* 0x23c */
+ .long 0 /* 0x240 */
+ .long 0 /* 0x244 */
+ .long 0 /* 0x248 */
+ .long 0 /* 0x24c */
+ .long 0 /* 0x250 */
+ .long 0 /* 0x254 */
+ .long 0 /* 0x258 */
+ .long 0 /* 0x25c */
+ .long 0 /* 0x260 */
+ .long 0 /* 0x264 */
+ .long 0 /* 0x268 */
+ .long 0 /* 0x26c */
+ .long 0 /* 0x270 */
+ .long 0 /* 0x274 */
+ .long 0 /* 0x278 */
+ .long 0 /* 0x27c */
+ .long 0 /* 0x280 */
+ .long 0 /* 0x284 */
+ .long 0 /* 0x288 */
+ .long 0 /* 0x28c */
+ .long 0 /* 0x290 */
+ .long 0 /* 0x294 */
+ .long 0 /* 0x298 */
+ .long 0 /* 0x29c */
+ .long 0 /* 0x2a0 */
+ .long 0 /* 0x2a4 */
+ .long 0 /* 0x2a8 */
+ .long 0 /* 0x2ac */
+ .long 0 /* 0x2b0 */
+ .long 0 /* 0x2b4 */
+ .long 0 /* 0x2b8 */
+ .long 0 /* 0x2bc */
+ .long 0 /* 0x2c0 */
+ .long 0 /* 0x2c4 */
+ .long 0 /* 0x2c8 */
+ .long 0 /* 0x2cc */
+ .long 0 /* 0x2d0 */
+ .long 0 /* 0x2d4 */
+ .long 0 /* 0x2d8 */
+ .long 0 /* 0x2dc */
+ .long 0 /* 0x2e0 */
+ .long 0 /* 0x2e4 */
+ .long 0 /* 0x2e8 */
+ .long 0 /* 0x2ec */
+ .long 0 /* 0x2f0 */
+ .long 0 /* 0x2f4 */
+ .long 0 /* 0x2f8 */
+ .long 0 /* 0x2fc */
+ .long DOTSYM(kvmppc_h_random) - hcall_real_table
.globl hcall_real_table_end
hcall_real_table_end:
-ignore_hdec:
- mr r4,r9
- b fast_guest_return
-
_GLOBAL(kvmppc_h_set_xdabr)
andi. r0, r5, DABRX_USER | DABRX_KERNEL
beq 6f
@@ -1884,7 +2054,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 0
blr
-_GLOBAL(kvmppc_h_cede)
+_GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
ori r11,r11,MSR_EE
std r11,VCPU_MSR(r3)
li r0,1
@@ -1893,8 +2063,8 @@ _GLOBAL(kvmppc_h_cede)
lbz r5,VCPU_PRODDED(r3)
cmpwi r5,0
bne kvm_cede_prodded
- li r0,0 /* set trap to 0 to say hcall is handled */
- stw r0,VCPU_TRAP(r3)
+ li r12,0 /* set trap to 0 to say hcall is handled */
+ stw r12,VCPU_TRAP(r3)
li r0,H_SUCCESS
std r0,VCPU_GPR(R3)(r3)
@@ -1912,12 +2082,11 @@ _GLOBAL(kvmppc_h_cede)
addi r6,r5,VCORE_NAPPING_THREADS
31: lwarx r4,0,r6
or r4,r4,r0
- PPC_POPCNTW(R7,R4)
- cmpw r7,r8
- bge kvm_cede_exit
+ cmpw r4,r8
+ beq kvm_cede_exit
stwcx. r4,0,r6
bne 31b
- /* order napping_threads update vs testing entry_exit_count */
+ /* order napping_threads update vs testing entry_exit_map */
isync
li r0,NAPPING_CEDE
stb r0,HSTATE_NAPPING(r13)
@@ -1955,21 +2124,52 @@ _GLOBAL(kvmppc_h_cede)
bl kvmppc_save_fp
/*
+ * Set DEC to the smaller of DEC and HDEC, so that we wake
+ * no later than the end of our timeslice (HDEC interrupts
+ * don't wake us from nap).
+ */
+ mfspr r3, SPRN_DEC
+ mfspr r4, SPRN_HDEC
+ mftb r5
+ cmpw r3, r4
+ ble 67f
+ mtspr SPRN_DEC, r4
+67:
+ /* save expiry time of guest decrementer */
+ extsw r3, r3
+ add r3, r3, r5
+ ld r4, HSTATE_KVM_VCPU(r13)
+ ld r5, HSTATE_KVM_VCORE(r13)
+ ld r6, VCORE_TB_OFFSET(r5)
+ subf r3, r6, r3 /* convert to host TB value */
+ std r3, VCPU_DEC_EXPIRES(r4)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ ld r4, HSTATE_KVM_VCPU(r13)
+ addi r3, r4, VCPU_TB_CEDE
+ bl kvmhv_accumulate_time
+#endif
+
+ lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */
+
+ /*
 * Take a nap until a decrementer, external or doorbell interrupt
- * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the
- * runlatch bit before napping.
+ * occurs, with PECE1 and PECE0 set in LPCR.
+ * On POWER8, set PECEDH, and if we are ceding, also set PECEDP.
+ * Also clear the runlatch bit before napping.
*/
kvm_do_nap:
- mfspr r2, SPRN_CTRLF
- clrrdi r2, r2, 1
- mtspr SPRN_CTRLT, r2
+ mfspr r0, SPRN_CTRLF
+ clrrdi r0, r0, 1
+ mtspr SPRN_CTRLT, r0
li r0,1
stb r0,HSTATE_HWTHREAD_REQ(r13)
mfspr r5,SPRN_LPCR
ori r5,r5,LPCR_PECE0 | LPCR_PECE1
BEGIN_FTR_SECTION
- oris r5,r5,LPCR_PECEDP@h
+ ori r5, r5, LPCR_PECEDH
+ rlwimi r5, r3, 0, LPCR_PECEDP
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_LPCR,r5
isync
@@ -1994,9 +2194,23 @@ kvm_end_cede:
/* Woken by external or decrementer interrupt */
ld r1, HSTATE_HOST_R1(r13)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+ addi r3, r4, VCPU_TB_RMINTR
+ bl kvmhv_accumulate_time
+#endif
+
/* load up FP state */
bl kvmppc_load_fp
+ /* Restore guest decrementer */
+ ld r3, VCPU_DEC_EXPIRES(r4)
+ ld r5, HSTATE_KVM_VCORE(r13)
+ ld r6, VCORE_TB_OFFSET(r5)
+ add r3, r3, r6 /* convert host TB to guest TB value */
+ mftb r7
+ subf r3, r7, r3
+ mtspr SPRN_DEC, r3
+
/* Load NV GPRS */
ld r14, VCPU_GPR(R14)(r4)
ld r15, VCPU_GPR(R15)(r4)
@@ -2057,7 +2271,8 @@ kvm_cede_prodded:
/* we've ceded but we want to give control to the host */
kvm_cede_exit:
- b hcall_real_fallback
+ ld r9, HSTATE_KVM_VCPU(r13)
+ b guest_exit_cont
/* Try to handle a machine check in real mode */
machine_check_realmode:
@@ -2089,13 +2304,14 @@ machine_check_realmode:
/*
* Check the reason we woke from nap, and take appropriate action.
- * Returns:
+ * Returns (in r3):
* 0 if nothing needs to be done
* 1 if something happened that needs to be handled by the host
- * -1 if there was a guest wakeup (IPI)
+ * -1 if there was a guest wakeup (IPI or msgsnd)
*
* Also sets r12 to the interrupt vector for any interrupt that needs
* to be handled now by the host (0x500 for external interrupt), or zero.
+ * Modifies r0, r6, r7, r8.
*/
kvmppc_check_wake_reason:
mfspr r6, SPRN_SRR1
@@ -2122,7 +2338,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* hypervisor doorbell */
3: li r12, BOOK3S_INTERRUPT_H_DOORBELL
+ /* see if it's a host IPI */
li r3, 1
+ lbz r0, HSTATE_HOST_IPI(r13)
+ cmpwi r0, 0
+ bnelr
+ /* if not, clear it and return -1 */
+ lis r6, (PPC_DBELL_SERVER << (63-36))@h
+ PPC_MSGCLR(6)
+ li r3, -1
blr
/*
@@ -2131,6 +2355,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* 0 if no interrupt is pending
* 1 if an interrupt is pending that needs to be handled by the host
* -1 if there was a guest wakeup IPI (which has now been cleared)
+ * Modifies r0, r6, r7, r8, returns value in r3.
*/
kvmppc_read_intr:
/* see if a host IPI is pending */
@@ -2185,6 +2410,7 @@ kvmppc_read_intr:
bne- 43f
/* OK, it's an IPI for us */
+ li r12, 0
li r3, -1
1: blr
@@ -2314,3 +2540,62 @@ kvmppc_fix_pmao:
mtspr SPRN_PMC6, r3
isync
blr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+/*
+ * Start timing an activity
+ * r3 = pointer to time accumulation struct, r4 = vcpu
+ */
+kvmhv_start_timing:
+ ld r5, HSTATE_KVM_VCORE(r13)
+ lbz r6, VCORE_IN_GUEST(r5)
+ cmpwi r6, 0
+ beq 5f /* if in guest, need to */
+ ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */
+5: mftb r5
+ subf r5, r6, r5
+ std r3, VCPU_CUR_ACTIVITY(r4)
+ std r5, VCPU_ACTIVITY_START(r4)
+ blr
+
+/*
+ * Accumulate time to one activity and start another.
+ * r3 = pointer to new time accumulation struct, r4 = vcpu
+ */
+kvmhv_accumulate_time:
+ ld r5, HSTATE_KVM_VCORE(r13)
+ lbz r8, VCORE_IN_GUEST(r5)
+ cmpwi r8, 0
+ beq 4f /* if in guest, need to */
+ ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */
+4: ld r5, VCPU_CUR_ACTIVITY(r4)
+ ld r6, VCPU_ACTIVITY_START(r4)
+ std r3, VCPU_CUR_ACTIVITY(r4)
+ mftb r7
+ subf r7, r8, r7
+ std r7, VCPU_ACTIVITY_START(r4)
+ cmpdi r5, 0
+ beqlr
+ subf r3, r6, r7
+ ld r8, TAS_SEQCOUNT(r5)
+ cmpdi r8, 0
+ addi r8, r8, 1
+ std r8, TAS_SEQCOUNT(r5)
+ lwsync
+ ld r7, TAS_TOTAL(r5)
+ add r7, r7, r3
+ std r7, TAS_TOTAL(r5)
+ ld r6, TAS_MIN(r5)
+ ld r7, TAS_MAX(r5)
+ beq 3f
+ cmpd r3, r6
+ bge 1f
+3: std r3, TAS_MIN(r5)
+1: cmpd r3, r7
+ ble 2f
+ std r3, TAS_MAX(r5)
+2: lwsync
+ addi r8, r8, 1
+ std r8, TAS_SEQCOUNT(r5)
+ blr
+#endif
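kvmhv_start_timing and kvmhv_accumulate_time above implement, in assembly, a single-writer seqcount around a total/min/max accumulator so a reader can detect a torn snapshot: the count is bumped to an odd value before the update and back to even afterwards. The C sketch below models the writer side only; the field names echo the TAS_* offsets, but the struct is invented for the example and the lwsync barrier pairs are reduced to comments.

#include <stdint.h>
#include <stdio.h>

struct time_acc {
	uint64_t seqcount;	/* odd while an update is in flight */
	uint64_t total;
	uint64_t min;
	uint64_t max;
};

/* Single-writer update; the assembly brackets the same steps with lwsync
 * so a concurrent reader sees either the old or the new snapshot. */
static void accumulate(struct time_acc *tas, uint64_t delta)
{
	int first_sample = (tas->seqcount == 0);

	tas->seqcount++;		/* now odd: readers must retry */
	tas->total += delta;
	if (first_sample || delta < tas->min)
		tas->min = delta;
	if (delta > tas->max)
		tas->max = delta;
	tas->seqcount++;		/* even again: snapshot is stable */
}

int main(void)
{
	struct time_acc tas = { 0 };

	accumulate(&tas, 120);
	accumulate(&tas, 80);
	accumulate(&tas, 300);
	printf("total=%llu min=%llu max=%llu\n",
	       (unsigned long long)tas.total,
	       (unsigned long long)tas.min,
	       (unsigned long long)tas.max);
	return 0;
}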
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index ce3c893d509b..f2c75a1e0536 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -258,6 +258,28 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
return EMULATE_DONE;
}
+static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+ long rc;
+
+ rc = kvmppc_h_logical_ci_load(vcpu);
+ if (rc == H_TOO_HARD)
+ return EMULATE_FAIL;
+ kvmppc_set_gpr(vcpu, 3, rc);
+ return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+ long rc;
+
+ rc = kvmppc_h_logical_ci_store(vcpu);
+ if (rc == H_TOO_HARD)
+ return EMULATE_FAIL;
+ kvmppc_set_gpr(vcpu, 3, rc);
+ return EMULATE_DONE;
+}
+
static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
{
long rc = kvmppc_xics_hcall(vcpu, cmd);
@@ -290,6 +312,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
vcpu->stat.halt_wakeup++;
return EMULATE_DONE;
+ case H_LOGICAL_CI_LOAD:
+ return kvmppc_h_pr_logical_ci_load(vcpu);
+ case H_LOGICAL_CI_STORE:
+ return kvmppc_h_pr_logical_ci_store(vcpu);
case H_XIRR:
case H_CPPR:
case H_EOI:
@@ -323,6 +349,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
case H_BULK_REMOVE:
case H_PUT_TCE:
case H_CEDE:
+ case H_LOGICAL_CI_LOAD:
+ case H_LOGICAL_CI_STORE:
#ifdef CONFIG_KVM_XICS
case H_XIRR:
case H_CPPR:
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index a4a8d9f0dcb7..c6ca7db64673 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -12,6 +12,7 @@
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/anon_inodes.h>
+#include <linux/spinlock.h>
#include <asm/uaccess.h>
#include <asm/kvm_book3s.h>
@@ -39,7 +40,7 @@
* LOCKING
* =======
*
- * Each ICS has a mutex protecting the information about the IRQ
+ * Each ICS has a spin lock protecting the information about the IRQ
 * sources and avoiding simultaneous deliveries of the same interrupt.
*
* ICP operations are done via a single compare & swap transaction
@@ -109,7 +110,10 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
{
int i;
- mutex_lock(&ics->lock);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ arch_spin_lock(&ics->lock);
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *state = &ics->irq_state[i];
@@ -120,12 +124,15 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
XICS_DBG("resend %#x prio %#x\n", state->number,
state->priority);
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
icp_deliver_irq(xics, icp, state->number);
- mutex_lock(&ics->lock);
+ local_irq_save(flags);
+ arch_spin_lock(&ics->lock);
}
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
}
static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -133,8 +140,10 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
u32 server, u32 priority, u32 saved_priority)
{
bool deliver;
+ unsigned long flags;
- mutex_lock(&ics->lock);
+ local_irq_save(flags);
+ arch_spin_lock(&ics->lock);
state->server = server;
state->priority = priority;
@@ -145,7 +154,8 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
deliver = true;
}
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
return deliver;
}
@@ -186,6 +196,7 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
struct kvmppc_ics *ics;
struct ics_irq_state *state;
u16 src;
+ unsigned long flags;
if (!xics)
return -ENODEV;
@@ -195,10 +206,12 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
return -EINVAL;
state = &ics->irq_state[src];
- mutex_lock(&ics->lock);
+ local_irq_save(flags);
+ arch_spin_lock(&ics->lock);
*server = state->server;
*priority = state->priority;
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
return 0;
}
@@ -365,6 +378,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
struct kvmppc_ics *ics;
u32 reject;
u16 src;
+ unsigned long flags;
/*
* This is used both for initial delivery of an interrupt and
@@ -391,7 +405,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
state = &ics->irq_state[src];
/* Get a lock on the ICS */
- mutex_lock(&ics->lock);
+ local_irq_save(flags);
+ arch_spin_lock(&ics->lock);
/* Get our server */
if (!icp || state->server != icp->server_num) {
@@ -434,7 +449,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*
* Note that if successful, the new delivery might have itself
* rejected an interrupt that was "delivered" before we took the
- * icp mutex.
+ * ics spin lock.
*
* In this case we do the whole sequence all over again for the
* new guy. We cannot assume that the rejected interrupt is less
@@ -448,7 +463,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
* Delivery was successful, did we reject somebody else ?
*/
if (reject && reject != XICS_IPI) {
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
new_irq = reject;
goto again;
}
@@ -468,12 +484,14 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
smp_mb();
if (!icp->state.need_resend) {
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
goto again;
}
}
out:
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
}
static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
@@ -802,14 +820,22 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
- if (icp->rm_action & XICS_RM_KICK_VCPU)
+ if (icp->rm_action & XICS_RM_KICK_VCPU) {
+ icp->n_rm_kick_vcpu++;
kvmppc_fast_vcpu_kick(icp->rm_kick_target);
- if (icp->rm_action & XICS_RM_CHECK_RESEND)
+ }
+ if (icp->rm_action & XICS_RM_CHECK_RESEND) {
+ icp->n_rm_check_resend++;
icp_check_resend(xics, icp->rm_resend_icp);
- if (icp->rm_action & XICS_RM_REJECT)
+ }
+ if (icp->rm_action & XICS_RM_REJECT) {
+ icp->n_rm_reject++;
icp_deliver_irq(xics, icp, icp->rm_reject);
- if (icp->rm_action & XICS_RM_NOTIFY_EOI)
+ }
+ if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
+ icp->n_rm_notify_eoi++;
kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
+ }
icp->rm_action = 0;
@@ -872,10 +898,21 @@ static int xics_debug_show(struct seq_file *m, void *private)
struct kvm *kvm = xics->kvm;
struct kvm_vcpu *vcpu;
int icsid, i;
+ unsigned long flags;
+ unsigned long t_rm_kick_vcpu, t_rm_check_resend;
+ unsigned long t_rm_reject, t_rm_notify_eoi;
+ unsigned long t_reject, t_check_resend;
if (!kvm)
return 0;
+ t_rm_kick_vcpu = 0;
+ t_rm_notify_eoi = 0;
+ t_rm_check_resend = 0;
+ t_rm_reject = 0;
+ t_check_resend = 0;
+ t_reject = 0;
+
seq_printf(m, "=========\nICP state\n=========\n");
kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -890,8 +927,19 @@ static int xics_debug_show(struct seq_file *m, void *private)
icp->server_num, state.xisr,
state.pending_pri, state.cppr, state.mfrr,
state.out_ee, state.need_resend);
+ t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
+ t_rm_notify_eoi += icp->n_rm_notify_eoi;
+ t_rm_check_resend += icp->n_rm_check_resend;
+ t_rm_reject += icp->n_rm_reject;
+ t_check_resend += icp->n_check_resend;
+ t_reject += icp->n_reject;
}
+ seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+ t_rm_kick_vcpu, t_rm_check_resend,
+ t_rm_reject, t_rm_notify_eoi);
+ seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
+ t_check_resend, t_reject);
for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
struct kvmppc_ics *ics = xics->ics[icsid];
@@ -901,7 +949,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
icsid);
- mutex_lock(&ics->lock);
+ local_irq_save(flags);
+ arch_spin_lock(&ics->lock);
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *irq = &ics->irq_state[i];
@@ -912,7 +961,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
irq->resend, irq->masked_pending);
}
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
}
return 0;
}
@@ -965,7 +1015,6 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
if (!ics)
goto out;
- mutex_init(&ics->lock);
ics->icsid = icsid;
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
@@ -1107,13 +1156,15 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
u64 __user *ubufp = (u64 __user *) addr;
u16 idx;
u64 val, prio;
+ unsigned long flags;
ics = kvmppc_xics_find_ics(xics, irq, &idx);
if (!ics)
return -ENOENT;
irqp = &ics->irq_state[idx];
- mutex_lock(&ics->lock);
+ local_irq_save(flags);
+ arch_spin_lock(&ics->lock);
ret = -ENOENT;
if (irqp->exists) {
val = irqp->server;
@@ -1129,7 +1180,8 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
val |= KVM_XICS_PENDING;
ret = 0;
}
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
if (!ret && put_user(val, ubufp))
ret = -EFAULT;
@@ -1146,6 +1198,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
u64 val;
u8 prio;
u32 server;
+ unsigned long flags;
if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
return -ENOENT;
@@ -1166,7 +1219,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
kvmppc_xics_find_server(xics->kvm, server) == NULL)
return -EINVAL;
- mutex_lock(&ics->lock);
+ local_irq_save(flags);
+ arch_spin_lock(&ics->lock);
irqp->server = server;
irqp->saved_priority = prio;
if (val & KVM_XICS_MASKED)
@@ -1178,7 +1232,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
irqp->asserted = 1;
irqp->exists = 1;
- mutex_unlock(&ics->lock);
+ arch_spin_unlock(&ics->lock);
+ local_irq_restore(flags);
if (val & KVM_XICS_PENDING)
icp_deliver_irq(xics, NULL, irqp->number);
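Every ICS access above now follows the same two-step pattern: disable interrupts locally, then take the raw arch spinlock, so the lock is also safe to take from real-mode code that runs with interrupts off. A pair of helpers along the following lines would capture that pattern; this is a sketch, not part of the patch, and relies only on the struct kvmppc_ics lock field introduced in book3s_xics.h below.

/* Sketch only -- the repeated local_irq_save()/arch_spin_lock() pairs
 * above could be wrapped like this: */
static inline unsigned long ics_lock(struct kvmppc_ics *ics)
{
	unsigned long flags;

	local_irq_save(flags);		/* lock is also taken with irqs off */
	arch_spin_lock(&ics->lock);
	return flags;
}

static inline void ics_unlock(struct kvmppc_ics *ics, unsigned long flags)
{
	arch_spin_unlock(&ics->lock);
	local_irq_restore(flags);
}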
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 73f0f2723c07..56ea44f9867f 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -78,13 +78,22 @@ struct kvmppc_icp {
u32 rm_reject;
u32 rm_eoied_irq;
+ /* Counters for each reason we exited real mode */
+ unsigned long n_rm_kick_vcpu;
+ unsigned long n_rm_check_resend;
+ unsigned long n_rm_reject;
+ unsigned long n_rm_notify_eoi;
+ /* Counters for handling ICP processing in real mode */
+ unsigned long n_check_resend;
+ unsigned long n_reject;
+
/* Debug stuff for real mode */
union kvmppc_icp_state rm_dbgstate;
struct kvm_vcpu *rm_dbgtgt;
};
struct kvmppc_ics {
- struct mutex lock;
+ arch_spinlock_t lock;
u16 icsid;
struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
};
@@ -96,6 +105,8 @@ struct kvmppc_xics {
u32 max_icsid;
bool real_mode;
bool real_mode_dbg;
+ u32 err_noics;
+ u32 err_noicp;
struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
};
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index cc536d4a75ef..4d33e199edcc 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -338,6 +338,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
pte_t *ptep;
unsigned int wimg = 0;
pgd_t *pgdir;
+ unsigned long flags;
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
@@ -468,15 +469,28 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
pgdir = vcpu_e500->vcpu.arch.pgdir;
- ptep = lookup_linux_ptep(pgdir, hva, &tsize_pages);
- if (pte_present(*ptep))
- wimg = (*ptep >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
- else {
- if (printk_ratelimit())
- pr_err("%s: pte not present: gfn %lx, pfn %lx\n",
- __func__, (long)gfn, pfn);
- ret = -EINVAL;
- goto out;
+ /*
+ * We are just looking at the wimg bits, so we don't
+ * care much about the trans splitting bit.
+ * We are holding kvm->mmu_lock, so a notifier invalidate
+ * can't run and hence the pfn won't change.
+ */
+ local_irq_save(flags);
+ ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL);
+ if (ptep) {
+ pte_t pte = READ_ONCE(*ptep);
+
+ if (pte_present(pte)) {
+ wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
+ MAS2_WIMGE_MASK;
+ local_irq_restore(flags);
+ } else {
+ local_irq_restore(flags);
+ pr_err_ratelimited("%s: pte not present: gfn %lx, pfn %lx\n",
+ __func__, (long)gfn, pfn);
+ ret = -EINVAL;
+ goto out;
+ }
}
kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
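The e500 shadow-map change above reads the Linux PTE exactly once with READ_ONCE() and derives the WIMG attributes from that snapshot, so a concurrent PTE update cannot hand back a mix of old and new bits. A tiny stand-alone model of that snapshot-then-derive pattern follows; the bit positions and mask values are invented for the example.

#include <stdint.h>
#include <stdio.h>

#define PTE_PRESENT	(1u << 0)
#define PTE_WIMGE_SHIFT	3
#define MAS2_WIMGE_MASK	0x1fu

static volatile uint32_t pte_slot;	/* stands in for *ptep */

static int read_wimg(unsigned int *wimg)
{
	uint32_t pte = pte_slot;	/* single snapshot, like READ_ONCE() */

	if (!(pte & PTE_PRESENT))
		return -1;
	*wimg = (pte >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK;
	return 0;
}

int main(void)
{
	unsigned int wimg = 0;

	pte_slot = PTE_PRESENT | (0x5u << PTE_WIMGE_SHIFT);
	printf("%d wimg=%#x\n", read_wimg(&wimg), wimg);
	return 0;
}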
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 91bbc845ac66..ac3ddf115f3d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -529,6 +529,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_PPC_RMA:
r = 0;
break;
+ case KVM_CAP_PPC_HWRNG:
+ r = kvmppc_hwrng_present();
+ break;
#endif
case KVM_CAP_SYNC_MMU:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 2c2022d16059..fda236f908eb 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1066,7 +1066,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
#endif /* CONFIG_PPC_64K_PAGES */
/* Get PTE and page size from page tables */
- ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
+ ptep = __find_linux_pte_or_hugepte(pgdir, ea, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
rc = 1;
@@ -1394,6 +1394,7 @@ tm_abort:
tm_abort(TM_CAUSE_TLBI);
}
#endif
+ return;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index fa9d5c238d22..0ce968b00b7c 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -109,7 +109,7 @@ int pgd_huge(pgd_t pgd)
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
/* Only called for hugetlbfs pages, hence can ignore THP */
- return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
+ return __find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
@@ -581,6 +581,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
pmd = pmd_offset(pud, start);
pud_clear(pud);
pmd_free_tlb(tlb, pmd, start);
+ mm_dec_nr_pmds(tlb->mm);
}
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -681,28 +682,35 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
} while (addr = next, addr != end);
}
+/*
+ * We are holding mmap_sem, so a parallel huge page collapse cannot run.
+ * To prevent hugepage split, disable irq.
+ */
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
pte_t *ptep;
struct page *page;
unsigned shift;
- unsigned long mask;
+ unsigned long mask, flags;
/*
* Transparent hugepages are handled by generic code. We can skip them
* here.
*/
+ local_irq_save(flags);
ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);
/* Verify it is a huge page else bail. */
- if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep))
+ if (!ptep || !shift || pmd_trans_huge(*(pmd_t *)ptep)) {
+ local_irq_restore(flags);
return ERR_PTR(-EINVAL);
-
+ }
mask = (1UL << shift) - 1;
page = pte_page(*ptep);
if (page)
page += (address & mask) / PAGE_SIZE;
+ local_irq_restore(flags);
return page;
}
@@ -949,9 +957,12 @@ void flush_dcache_icache_hugepage(struct page *page)
*
* So long as we atomically load page table pointers we are safe against teardown,
 * we can follow the address down to the page and take a ref on it.
+ * This function needs to be called with interrupts disabled. We use this
+ * variant when we have MSR[EE] = 0 but paca->soft_enabled = 1.
*/
-pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
+pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
+ unsigned *shift)
{
pgd_t pgd, *pgdp;
pud_t pud, *pudp;
@@ -1003,12 +1014,11 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
* A hugepage collapse is captured by pmd_none, because
 * it marks the pmd none and does an hpte invalidate.
*
- * A hugepage split is captured by pmd_trans_splitting
- * because we mark the pmd trans splitting and do a
- * hpte invalidate
- *
+ * We don't worry about pmd_trans_splitting here; the
+ * caller should check for the splitting case itself
+ * if it needs to handle it.
*/
- if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ if (pmd_none(pmd))
return NULL;
if (pmd_huge(pmd) || pmd_large(pmd)) {
@@ -1030,7 +1040,7 @@ out:
*shift = pdshift;
return ret_pte;
}
-EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte);
+EXPORT_SYMBOL_GPL(__find_linux_pte_or_hugepte);
int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
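find_linux_pte_or_hugepte() is now split into the __ prefixed worker exported above and a wrapper that callers with interrupts already disabled are expected to use. The wrapper lives in asm/pgtable.h, whose hunk is not part of this excerpt; a plausible shape for it is sketched below, with the debug warning being illustrative rather than the actual change.

/* Illustrative sketch of the asm/pgtable.h wrapper (not the actual hunk) */
static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
					       unsigned *shift)
{
	/* The lockless walk is only safe with interrupts already disabled */
	WARN_ON_ONCE(!arch_irqs_disabled());
	return __find_linux_pte_or_hugepte(pgdir, ea, shift);
}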
diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index ead55351b254..ff09cde20cd2 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -111,41 +111,45 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
* interrupt context, so if the access faults, we read the page tables
* to find which page (if any) is mapped and access it directly.
*/
-static int read_user_stack_slow(void __user *ptr, void *ret, int nb)
+static int read_user_stack_slow(void __user *ptr, void *buf, int nb)
{
+ int ret = -EFAULT;
pgd_t *pgdir;
pte_t *ptep, pte;
unsigned shift;
unsigned long addr = (unsigned long) ptr;
unsigned long offset;
- unsigned long pfn;
+ unsigned long pfn, flags;
void *kaddr;
pgdir = current->mm->pgd;
if (!pgdir)
return -EFAULT;
+ local_irq_save(flags);
ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift);
+ if (!ptep)
+ goto err_out;
if (!shift)
shift = PAGE_SHIFT;
/* align address to page boundary */
offset = addr & ((1UL << shift) - 1);
- addr -= offset;
- if (ptep == NULL)
- return -EFAULT;
- pte = *ptep;
+ pte = READ_ONCE(*ptep);
if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER))
- return -EFAULT;
+ goto err_out;
pfn = pte_pfn(pte);
if (!page_is_ram(pfn))
- return -EFAULT;
+ goto err_out;
/* no highmem to worry about here */
kaddr = pfn_to_kaddr(pfn);
- memcpy(ret, kaddr + offset, nb);
- return 0;
+ memcpy(buf, kaddr + offset, nb);
+ ret = 0;
+err_out:
+ local_irq_restore(flags);
+ return ret;
}
static int read_user_stack_64(unsigned long __user *ptr, unsigned long *ret)
diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 1a3429e1ccb5..1ba6307be4db 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -111,7 +111,7 @@ out:
static int
spufs_setattr(struct dentry *dentry, struct iattr *attr)
{
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = d_inode(dentry);
if ((attr->ia_valid & ATTR_SIZE) &&
(attr->ia_size != inode->i_size))
@@ -163,14 +163,14 @@ static void spufs_prune_dir(struct dentry *dir)
{
struct dentry *dentry, *tmp;
- mutex_lock(&dir->d_inode->i_mutex);
+ mutex_lock(&d_inode(dir)->i_mutex);
list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) {
spin_lock(&dentry->d_lock);
- if (!(d_unhashed(dentry)) && dentry->d_inode) {
+ if (!(d_unhashed(dentry)) && d_really_is_positive(dentry)) {
dget_dlock(dentry);
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
- simple_unlink(dir->d_inode, dentry);
+ simple_unlink(d_inode(dir), dentry);
/* XXX: what was dcache_lock protecting here? Other
* filesystems (IB, configfs) release dcache_lock
* before unlink */
@@ -180,7 +180,7 @@ static void spufs_prune_dir(struct dentry *dir)
}
}
shrink_dcache_parent(dir);
- mutex_unlock(&dir->d_inode->i_mutex);
+ mutex_unlock(&d_inode(dir)->i_mutex);
}
/* Caller must hold parent->i_mutex */
@@ -192,7 +192,7 @@ static int spufs_rmdir(struct inode *parent, struct dentry *dir)
d_drop(dir);
res = simple_rmdir(parent, dir);
/* We have to give up the mm_struct */
- spu_forget(SPUFS_I(dir->d_inode)->i_ctx);
+ spu_forget(SPUFS_I(d_inode(dir))->i_ctx);
return res;
}
@@ -222,8 +222,8 @@ static int spufs_dir_close(struct inode *inode, struct file *file)
int ret;
dir = file->f_path.dentry;
- parent = dir->d_parent->d_inode;
- ctx = SPUFS_I(dir->d_inode)->i_ctx;
+ parent = d_inode(dir->d_parent);
+ ctx = SPUFS_I(d_inode(dir))->i_ctx;
mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT);
ret = spufs_rmdir(parent, dir);
@@ -460,7 +460,7 @@ spufs_create_context(struct inode *inode, struct dentry *dentry,
goto out_aff_unlock;
if (affinity) {
- spufs_set_affinity(flags, SPUFS_I(dentry->d_inode)->i_ctx,
+ spufs_set_affinity(flags, SPUFS_I(d_inode(dentry))->i_ctx,
neighbor);
if (neighbor)
put_spu_context(neighbor);
@@ -504,7 +504,7 @@ spufs_mkgang(struct inode *dir, struct dentry *dentry, umode_t mode)
d_instantiate(dentry, inode);
inc_nlink(dir);
- inc_nlink(dentry->d_inode);
+ inc_nlink(d_inode(dentry));
return ret;
out_iput:
@@ -561,7 +561,7 @@ static struct file_system_type spufs_type;
long spufs_create(struct path *path, struct dentry *dentry,
unsigned int flags, umode_t mode, struct file *filp)
{
- struct inode *dir = path->dentry->d_inode;
+ struct inode *dir = d_inode(path->dentry);
int ret;
/* check if we are on spufs */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 920c252d1f49..f8bc950efcae 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2693,7 +2693,6 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
hose->last_busno = 0xff;
}
hose->private_data = phb;
- hose->controller_ops = pnv_pci_controller_ops;
phb->hub_id = hub_id;
phb->opal_id = phb_id;
phb->type = ioda_type;
@@ -2812,6 +2811,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
pnv_pci_controller_ops.enable_device_hook = pnv_pci_enable_device_hook;
pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
pnv_pci_controller_ops.reset_secondary_bus = pnv_pci_reset_secondary_bus;
+ hose->controller_ops = pnv_pci_controller_ops;
#ifdef CONFIG_PCI_IOV
ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 80db43944afe..6eb808ff637e 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -24,12 +24,22 @@
struct powernv_rng {
void __iomem *regs;
+ void __iomem *regs_real;
unsigned long mask;
};
static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
+int powernv_hwrng_present(void)
+{
+ struct powernv_rng *rng;
+
+ rng = get_cpu_var(powernv_rng);
+ put_cpu_var(rng);
+ return rng != NULL;
+}
+
static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
{
unsigned long parity;
@@ -46,6 +56,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
return val;
}
+int powernv_get_random_real_mode(unsigned long *v)
+{
+ struct powernv_rng *rng;
+
+ rng = raw_cpu_read(powernv_rng);
+
+ *v = rng_whiten(rng, in_rm64(rng->regs_real));
+
+ return 1;
+}
+
int powernv_get_random_long(unsigned long *v)
{
struct powernv_rng *rng;
@@ -80,12 +101,20 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng,
static __init int rng_create(struct device_node *dn)
{
struct powernv_rng *rng;
+ struct resource res;
unsigned long val;
rng = kzalloc(sizeof(*rng), GFP_KERNEL);
if (!rng)
return -ENOMEM;
+ if (of_address_to_resource(dn, 0, &res)) {
+ kfree(rng);
+ return -ENXIO;
+ }
+
+ rng->regs_real = (void __iomem *)res.start;
+
rng->regs = of_iomap(dn, 0);
if (!rng->regs) {
kfree(rng);
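rng_create() above now records the physical address of the RNG registers in regs_real so that powernv_get_random_real_mode() can read the hardware with the MMU off, where the ioremap'd regs pointer would be unusable. A hypothetical real-mode hcall handler built on that helper might look roughly like the sketch below; the actual kvmppc_h_random() lives in book3s_hv_builtin.c and is not shown in this excerpt.

/* Hypothetical sketch of a real-mode consumer of the new helper */
static long h_random_sketch(struct kvm_vcpu *vcpu)
{
	unsigned long rnd;

	if (powernv_get_random_real_mode(&rnd)) {
		/* hcall output data goes back to the guest in GPR4 */
		kvmppc_set_gpr(vcpu, 4, rnd);
		return H_SUCCESS;
	}
	return H_HARDWARE;
}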
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index b4b11096ea8b..019d34aaf054 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -412,6 +412,10 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
if (rc)
return -EINVAL;
+ rc = dlpar_acquire_drc(drc_index);
+ if (rc)
+ return -EINVAL;
+
parent = of_find_node_by_path("/cpus");
if (!parent)
return -ENODEV;
@@ -422,12 +426,6 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count)
of_node_put(parent);
- rc = dlpar_acquire_drc(drc_index);
- if (rc) {
- dlpar_free_cc_nodes(dn);
- return -EINVAL;
- }
-
rc = dlpar_attach_node(dn);
if (rc) {
dlpar_release_drc(drc_index);