From d7d3c2ea99c4845611997cf728af88c4c232e908 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Thu, 29 Jul 2010 14:48:07 +0200
Subject: KVM: PPC: Add Documentation about PV interface

We just introduced a new PV interface that screams for documentation. So here
it is - a shiny new and awesome text file describing the internal works of
the PPC KVM paravirtual interface.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/ppc-pv.txt | 179 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)
 create mode 100644 Documentation/kvm/ppc-pv.txt

(limited to 'Documentation/kvm/ppc-pv.txt')

diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
new file mode 100644
index 000000000000..41ee16d954d7
--- /dev/null
+++ b/Documentation/kvm/ppc-pv.txt
@@ -0,0 +1,179 @@
+The PPC KVM paravirtual interface
+=================================
+
+The basic execution principle by which KVM on PowerPC works is to run all kernel
+space code in PR=1 which is user space. This way we trap all privileged
+instructions and can emulate them accordingly.
+
+Unfortunately that is also the downfall. There are quite a few privileged
+instructions that needlessly return us to the hypervisor even though they
+could be handled differently.
+
+This is what the PPC PV interface helps with. It takes privileged instructions
+and transforms them into unprivileged ones with some help from the hypervisor.
+This cuts down virtualization costs by about 50% on some of my benchmarks.
+
+The code for that interface can be found in arch/powerpc/kernel/kvm*
+
+Querying for existence
+======================
+
+To find out if we're running on KVM or not, we leverage the device tree. When
+Linux is running on KVM, a node /hypervisor exists. That node contains a
+compatible property with the value "linux,kvm".
+
+Once you have determined that you're running under a PV capable KVM, you can
+now use hypercalls as described below.
+
+KVM hypercalls
+==============
+
+Inside the device tree's /hypervisor node there's a property called
+'hypercall-instructions'. This property contains at most 4 opcodes that make
+up the hypercall. To issue a hypercall, just execute these instructions.
+
+The parameters are as follows:
+
+	Register	IN			OUT
+
+	r0		-			volatile
+	r3		1st parameter		Return code
+	r4		2nd parameter		1st output value
+	r5		3rd parameter		2nd output value
+	r6		4th parameter		3rd output value
+	r7		5th parameter		4th output value
+	r8		6th parameter		5th output value
+	r9		7th parameter		6th output value
+	r10		8th parameter		7th output value
+	r11		hypercall number	8th output value
+	r12		-			volatile
+
+Hypercall definitions are shared in generic code, so the same hypercall numbers
+apply for x86 and powerpc alike with the exception that each KVM hypercall
+also needs to be ORed with the KVM vendor code which is (42 << 16).
+
+Return codes can be as follows:
+
+	Code		Meaning
+
+	0		Success
+	12		Hypercall not implemented
+	<0		Error
+
+The magic page
+==============
+
+To enable communication between the hypervisor and guest there is a new shared
+page that contains parts of supervisor visible register state. The guest can
+map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
+
+With this hypercall issued the guest always gets the magic page mapped at the
+desired location in effective and physical address space. For now, we always
+map the page to -4096. This way we can access it using absolute load and store
+instructions. The following instruction reads the first field of the magic page:
+
+	ld	rX, -4096(0)
+
+The interface is designed to be extensible should there be need later to add
+additional registers to the magic page. If you add fields to the magic page,
+also define a new hypercall feature to indicate that the host can give you more
+registers. Only if the host supports the additional features, make use of them.
+
+The magic page has the following layout as described in
+arch/powerpc/include/asm/kvm_para.h:
+
+struct kvm_vcpu_arch_shared {
+	__u64 scratch1;
+	__u64 scratch2;
+	__u64 scratch3;
+	__u64 critical;		/* Guest may not get interrupts if == r1 */
+	__u64 sprg0;
+	__u64 sprg1;
+	__u64 sprg2;
+	__u64 sprg3;
+	__u64 srr0;
+	__u64 srr1;
+	__u64 dar;
+	__u64 msr;
+	__u32 dsisr;
+	__u32 int_pending;	/* Tells the guest if we have an interrupt */
+};
+
+Additions to the page must only occur at the end. Struct fields are always 32
+or 64 bit aligned, depending on them being 32 or 64 bit wide respectively.
+
+MSR bits
+========
+
+The MSR contains bits that require hypervisor intervention and bits that do
+not require direct hypervisor intervention because they only get interpreted
+when entering the guest or don't have any impact on the hypervisor's behavior.
+
+The following bits are safe to be set inside the guest:
+
+  MSR_EE
+  MSR_RI
+  MSR_CR
+  MSR_ME
+
+If any other bit changes in the MSR, please still use mtmsr(d).
+
+Patched instructions
+====================
+
+The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions
+respectively on 32 bit systems with an added offset of 4 to accommodate big
+endianness.
+
+The following is a list of mappings the Linux kernel performs when running as
+guest. Implementing any of those mappings is optional, as the instruction traps
+also act on the shared page. So calling privileged instructions still works as
+before.
+
+From			To
+====			==
+
+mfmsr	rX		ld	rX, magic_page->msr
+mfsprg	rX, 0		ld	rX, magic_page->sprg0
+mfsprg	rX, 1		ld	rX, magic_page->sprg1
+mfsprg	rX, 2		ld	rX, magic_page->sprg2
+mfsprg	rX, 3		ld	rX, magic_page->sprg3
+mfsrr0	rX		ld	rX, magic_page->srr0
+mfsrr1	rX		ld	rX, magic_page->srr1
+mfdar	rX		ld	rX, magic_page->dar
+mfdsisr	rX		lwz	rX, magic_page->dsisr
+
+mtmsr	rX		std	rX, magic_page->msr
+mtsprg	0, rX		std	rX, magic_page->sprg0
+mtsprg	1, rX		std	rX, magic_page->sprg1
+mtsprg	2, rX		std	rX, magic_page->sprg2
+mtsprg	3, rX		std	rX, magic_page->sprg3
+mtsrr0	rX		std	rX, magic_page->srr0
+mtsrr1	rX		std	rX, magic_page->srr1
+mtdar	rX		std	rX, magic_page->dar
+mtdsisr	rX		stw	rX, magic_page->dsisr
+
+tlbsync			nop
+
+mtmsrd	rX, 0		b	<special mtmsr section>
+mtmsr	rX		b	<special mtmsr section>
+
+mtmsrd	rX, 1		b	<special mtmsrd section>
+
+[BookE only]
+wrteei	[0|1]		b	<special wrteei section>
+
+
+Some instructions require more logic to determine what's going on than a load
+or store instruction can deliver. To enable patching of those, we keep some
+RAM around where we can live translate instructions to. What happens is the
+following:
+
+	1) copy emulation code to memory
+	2) patch that code to fit the emulated instruction
+	3) patch that code to return to the original pc + 4
+	4) patch the original instruction to branch to the new code
+
+That way we can inject an arbitrary amount of code as replacement for a single
+instruction. This allows us to check for pending interrupts when setting EE=1
+for example.
-- 
cgit v1.2.3-59-g8ed1b


From cbe487fac7fc016dbabbcbe83f11384e1803a56d Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 3 Aug 2010 10:39:35 +0200
Subject: KVM: PPC: Add mtsrin PV code

This is the guest side of the mtsr acceleration. Using this a guest can now
call mtsrin with almost no overhead as long as it ensures that it only uses
it with (MSR_IR|MSR_DR) == 0. Linux does that, so we're good.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/kvm/ppc-pv.txt      |  3 ++
 arch/powerpc/kernel/asm-offsets.c |  1 +
 arch/powerpc/kernel/kvm.c         | 60 +++++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/kvm_emul.S    | 50 ++++++++++++++++++++++++++++++++
 4 files changed, 114 insertions(+)

(limited to 'Documentation/kvm/ppc-pv.txt')

diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
index 41ee16d954d7..922cf954bd99 100644
--- a/Documentation/kvm/ppc-pv.txt
+++ b/Documentation/kvm/ppc-pv.txt
@@ -160,6 +160,9 @@ mtmsr	rX		b	<special mtmsr section>
 
 mtmsrd	rX, 1		b	<special mtmsrd section>
 
+[Book3S only]
+mtsrin	rX, rY		b	<special mtsrin section>
+
 [BookE only]
 wrteei	[0|1]		b	<special wrteei section>
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 6d92b4e13ebf..7f0d6fcc28a3 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -478,6 +478,7 @@ int main(void)
 	DEFINE(KVM_MAGIC_MSR, offsetof(struct kvm_vcpu_arch_shared, msr));
 	DEFINE(KVM_MAGIC_CRITICAL, offsetof(struct kvm_vcpu_arch_shared,
 					    critical));
+	DEFINE(KVM_MAGIC_SR, offsetof(struct kvm_vcpu_arch_shared, sr));
 #endif
 
 #ifdef CONFIG_44x
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 226882fe85a6..c8bab24ff8ac 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -42,6 +42,7 @@
 #define KVM_INST_B_MAX		0x01ffffff
 
 #define KVM_MASK_RT		0x03e00000
+#define KVM_MASK_RB		0x0000f800
 #define KVM_INST_MFMSR		0x7c0000a6
 #define KVM_INST_MFSPR_SPRG0	0x7c1042a6
 #define KVM_INST_MFSPR_SPRG1	0x7c1142a6
@@ -69,6 +70,8 @@
 #define KVM_INST_WRTEEI_0	0x7c000146
 #define KVM_INST_WRTEEI_1	0x7c008146
 
+#define KVM_INST_MTSRIN		0x7c0001e4
+
 static bool kvm_patching_worked = true;
 static char kvm_tmp[1024 * 1024];
 static int kvm_tmp_index;
@@ -264,6 +267,51 @@ static void kvm_patch_ins_wrteei(u32 *inst)
 
 #endif
 
+#ifdef CONFIG_PPC_BOOK3S_32
+
+extern u32 kvm_emulate_mtsrin_branch_offs;
+extern u32 kvm_emulate_mtsrin_reg1_offs;
+extern u32 kvm_emulate_mtsrin_reg2_offs;
+extern u32 kvm_emulate_mtsrin_orig_ins_offs;
+extern u32 kvm_emulate_mtsrin_len;
+extern u32 kvm_emulate_mtsrin[];
+
+static void kvm_patch_ins_mtsrin(u32 *inst, u32 rt, u32 rb)
+{
+	u32 *p;			/* private copy of the mtsrin template */
+	int distance_start;	/* byte offset: inst -> p */
+	int distance_end;	/* byte offset: exit branch in p -> next_inst */
+	ulong next_inst;	/* address of the instruction after inst */
+
+	p = kvm_alloc(kvm_emulate_mtsrin_len * 4);	/* len is in words */
+	if (!p)
+		return;
+
+	/* Work out the branch displacements into and back out of the copy */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtsrin_branch_offs];
+
+	/* Make sure we only write valid b instructions (upper bound only) */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Copy the template, then fill in its exit branch and registers */
+	memcpy(p, kvm_emulate_mtsrin, kvm_emulate_mtsrin_len * 4);
+	p[kvm_emulate_mtsrin_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_mtsrin_reg1_offs] |= (rb << 10);	/* rlwinm source */
+	p[kvm_emulate_mtsrin_reg2_offs] |= rt;		/* stw source */
+	p[kvm_emulate_mtsrin_orig_ins_offs] = *inst;	/* replaces the nop */
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtsrin_len * 4);
+
+	/* Redirect the original instruction into the patched copy */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#endif
+
 static void kvm_map_magic_page(void *data)
 {
 	u32 *features = data;
@@ -360,6 +408,18 @@ static void kvm_check_ins(u32 *inst, u32 features)
 		break;
 	}
 
+	switch (inst_no_rt & ~KVM_MASK_RB) {
+#ifdef CONFIG_PPC_BOOK3S_32
+	case KVM_INST_MTSRIN:
+		if (features & KVM_MAGIC_FEAT_SR) {
+			u32 inst_rb = _inst & KVM_MASK_RB;
+			kvm_patch_ins_mtsrin(inst, inst_rt, inst_rb);
+		}
+		break;
+		break;
+#endif
+	}
+
 	switch (_inst) {
 #ifdef CONFIG_BOOKE
 	case KVM_INST_WRTEEI_0:
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 3199f65ede2c..a6e97e7a55e0 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -245,3 +245,53 @@ kvm_emulate_wrteei_ee_offs:
 .global kvm_emulate_wrteei_len
 kvm_emulate_wrteei_len:
 	.long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
+
+
+.global kvm_emulate_mtsrin
+kvm_emulate_mtsrin:		/* template: copied, then patched per call site */
+
+	SCRATCH_SAVE		/* presumably frees r30/r31 - TODO confirm */
+
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)	/* magic page msr field */
+	andi.	r31, r31, MSR_DR | MSR_IR
+	beq	kvm_emulate_mtsrin_reg1		/* both clear: take the store path */
+
+	SCRATCH_RESTORE
+
+kvm_emulate_mtsrin_orig_ins:
+	nop				/* overwritten with the original mtsrin */
+	b	kvm_emulate_mtsrin_branch
+
+kvm_emulate_mtsrin_reg1:
+	/* r30 = (rb >> 26) & 0x3c; the r0 source below is patched to rb */
+	rlwinm  r30,r0,6,26,29
+
+kvm_emulate_mtsrin_reg2:
+	stw	r0, (KVM_MAGIC_PAGE + KVM_MAGIC_SR)(r30)	/* r0 patched to rt */
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller; the branch target is patched in for each copy */
+kvm_emulate_mtsrin_branch:
+	b	.
+kvm_emulate_mtsrin_end:
+
+.global kvm_emulate_mtsrin_branch_offs
+kvm_emulate_mtsrin_branch_offs:		/* offsets below are in 4-byte words */
+	.long (kvm_emulate_mtsrin_branch - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_reg1_offs
+kvm_emulate_mtsrin_reg1_offs:
+	.long (kvm_emulate_mtsrin_reg1 - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_reg2_offs
+kvm_emulate_mtsrin_reg2_offs:
+	.long (kvm_emulate_mtsrin_reg2 - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_orig_ins_offs
+kvm_emulate_mtsrin_orig_ins_offs:
+	.long (kvm_emulate_mtsrin_orig_ins - kvm_emulate_mtsrin) / 4
+
+.global kvm_emulate_mtsrin_len
+kvm_emulate_mtsrin_len:			/* template length in 4-byte words */
+	.long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4
-- 
cgit v1.2.3-59-g8ed1b


From d1e87c7ee65a20b10faf7e59dbe2cc934c32473b Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Tue, 31 Aug 2010 04:25:39 +0200
Subject: KVM: PPC: Add documentation for magic page enhancements

This documents how to detect additional features inside the magic
page when a guest maps it.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 Documentation/kvm/ppc-pv.txt | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'Documentation/kvm/ppc-pv.txt')

diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
index 922cf954bd99..a7f2244b3be9 100644
--- a/Documentation/kvm/ppc-pv.txt
+++ b/Documentation/kvm/ppc-pv.txt
@@ -102,6 +102,20 @@ struct kvm_vcpu_arch_shared {
 Additions to the page must only occur at the end. Struct fields are always 32
 or 64 bit aligned, depending on them being 32 or 64 bit wide respectively.
 
+Magic page features
+===================
+
+When mapping the magic page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE,
+a second return value is passed to the guest. This second return value contains
+a bitmap of available features inside the magic page.
+
+The following enhancements to the magic page are currently available:
+
+  KVM_MAGIC_FEAT_SR		Maps SR registers r/w in the magic page
+
+For enhanced features in the magic page, please check for the existence of the
+feature before using them!
+
 MSR bits
 ========
 
-- 
cgit v1.2.3-59-g8ed1b