author     2010-12-14 20:24:25 +0000
committer  2010-12-14 20:24:25 +0000
commit     10f5ff43374bcd4127827e94c645a10eae6c9afb (patch)
tree       f7c2e340b821aeae57147dceb17cb66a3b5f67fb
parent     use the dying flag in struct usbd_bus instead of a private dying flag (diff)
download   wireguard-openbsd-10f5ff43374bcd4127827e94c645a10eae6c9afb.tar.xz
           wireguard-openbsd-10f5ff43374bcd4127827e94c645a10eae6c9afb.zip
"Implement fast path TLB miss handling. Walk the page table without
creating a trapframe, with exceptions disabled and using only BANK1
registers. If a valid pte is found, load it and return. Otherwise
create a trapframe and proceed to the full-blown C handler."
from uwe@netbsd, ok miod@
speed-ups measured by miod@ and me were between 44% and 50%...
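For orientation, the walk the new assembly performs is the same two-level lookup done by __pmap_pte_lookup()/__pmap_kpte_lookup() in pmap.c, starting from the page table directory this commit caches in curptd (kernel addresses are rebased by VM_MIN_KERNEL_ADDRESS and looked up in pmap_kernel()'s directory instead). A minimal C sketch, assuming 4K pages; the name ptw_lookup and the 0x100 value standing in for PG_V are illustrative, while the shifts and masks are the ones in the handler's literal pools:

typedef unsigned long vaddr_t;
typedef unsigned int pt_entry_t;

#define PTP_SHIFT       22      /* __PMAP_PTP_SHIFT */
#define PTP_INDEX_MASK  0x1ff   /* __PMAP_PTP_N - 1 (.L3_ptp_index_mask) */
#define PG_SHIFT        12      /* PGSHIFT, 4K pages */
#define PTP_PG_MASK     0x3ff   /* __PMAP_PTP_PG_N - 1 */
#define PG_VALID        0x100   /* assumption standing in for PG_V */

/* Return the pte for va, or 0 if the full C handler must run. */
static pt_entry_t
ptw_lookup(pt_entry_t **ptd, vaddr_t va)
{
	pt_entry_t *ptp, pte;

	ptp = ptd[(va >> PTP_SHIFT) & PTP_INDEX_MASK];	/* ptp = ptd[idx] */
	if (ptp == NULL)
		return 0;		/* no page-table page: punt */
	pte = ptp[(va >> PG_SHIFT) & PTP_PG_MASK];	/* pte = ptp[idx] */
	if ((pte & PG_VALID) == 0)
		return 0;		/* invalid pte: punt */
	return pte;			/* fast path enters it via ldtlb */
}

On a hit the vector writes the pte (masked to PG_HW_BITS) into PTEL and executes ldtlb; on a miss it falls through to .L3_call_tlb_exception/.L4_call_tlb_exception, which builds the trapframe and calls tlb_exception() exactly as the old vector did.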
-rw-r--r--   sys/arch/sh/sh/genassym.cf     4
-rw-r--r--   sys/arch/sh/sh/pmap.c          7
-rw-r--r--   sys/arch/sh/sh/vectors.S     362

3 files changed, 301 insertions, 72 deletions
diff --git a/sys/arch/sh/sh/genassym.cf b/sys/arch/sh/sh/genassym.cf
index ab305f6ad15..1e2505fc704 100644
--- a/sys/arch/sh/sh/genassym.cf
+++ b/sys/arch/sh/sh/genassym.cf
@@ -1,4 +1,4 @@
-#	$OpenBSD: genassym.cf,v 1.5 2008/06/26 05:42:13 ray Exp $
+#	$OpenBSD: genassym.cf,v 1.6 2010/12/14 20:24:25 jasper Exp $
 #	$NetBSD: genassym.cf,v 1.10 2005/12/11 12:19:00 christos Exp $
 
 #-
@@ -38,6 +38,7 @@ include <sh/fpu.h>
 include <sh/locore.h>
 include <sh/reg.h>
 include <sh/vmparam.h>
+include <sh/pte.h>
 
 struct trapframe
 define TF_SIZE sizeof(struct trapframe)
@@ -72,6 +73,7 @@ struct uvmexp UVMEXP_
 member intrs
 
 export VM_MAXUSER_ADDRESS
+export VM_MIN_KERNEL_ADDRESS
 export EFAULT
 export ENAMETOOLONG
diff --git a/sys/arch/sh/sh/pmap.c b/sys/arch/sh/sh/pmap.c
index c8a624edcac..3adb1cd3305 100644
--- a/sys/arch/sh/sh/pmap.c
+++ b/sys/arch/sh/sh/pmap.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: pmap.c,v 1.16 2010/12/06 20:57:17 miod Exp $	*/
+/*	$OpenBSD: pmap.c,v 1.17 2010/12/14 20:24:25 jasper Exp $	*/
 /*	$NetBSD: pmap.c,v 1.55 2006/08/07 23:19:36 tsutsui Exp $	*/
 
 /*-
@@ -57,6 +57,9 @@ struct pmap __pmap_kernel;
 STATIC vaddr_t __pmap_kve;	/* VA of last kernel virtual */
 
+/* For the fast tlb miss handler */
+pt_entry_t **curptd;		/* p1 va of curlwp->...->pm_ptp */
+
 /* pmap pool */
 STATIC struct pool __pmap_pmap_pool;
 
@@ -277,7 +280,9 @@ pmap_activate(struct proc *p)
 		pmap->pm_asid = __pmap_asid_alloc();
 
 	KDASSERT(pmap->pm_asid >=0 && pmap->pm_asid < 256);
+
 	sh_tlb_set_asid(pmap->pm_asid);
+	curptd = pmap->pm_ptp;
 }
 
 int
diff --git a/sys/arch/sh/sh/vectors.S b/sys/arch/sh/sh/vectors.S
index a8f366bd17f..bb11814c90f 100644
--- a/sys/arch/sh/sh/vectors.S
+++ b/sys/arch/sh/sh/vectors.S
@@ -1,4 +1,4 @@
-/*	$OpenBSD: vectors.S,v 1.7 2010/07/01 07:24:26 jasper Exp $	*/
+/*	$OpenBSD: vectors.S,v 1.8 2010/12/14 20:24:25 jasper Exp $	*/
 /*	$NetBSD: exception_vector.S,v 1.19 2006/08/22 21:47:57 uwe Exp $	*/
 
 /*-
@@ -31,6 +31,7 @@
 #include <sh/param.h>
 #include <sh/locore.h>
+#include <sh/pte.h>
 #include <sh/trap.h>
 #include <sh/ubcreg.h>
 #include <sh/mmu_sh3.h>
@@ -82,7 +83,7 @@ NENTRY(sh_vector_generic)
 	cmp/hi	r1, r0
 	bt	1f
 
-	/* tlb_exception(curlwp, trapframe, TEA); */
+	/* tlb_exception(curproc, trapframe, TEA); */
 	mov.l	.Lg_VPN_MASK, r1
 	and	r1, r6		/* va = trunc_page(va) */
 	__EXCEPTION_UNBLOCK(r0, r1)
@@ -138,68 +139,159 @@ VECTOR_END_MARKER(sh_vector_generic_end)
 
 /*
  * LINTSTUB: Var: char sh3_vector_tlbmiss[1];
  *
- * void sh3_vector_tlbmiss(void);
- *	Copied to VBR+0x400. This code should be position independent
- *	and maximum 512 bytes long (== 0x600 - 0x400).
+ * TLB miss vector.  We run through the fast path first, checking if
+ * there's a valid mapping in curproc or kernel pmap.  We do fast path
+ * with exceptions disabled, so no P3 addresses please (including no
+ * kernel stack, as we cannot wire TLB entries on sh3).  We can only
+ * use BANK1 registers, and of those r6 and r7 are already taken.
+ *
+ * If we don't find a valid mapping in the fast path, we do context
+ * save and call tlb exception handler.
+ *
+ * Copied to VBR+0x400.  This code should be position independent
+ * and maximum 512 bytes long (== 0x600 - 0x400).
  */
 NENTRY(sh3_vector_tlbmiss)
-	__EXCEPTION_ENTRY
-	mov	#(SH3_TEA & 0xff), r0
-	mov.l	@r0, r6		/* 3rd arg: va = TEA */
-
-	/* if kernel stack is in P3, handle it here fast */
-#if !defined(P1_STACK)
-	cmp/pz	r6
-	bt	6f		/* userspace address, proceed to handler */
-
-	/* Load kernel stack */
-	mov.l	.L3_VPN_MASK, r0
-	and	r6, r0		/* VPN */
-
-	mov.l	.L3_CURUPTE, r1
-	mov.l	@r1, r1		/* upte = &l->l_md.md_upte[0] */
-	mov	#UPAGES, r3	/* loop limit */
-
-	/* for each page of u-area */
-4:	mov.l	@r1+, r7	/* upte->addr: u-area VPN */
-	cmp/eq	r7, r0		/* if (vpn == upte->addr) */
-	bt/s	5f		/* goto found; */
-	 dt	r3
-	bf/s	4b
-	 add	#4, r1		/* skip upte->data; point to next md_upte[i] */
-	/* not a page of u-area, proceed to handler */
-	bra	7f		/* pull insn at 6f into delay slot */
-	 mov	#(SH3_EXPEVT & 0xff), r0
-
-	/* load entry for this uarea page into tlb */
-5:	mov	#(SH3_PTEH & 0xff), r2
-	mov.l	@r1, r1		/* md_upte[i]->data */
-	mov.l	@r2, r3		/* save ASID */
-	mov.l	r0, @r2		/* SH3_PTEH = { VPN, ASID = 0 } */
-	mov.l	r1, @(4, r2)	/* SH3_PTEL = md_upte[i]->data */
+	mov	#(SH3_PTEH & 0xff), r4
+	mov.l	.L3_VPN_cleanup, r0
+	mov.l	@r4, r5
+	and	r0, r5		/* trim vpn to 4K page boundary */
+	/*
+	 * For the duration of fast path we keep
+	 * r4: SH3_PTEH - other PTE regs are addressable as @(offset, r4)
+	 * r5: { VPN, ASID } that caused the miss
+	 */
-	ldtlb
-	bra	99f		/* return */
-	 mov.l	r3, @r2		/* restore ASID */
-#endif /* !P1_STACK */
-	/* tlb_exception(curproc, trapframe, tea) */
-6:	mov	#(SH3_EXPEVT & 0xff), r0
-7:	mov.l	@r0, r0
-	mov.l	r0, @(TF_EXPEVT, r14)	/* trapframe->tf_expevt = EXPEVT */
-	mov.l	.L3_curproc, r0
-	mov.l	@r0, r4		/* 1st arg: curproc */
+	cmp/pz	r5		/* user space address? */
+	bt/s	.L3_user_va
+	 mov	r5, r2		/* copy of vpn to compute indices into ptd/ptp */
+
+	/*
+	 * kernel space address, use pmap_kernel(), adjust vpn for indexing
+	 * see __pmap_kpte_lookup
+	 */
+.L3_kernel_va:
+	mov.l	.L3_VM_MIN_KERNEL_ADDRESS, r0
+	mov.l	.L3_kernptd, r1	/* pmap_kernel()->pm_ptp */
+	bra	.L3_fetch_pte
+	 sub	r0, r2		/* vpn -= VM_MIN_KERNEL_ADDRESS */
+
+	/* user space address, use curproc's pmap */
+.L3_user_va:
+	mov.l	.L3_curptd, r1	! curproc->...->pm_ptp
+
+	/* see __pmap_pte_lookup */
+.L3_fetch_pte:
+	mov.l	@r1, r3		/* fetch ptd */
+
+	/*
+	 * r2: vpn, prepared for indexing into ptd
+	 * r3: pt_entry_t **ptd => pt_entry_t *ptp => pt_entry_t pte
+	 */
+#ifdef DEBUG
+	tst	r3, r3		/* ptd == NULL - cannot happen */
+	bt/s	.L3_call_tlb_exception
+#endif
+	 mov	#-22, r1	/* __PMAP_PTP_SHIFT */
+
+	/* __PMAP_PTP_INDEX(vpn) */
+	mov	r2, r0
+	shld	r1, r0		/* vpn >> __PMAP_PTP_SHIFT */
+	mov.l	.L3_ptp_index_mask, r1
+	and	r1, r0		/* ... & (__PMAP_PTP_N - 1) */
+	shll2	r0		/* array index -> array offset */
+	mov.l	@(r0, r3), r3	/* ptp = ptd[idx] */
+	tst	r3, r3		/* if (ptp == NULL) */
+	bt/s	.L3_call_tlb_exception
+	 mov	#-(PGSHIFT - 2), r1
+
+	/*
+	 * __PMAP_PTP_OFSET(vpn) - except we pre-shift 2 bits left to
+	 * get the array offset directly, as we know bits 10 and 11
+	 * are zero (we cleaned them in r5 to get 4K aligned VPN)
+	 */
+	shld	r1, r2		/* vpn >> (PGSHIFT - 2) */
+	mov.l	.L3_ptp_offset_mask, r0
+	and	r2, r0		/* ... & ((__PMAP_PTP_PG_N - 1) << 2) */
+	mov.l	@(r0, r3), r3	/* pte = ptp[idx] */
+
+
+	/* r3: pte */
+	/* r4: SH3_PTEH */
+	/* r5: { VPN, ASID } */
+
+	mov.l	.L3_PG_V, r0
+	tst	r0, r3		/* if ((pte & PG_V) == 0) */
+	bt/s	.L3_call_tlb_exception
+	 nop
+
+	mov.l	.L3_PG_HW_BITS, r1
+	cmp/pz	r5		/* user space address? */
+	and	r1, r3		/* pte &= PG_HW_BITS */
+	bf/s	.L3_load_kernel
+	 mov.l	r3, @(0x04, r4)	/* *SH3_PTEL = pte */
+
+	/*
+	 * load mapping for a user space page
+	 * we reload PTEH to enter VPN aligned to 4K page boundary
+	 */
+.L3_load_user:
+	mov.l	r5, @r4		/* *SH3_PTEH = { VPN, ASID } */
+	ldtlb			/* needs 2 insns padding before RTE */
+	nop
+	nop
+	rte
+	 nop
+
+	/*
+	 * load mapping for a kernel space page
+	 * we need to temporary set ASID to 0
+	 */
+.L3_load_kernel:
+	mov.l	.L3_clear_ASID, r1
+	and	r5, r1		/* *SH3_PTEH & ~SH3_PTEH_ASID_MASK */
+	mov.l	r1, @r4		/* *SH3_PTEH = { VPN, ASID = 0 } */
+	ldtlb
+	mov.l	r5, @r4		/* restore ASID */
+	nop
+	rte
+	 nop
+
+
+	/*
+	 * If we haven't found a valid mapping in the fast path
+	 * tlb_exception(curproc, trapframe, tea)
+	 */
+.L3_call_tlb_exception:
+	__EXCEPTION_ENTRY
+	mov.l	.L3_SH3_EXPEVT, r2
+	mov.l	.L3_curproc, r1
+	mov	#(SH3_TEA & 0xff), r0
+	mov.l	@r2, r2		/* *SH3_EXPEVT */
+	mov.l	@r0, r6		/* arg3: va = *SH3_TEA */
+	mov.l	@r1, r4		/* arg1: curproc */
 	__INTR_MASK(r0, r1)
 	__EXCEPTION_UNBLOCK(r0, r1)
 	mov.l	.L3_tlb_exception, r0
+	mov.l	r2, @(TF_EXPEVT, r14)	/* tf->tf_expevt = EXPEVT */
 	jsr	@r0
-	 mov	r14, r5		/* 2nd arg: trap frame */
-99:	__EXCEPTION_RETURN
+	 mov	r14, r5		/* arg2: trapframe */
+	__EXCEPTION_RETURN
 
-	.align	5
-.L3_curproc:		.long	_C_LABEL(cpu_info_store) + CI_CURPROC
-.L3_tlb_exception:	.long	_C_LABEL(tlb_exception)
-.L3_VPN_MASK:		.long	0xfffff000
-.L3_CURUPTE:		.long	_C_LABEL(curupte)
+	.align	4
+.L3_VPN_cleanup:	.long	~0x00000c00
+.L3_curptd:		.long	_C_LABEL(curptd)
+.L3_kernptd:		.long	_C_LABEL(__pmap_kernel)
+.L3_VM_MIN_KERNEL_ADDRESS:	.long	VM_MIN_KERNEL_ADDRESS
+.L3_ptp_index_mask:	.long	0x1ff
+.L3_ptp_offset_mask:	.long	0x3ff << 2
+.L3_PG_HW_BITS:		.long	PG_HW_BITS
+.L3_PG_V:		.long	PG_V
+.L3_clear_ASID:		.long	~SH3_PTEH_ASID_MASK
+.L3_SH3_EXPEVT:		.long	SH3_EXPEVT
+.L3_curproc:		.long	_C_LABEL(cpu_info_store) + CI_CURPROC
+.L3_tlb_exception:	.long	_C_LABEL(tlb_exception)
 
 /* LINTSTUB: Var: char sh3_vector_tlbmiss_end[1]; */
 VECTOR_END_MARKER(sh3_vector_tlbmiss_end)
@@ -211,31 +303,161 @@ VECTOR_END_MARKER(sh3_vector_tlbmiss_end)
 
 /*
  * LINTSTUB: Var: char sh4_vector_tlbmiss[1];
  *
- * void sh4_vector_tlbmiss(void);
- *	Copied to VBR+0x400. This code should be position independent
- *	and maximum 512 bytes long (== 0x600 - 0x400).
+ * TLB miss vector.  We run through the fast path first, checking if
+ * there's a valid mapping in curproc or kernel pmap.  We do fast path
+ * with exceptions disabled, so no P3 addresses please (though we can
+ * use kernel stack if need be, as its TLB entries are wired).  We can
+ * only use BANK1 registers, and of those r6 and r7 are already taken.
+ *
+ * If we don't find a valid mapping in the fast path, we do context
+ * save and call tlb exception handler.
+ *
+ * Copied to VBR+0x400.  This code should be position independent
+ * and maximum 512 bytes long (== 0x600 - 0x400).
  */
 NENTRY(sh4_vector_tlbmiss)
+	mov.l	.L4_SH4_PTEH, r4
+	mov.l	.L4_VPN_cleanup, r0
+	mov.l	@r4, r5
+	and	r0, r5		/* trim vpn to 4K page boundary */
+	/*
+	 * For the duration of fast path we keep
+	 * r4: SH4_PTEH - other PTE regs are addressable as @(offset, r4)
+	 * r5: { VPN, ASID } that caused the miss
+	 */
+
+	cmp/pz	r5		/* user space address? */
+	bt/s	.L4_user_va
+	 mov	r5, r2		/* copy of vpn to compute indices into ptd/ptp */
+
+	/*
+	 * kernel space address, use pmap_kernel(), adjust vpn for indexing
+	 * see __pmap_kpte_lookup
+	 */
+.L4_kernel_va:
+	mov.l	.L4_VM_MIN_KERNEL_ADDRESS, r0
+	mov.l	.L4_kernptd, r1	/* pmap_kernel()->pm_ptp */
+	bra	.L4_fetch_pte
+	 sub	r0, r2		/* vpn -= VM_MIN_KERNEL_ADDRESS */
+
+	/* user space address, use curproc's pmap */
+.L4_user_va:
+	mov.l	.L4_curptd, r1	/* curproc->...->pm_ptp */
+
+	/* see __pmap_pte_lookup */
+.L4_fetch_pte:
+	mov.l	@r1, r3		/* fetch ptd */
+
+	/*
+	 * r2: vpn, prepared for indexing into ptd
+	 * r3: pt_entry_t **ptd => pt_entry_t *ptp => pt_entry_t pte
+	 */
+#ifdef DEBUG
+	tst	r3, r3		/* ptd == NULL - cannot happen */
+	bt/s	.L4_call_tlb_exception
+#endif
+	 mov	#-22, r1	/* __PMAP_PTP_SHIFT */
+
+	/* __PMAP_PTP_INDEX(vpn) */
+	mov	r2, r0
+	shld	r1, r0		/* vpn >> __PMAP_PTP_SHIFT */
+	mov.l	.L4_ptp_index_mask, r1
+	and	r1, r0		/* ... & (__PMAP_PTP_N - 1) */
+	shll2	r0		/* array index -> array offset */
+	mov.l	@(r0, r3), r3	/* ptp = ptd[idx] */
+	tst	r3, r3		/* if (ptp == NULL) */
+	bt/s	.L4_call_tlb_exception
+	 mov	#-(PGSHIFT - 2), r1
+
+	/*
+	 * __PMAP_PTP_OFSET(vpn) - except we pre-shift 2 bits left to
+	 * get the array offset directly, as we know bits 10 and 11
+	 * are zero (we cleaned them in r5 to get 4K aligned VPN)
+	 */
+	shld	r1, r2		/* vpn >> (PGSHIFT - 2) */
+	mov.l	.L4_ptp_offset_mask, r0
+	and	r2, r0		/* ... & ((__PMAP_PTP_PG_N - 1) << 2) */
+	mov.l	@(r0, r3), r3	/* pte = ptp[idx] */
+
+
+	/* r3: pte */
+	/* r4: SH4_PTEH */
+	/* r5: { VPN, ASID } */
+
+	mov.l	.L4_PG_V, r0
+	tst	r0, r3		/* if ((pte & PG_V) == 0) */
+	bt/s	.L4_call_tlb_exception
+	 mov	r3, r0		/* prepare PCMCIA SA bits for SH4_PTEA */
+
+	mov.l	.L4_PG_HW_BITS, r1
+	shlr8	r0
+	and	r1, r3		/* pte &= PG_HW_BITS */
+	shlr	r0		/* pte >> _PG_PCMCIA_SHIFT */
+	cmp/pz	r5		/* user space address? */
+	and	#SH4_PTEA_SA_MASK, r0
+	mov.l	r3, @(0x04, r4)	/* *SH4_PTEL = pte */
+	bf/s	.L4_load_kernel
+	 mov.l	r0, @(0x34, r4)	/* *SH4_PTEA = PCMCIA space attrs */
+
+	/*
+	 * Load mapping for a user space page
+	 * we reload PTEH to enter VPN aligned to 4K page boundary
+	 */
+.L4_load_user:
+	mov.l	r5, @r4		/* *SH4_PTEH = { VPN, ASID } */
+	ldtlb			/* needs 1 insn padding before RTE */
+	nop
+	rte
+	 nop
+
+	/*
+	 * Load mapping for a kernel space page
+	 * we need to temporary set ASID to 0
+	 */
+.L4_load_kernel:
+	mov.l	.L4_clear_ASID, r1
+	and	r5, r1		/* *SH4_PTEH & ~SH4_PTEH_ASID_MASK */
+	mov.l	r1, @r4		/* *SH4_PTEH = { VPN, ASID = 0 } */
+	ldtlb
+	mov.l	r5, @r4		/* restore ASID */
+	rte
+	 nop
+
+
+	/*
+	 * If we haven't found a valid mapping in the fast path
+	 * tlb_exception(curproc, trapframe, tea)
+	 */
+.L4_call_tlb_exception:
 	__EXCEPTION_ENTRY
-	mov.l	.L4_TEA4, r0
-	mov.l	@r0, r6
-	mov.l	.L4_EXPEVT4, r0
-	mov.l	@r0, r0
-	mov.l	r0, @(TF_EXPEVT, r14)	/* trapframe->tf_expevt = EXPEVT */
-	mov.l	.L4_curproc, r0
-	mov.l	@r0, r4		/* 1st arg */
+	mov.l	.L4_SH4_PTEH, r0
+	mov.l	.L4_curproc, r1
+	mov.l	@(0x24, r0), r2	/* *SH4_EXPEVT */
+	mov.l	@(0x0c, r0), r6	/* arg3: va = *SH4_TEA */
+	mov.l	@r1, r4		/* arg1: curproc */
 	__INTR_MASK(r0, r1)
 	__EXCEPTION_UNBLOCK(r0, r1)
 	mov.l	.L4_tlb_exception, r0
+	mov.l	r2, @(TF_EXPEVT, r14)	/* tf->tf_expevt = EXPEVT */
 	jsr	@r0
-	 mov	r14, r5		/* 2nd arg */
+	 mov	r14, r5		/* arg2: trapframe */
 	__EXCEPTION_RETURN
 
 	.align	5
-.L4_tlb_exception:	.long	_C_LABEL(tlb_exception)
-.L4_curproc:		.long	_C_LABEL(cpu_info_store) + CI_CURPROC
-.L4_EXPEVT4:		.long	SH4_EXPEVT
-.L4_TEA4:		.long	SH4_TEA
+
+.L4_SH4_PTEH:		.long	SH4_PTEH
+.L4_VPN_cleanup:	.long	~0x00000c00
+.L4_curptd:		.long	_C_LABEL(curptd)
+.L4_kernptd:		.long	_C_LABEL(__pmap_kernel)
+.L4_VM_MIN_KERNEL_ADDRESS:	.long	VM_MIN_KERNEL_ADDRESS
+.L4_ptp_index_mask:	.long	0x1ff
+.L4_ptp_offset_mask:	.long	0x3ff << 2
+.L4_PG_HW_BITS:		.long	PG_HW_BITS
+.L4_PG_V:		.long	PG_V
+.L4_clear_ASID:		.long	~SH4_PTEH_ASID_MASK
+.L4_curproc:		.long	_C_LABEL(cpu_info_store) + CI_CURPROC
+.L4_tlb_exception:	.long	_C_LABEL(tlb_exception)
+
 /* LINTSTUB: Var: char sh4_vector_tlbmiss_end[1]; */
 VECTOR_END_MARKER(sh4_vector_tlbmiss_end)
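The tail of the fast path is where the ASID handling lives, so it is worth spelling out. A hedged C rendering of the sh3 load sequences; the register addresses follow from the sign-extended immediate in "mov #(SH3_PTEH & 0xff), r4" (PTEH at 0xfffffff0, PTEL at 0xfffffff4), and the 0xff mask is an assumption standing in for SH3_PTEH_ASID_MASK:

#define PTEH (*(volatile unsigned int *)0xfffffff0)
#define PTEL (*(volatile unsigned int *)0xfffffff4)
#define ASID_MASK 0xffU		/* assumed SH3_PTEH_ASID_MASK */

static void
tlb_load(unsigned int vpn_asid, unsigned int pte, int user)
{
	PTEL = pte;		/* pte already masked with PG_HW_BITS */
	if (user) {
		/* PTEH holds the faulting { VPN, ASID }, reloaded here
		 * trimmed to a 4K page boundary; enter the mapping. */
		PTEH = vpn_asid;
		__asm volatile("ldtlb");
	} else {
		/* kernel page: enter it under ASID 0, then put the
		 * faulting ASID back before returning. */
		PTEH = vpn_asid & ~ASID_MASK;
		__asm volatile("ldtlb");
		PTEH = vpn_asid;
	}
}

The nops around ldtlb in the real vectors are the pipeline padding the comments call out before rte (two instructions on sh3, one on sh4); the C sketch has no equivalent for them.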