summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author visa <visa@openbsd.org> 2020-07-11 15:18:08 +0000
committer visa <visa@openbsd.org> 2020-07-11 15:18:08 +0000
commit 9e8849fbab72cdc16ba20d882ee170d279c64c6a (patch)
tree 0321d350599778525ceb40c7fd96b0c7b023b9b0
parent Optimize rasops_vcons_copyrows() so write-only framebuffer consoles (diff)
download wireguard-openbsd-9e8849fbab72cdc16ba20d882ee170d279c64c6a.tar.xz
wireguard-openbsd-9e8849fbab72cdc16ba20d882ee170d279c64c6a.zip
Synchronize each core's CP0 cycle counter using the IO clock counter.
This makes the cycle counter usable as timecounter on multiprocessor machines. Idea from Linux. Tested on CN5020, CN6120, CN7130 and CN7360. Looks reasonable to kettenis@
-rw-r--r-- sys/arch/mips64/include/cpu.h 3
-rw-r--r-- sys/arch/mips64/mips64/cpu.c 3
-rw-r--r-- sys/arch/mips64/mips64/mips64_machdep.c 9
-rw-r--r-- sys/arch/octeon/include/octeonreg.h 6
-rw-r--r-- sys/arch/octeon/octeon/locore.S 34
-rw-r--r-- sys/arch/octeon/octeon/machdep.c 59
6 files changed, 103 insertions, 11 deletions
diff --git a/sys/arch/mips64/include/cpu.h b/sys/arch/mips64/include/cpu.h
index 6614cf9c321..c1ab42880e1 100644
--- a/sys/arch/mips64/include/cpu.h
+++ b/sys/arch/mips64/include/cpu.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.h,v 1.129 2020/05/31 06:23:58 dlg Exp $ */
+/* $OpenBSD: cpu.h,v 1.130 2020/07/11 15:18:08 visa Exp $ */
/*-
* Copyright (c) 1992, 1993
@@ -413,6 +413,7 @@ void signotify(struct proc *);
#if defined(_KERNEL) && !defined(_LOCORE)
extern register_t protosr;
+extern int cpu_has_synced_cp0_count;
extern int cpu_has_userlocal;
#ifdef FPUEMUL
diff --git a/sys/arch/mips64/mips64/cpu.c b/sys/arch/mips64/mips64/cpu.c
index 7a77ae28de8..8270b4ac4c3 100644
--- a/sys/arch/mips64/mips64/cpu.c
+++ b/sys/arch/mips64/mips64/cpu.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cpu.c,v 1.74 2020/05/29 04:42:24 deraadt Exp $ */
+/* $OpenBSD: cpu.c,v 1.75 2020/07/11 15:18:08 visa Exp $ */
/*
* Copyright (c) 1997-2004 Opsycon AB (www.opsycon.se)
@@ -55,6 +55,7 @@ extern void cpu_idle_cycle_wait(void);
void (*cpu_idle_cycle_func)(void) = cpu_idle_cycle_nop;
vaddr_t cache_valias_mask;
+int cpu_has_synced_cp0_count;
int cpu_has_userlocal;
struct cfattach cpu_ca = {
diff --git a/sys/arch/mips64/mips64/mips64_machdep.c b/sys/arch/mips64/mips64/mips64_machdep.c
index 8509c3331e2..5b348d9edd7 100644
--- a/sys/arch/mips64/mips64/mips64_machdep.c
+++ b/sys/arch/mips64/mips64/mips64_machdep.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mips64_machdep.c,v 1.32 2020/07/06 13:33:08 pirofti Exp $ */
+/* $OpenBSD: mips64_machdep.c,v 1.33 2020/07/11 15:18:08 visa Exp $ */
/*
* Copyright (c) 2009, 2010, 2012 Miodrag Vallat.
@@ -264,7 +264,6 @@ delay(int n)
}
}
-#ifndef MULTIPROCESSOR
u_int cp0_get_timecount(struct timecounter *);
struct timecounter cp0_timecounter = {
@@ -283,7 +282,6 @@ cp0_get_timecount(struct timecounter *tc)
{
return (cp0_get_count());
}
-#endif
/*
* Calibrate cpu internal counter against the TOD clock if available.
@@ -337,12 +335,13 @@ cpu_initclocks()
cp0_calibrate(ci);
#ifndef MULTIPROCESSOR
- if (cpu_setperf == NULL) {
+ cpu_has_synced_cp0_count = 1;
+#endif
+ if (cpu_setperf == NULL && cpu_has_synced_cp0_count) {
cp0_timecounter.tc_frequency =
(uint64_t)ci->ci_hw.clock / CP0_CYCLE_DIVIDER;
tc_init(&cp0_timecounter);
}
-#endif
#ifdef DIAGNOSTIC
if (md_startclock == NULL)
diff --git a/sys/arch/octeon/include/octeonreg.h b/sys/arch/octeon/include/octeonreg.h
index e18fff2ff2f..d52e785c417 100644
--- a/sys/arch/octeon/include/octeonreg.h
+++ b/sys/arch/octeon/include/octeonreg.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: octeonreg.h,v 1.10 2019/09/07 13:58:58 visa Exp $ */
+/* $OpenBSD: octeonreg.h,v 1.11 2020/07/11 15:18:08 visa Exp $ */
/*
* Copyright (c) 2003-2004 Opsycon AB (www.opsycon.com).
@@ -174,6 +174,8 @@
/* OCTEON II */
#define MIO_RST_BOOT 0x1180000001600ULL
+#define MIO_RST_BOOT_C_MUL_SHIFT 30
+#define MIO_RST_BOOT_C_MUL_MASK 0x7f
#define MIO_RST_BOOT_PNR_MUL_SHIFT 24
#define MIO_RST_BOOT_PNR_MUL_MASK 0x3f
@@ -182,6 +184,8 @@
/* OCTEON III */
#define RST_BOOT 0x1180006001600ULL
+#define RST_BOOT_C_MUL_SHIFT 30
+#define RST_BOOT_C_MUL_MASK 0x7f
#define RST_BOOT_PNR_MUL_SHIFT 24
#define RST_BOOT_PNR_MUL_MASK 0x3f
#define RST_CTL(x) (0x1180006001640ULL + 8 * (x))
diff --git a/sys/arch/octeon/octeon/locore.S b/sys/arch/octeon/octeon/locore.S
index 96e6b7534bf..609d476fd6b 100644
--- a/sys/arch/octeon/octeon/locore.S
+++ b/sys/arch/octeon/octeon/locore.S
@@ -1,4 +1,4 @@
-/* $OpenBSD: locore.S,v 1.20 2019/07/17 14:36:32 visa Exp $ */
+/* $OpenBSD: locore.S,v 1.21 2020/07/11 15:18:08 visa Exp $ */
/*
* Copyright (c) 2001-2004 Opsycon AB (www.opsycon.se / www.opsycon.com)
@@ -170,6 +170,38 @@ locore_start:
nop
.end locore_start
+/*
+ * void octeon_sync_tc(vaddr_t reg, uint64_t mul, uint64_t frac)
+ *
+ * Set this core's CP0 Count register from the 64-bit clock counter that
+ * is readable at address `reg' (a0).  The counter value is scaled before
+ * it is written:
+ *
+ *	count = low64(value * mul)			if frac == 0
+ *	count = high64(low64(value * mul) * frac)	otherwise
+ *
+ * where `mul' is passed in a1 and `frac' in a2.  A frac of zero stands
+ * for 2^64, i.e. no fractional scaling is applied in that case.
+ *
+ * Interrupts are disabled while the routine runs; the status register
+ * value saved on entry is written back before returning.
+ */
+LEAF(octeon_sync_tc, 0)
+ /*
+ * The measurement is done several times in a loop.
+ * The initial iterations warm up the icache and train the branch
+ * predictor to reduce jitter.
+ * The final iteration performs the actual synchronization.
+ */
+ li t0, 5 # set number of iterations
+ di t3 # disable all interrupts, old status kept in t3
+ MTC0_SR_IE_HAZARD
+1:
+ ld t1, (a0) # load clock counter value from reg
+ dmultu t1, a1 # multiply with mul
+ mflo t1 # fetch result (low 64 bits of the product)
+ beqz a2, 2f # skip if frac == 2^64 (passed as zero)
+ subu t0, 1 # one iteration done (presumably in the branch delay slot, so it runs on both paths)
+ dmultu t1, a2 # multiply with frac
+ mfhi t1 # fetch result divided by 2^64
+2:
+ mtc0 t1, COP_0_COUNT # set core clock counter
+ MTC0_HAZARD
+ bnez t0, 1b # repeat until the iteration count reaches zero
+ nop
+ mtc0 t3, COP_0_STATUS_REG # restore status register
+ MTC0_SR_IE_HAZARD
+ jr ra
+ nop
+END(octeon_sync_tc)
+
#if defined(MULTIPROCESSOR)
LEAF(hw_cpu_spinup_trampoline, 0)
LA t0, cpu_spinup_a0
diff --git a/sys/arch/octeon/octeon/machdep.c b/sys/arch/octeon/octeon/machdep.c
index d1ce2fbf84b..8dbabe647fc 100644
--- a/sys/arch/octeon/octeon/machdep.c
+++ b/sys/arch/octeon/octeon/machdep.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: machdep.c,v 1.125 2020/07/06 13:33:08 pirofti Exp $ */
+/* $OpenBSD: machdep.c,v 1.126 2020/07/11 15:18:08 visa Exp $ */
/*
* Copyright (c) 2009, 2010 Miodrag Vallat.
@@ -99,6 +99,7 @@ struct uvm_constraint_range *uvm_md_constraints[] = { NULL };
vm_map_t exec_map;
vm_map_t phys_map;
+extern struct timecounter cp0_timecounter;
extern uint8_t dt_blob_start[];
struct boot_desc *octeon_boot_desc;
@@ -133,6 +134,7 @@ void dumpconf(void);
vaddr_t mips_init(register_t, register_t, register_t, register_t);
int is_memory_range(paddr_t, psize_t, psize_t);
void octeon_memory_init(struct boot_info *);
+void octeon_sync_tc(vaddr_t, uint64_t, uint64_t);
int octeon_cpuspeed(int *);
void octeon_tlb_init(void);
static void process_bootargs(void);
@@ -588,6 +590,7 @@ mips_init(register_t a0, register_t a1, register_t a2, register_t a3)
switch (octeon_model_family(prid)) {
case OCTEON_MODEL_FAMILY_CN73XX:
+ case OCTEON_MODEL_FAMILY_CN78XX:
ioclock_timecounter.tc_priv = (void *)FPA3_CLK_COUNT;
break;
default:
@@ -597,6 +600,9 @@ mips_init(register_t a0, register_t a1, register_t a2, register_t a3)
ioclock_timecounter.tc_frequency = octeon_ioclock_speed();
tc_init(&ioclock_timecounter);
+ cpu_has_synced_cp0_count = 1;
+ cp0_timecounter.tc_quality = 1000;
+
/*
* Return the new kernel stack pointer.
*/
@@ -695,7 +701,7 @@ octeon_ioclock_speed(void)
void
octeon_tlb_init(void)
{
- uint64_t cvmmemctl;
+ uint64_t clk_reg, cvmmemctl, frac, cmul, imul, val;
uint32_t hwrena = 0;
uint32_t pgrain = 0;
int chipid;
@@ -718,6 +724,55 @@ octeon_tlb_init(void)
setsr(getsr() & ~SR_COP_2_BIT);
/*
+ * Synchronize this core's cycle counter with the system-wide
+ * IO clock counter.
+ *
+ * The IO clock counter's value has to be scaled from the IO clock
+ * frequency domain to the core clock frequency domain:
+ *
+ * cclk / cmul = iclk / imul
+ * cclk = iclk * cmul / imul
+ *
+ * Division is very slow and possibly variable-time on the system,
+ * so the synchronization routine uses multiplication:
+ *
+ * cclk = iclk * cmul * frac / 2^64,
+ *
+ * where frac = 2^64 / imul is precomputed.
+ */
+ switch (octeon_model_family(chipid)) {
+ case OCTEON_MODEL_FAMILY_CN73XX:
+ case OCTEON_MODEL_FAMILY_CN78XX:
+ clk_reg = FPA3_CLK_COUNT;
+ break;
+ default:
+ clk_reg = IPD_CLK_COUNT;
+ break;
+ }
+ switch (octeon_ver) {
+ case OCTEON_2:
+ val = octeon_xkphys_read_8(MIO_RST_BOOT);
+ cmul = (val >> MIO_RST_BOOT_C_MUL_SHIFT) &
+ MIO_RST_BOOT_C_MUL_MASK;
+ imul = (val >> MIO_RST_BOOT_PNR_MUL_SHIFT) &
+ MIO_RST_BOOT_PNR_MUL_MASK;
+ break;
+ case OCTEON_3:
+ val = octeon_xkphys_read_8(RST_BOOT);
+ cmul = (val >> RST_BOOT_C_MUL_SHIFT) &
+ RST_BOOT_C_MUL_MASK;
+ imul = (val >> RST_BOOT_PNR_MUL_SHIFT) &
+ RST_BOOT_PNR_MUL_MASK;
+ break;
+ default:
+ cmul = 1;
+ imul = 1;
+ break;
+ }
+ frac = ((1ULL << 63) / imul) * 2;
+ octeon_sync_tc(PHYS_TO_XKPHYS(clk_reg, CCA_NC), cmul, frac);
+
+ /*
* If the UserLocal register is available, let userspace
* access it using the RDHWR instruction.
*/