From c15ac4fd60d5ffdb151bb2c7805f377fd7f90363 Mon Sep 17 00:00:00 2001
From: Alan Kao <alankao@andestech.com>
Date: Tue, 13 Feb 2018 13:13:17 +0800
Subject: riscv/ftrace: Add dynamic function tracer support

We now have dynamic ftrace with the following added items:

* ftrace_make_call, ftrace_make_nop (in kernel/ftrace.c)
  The two functions turn each recorded call site of filtered functions
  into a call to ftrace_caller or nops

* ftracce_update_ftrace_func (in kernel/ftrace.c)
  turns the nops at ftrace_call into a call to a generic entry for
  function tracers.

* ftrace_caller (in kernel/mcount-dyn.S)
  The entry where each _mcount call sites calls to once they are
  filtered to be traced.

Also, this patch fixes the semantic problems in mcount.S, which will be
treated as only a reference implementation once we have the dynamic
ftrace.

Cc: Greentime Hu <greentime@andestech.com>
Signed-off-by: Alan Kao <alankao@andestech.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/ftrace.h | 54 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 66d4175eb13e..078743aacfd3 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -8,3 +8,57 @@
 #if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_FRAME_POINTER)
 #define HAVE_FUNCTION_GRAPH_FP_TEST
 #endif
+
+#ifndef __ASSEMBLY__
+void _mcount(void);
+static inline unsigned long ftrace_call_adjust(unsigned long addr)
+{
+	return addr;
+}
+
+struct dyn_arch_ftrace {
+};
+#endif
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ * A general call in RISC-V is a pair of insts:
+ * 1) auipc: setting high-20 pc-related bits to ra register
+ * 2) jalr: setting low-12 offset to ra, jump to ra, and set ra to
+ *          return address (original pc + 4)
+ *
+ * Dynamic ftrace generates probes to call sites, so we must deal with
+ * both auipc and jalr at the same time.
+ */
+
+#define MCOUNT_ADDR		((unsigned long)_mcount)
+#define JALR_SIGN_MASK		(0x00000800)
+#define JALR_OFFSET_MASK	(0x00000fff)
+#define AUIPC_OFFSET_MASK	(0xfffff000)
+#define AUIPC_PAD		(0x00001000)
+#define JALR_SHIFT		20
+#define JALR_BASIC		(0x000080e7)
+#define AUIPC_BASIC		(0x00000097)
+#define NOP4			(0x00000013)
+
+#define make_call(caller, callee, call)					\
+do {									\
+	call[0] = to_auipc_insn((unsigned int)((unsigned long)callee -	\
+				(unsigned long)caller));		\
+	call[1] = to_jalr_insn((unsigned int)((unsigned long)callee -	\
+			       (unsigned long)caller));			\
+} while (0)
+
+#define to_jalr_insn(offset)						\
+	(((offset & JALR_OFFSET_MASK) << JALR_SHIFT) | JALR_BASIC)
+
+#define to_auipc_insn(offset)						\
+	((offset & JALR_SIGN_MASK) ?					\
+	(((offset & AUIPC_OFFSET_MASK) + AUIPC_PAD) | AUIPC_BASIC) :	\
+	((offset & AUIPC_OFFSET_MASK) | AUIPC_BASIC))
+
+/*
+ * Let auipc+jalr be the basic *mcount unit*, so we make it 8 bytes here.
+ */
+#define MCOUNT_INSN_SIZE 8
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 71e736a7d65551e49136c1efc4759e5902729cc2 Mon Sep 17 00:00:00 2001
From: Alan Kao <alankao@andestech.com>
Date: Tue, 13 Feb 2018 13:13:19 +0800
Subject: riscv/ftrace: Add ARCH_SUPPORTS_FTRACE_OPS support

Cc: Greentime Hu <greentime@andestech.com>
Signed-off-by: Alan Kao <alankao@andestech.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/ftrace.h | 1 +
 arch/riscv/kernel/mcount-dyn.S  | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index 078743aacfd3..fedadc40e358 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -9,6 +9,7 @@
 #define HAVE_FUNCTION_GRAPH_FP_TEST
 #endif
 
+#define ARCH_SUPPORTS_FTRACE_OPS 1
 #ifndef __ASSEMBLY__
 void _mcount(void);
 static inline unsigned long ftrace_call_adjust(unsigned long addr)
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index 739e07a6fd85..6bbc3f88fcb3 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -74,9 +74,12 @@ ENTRY(ftrace_caller)
 	/*
 	 * a0: the address in the caller when calling ftrace_caller
 	 * a1: the caller's return address
+	 * a2: the address of global variable function_trace_op
 	 */
 	ld	a1, -8(s0)
 	addi	a0, ra, -MCOUNT_INSN_SIZE
+	la	t5, function_trace_op
+	ld	a2, 0(t5)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From aea4c671fb985e6a9ffc365c43ea6f5e0d737fea Mon Sep 17 00:00:00 2001
From: Alan Kao <alankao@andestech.com>
Date: Tue, 13 Feb 2018 13:13:20 +0800
Subject: riscv/ftrace: Add DYNAMIC_FTRACE_WITH_REGS support

Cc: Greentime Hu <greentime@andestech.com>
Signed-off-by: Alan Kao <alankao@andestech.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/Kconfig              |   1 +
 arch/riscv/include/asm/ftrace.h |   1 +
 arch/riscv/kernel/ftrace.c      |  17 ++++++
 arch/riscv/kernel/mcount-dyn.S  | 122 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 141 insertions(+)

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 1e9d878c1ac4..61dd82709898 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -116,6 +116,7 @@ config ARCH_RV64I
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
+	select HAVE_DYNAMIC_FTRACE_WITH_REGS
 
 endchoice
 
diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
index fedadc40e358..c6dcc5291f97 100644
--- a/arch/riscv/include/asm/ftrace.h
+++ b/arch/riscv/include/asm/ftrace.h
@@ -8,6 +8,7 @@
 #if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_FRAME_POINTER)
 #define HAVE_FUNCTION_GRAPH_FP_TEST
 #endif
+#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
 
 #define ARCH_SUPPORTS_FTRACE_OPS 1
 #ifndef __ASSEMBLY__
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 5bbe1afd9463..48b5353691c3 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -106,6 +106,23 @@ int __init ftrace_dyn_arch_init(void)
 }
 #endif
 
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
+		       unsigned long addr)
+{
+	unsigned int call[2];
+	int ret;
+
+	make_call(rec->ip, old_addr, call);
+	ret = ftrace_check_current_call(rec->ip, call);
+
+	if (ret)
+		return ret;
+
+	return __ftrace_modify_call(rec->ip, addr, true);
+}
+#endif
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 /*
  * Most of this function is copied from arm64.
diff --git a/arch/riscv/kernel/mcount-dyn.S b/arch/riscv/kernel/mcount-dyn.S
index 6bbc3f88fcb3..35a6ed76cb8b 100644
--- a/arch/riscv/kernel/mcount-dyn.S
+++ b/arch/riscv/kernel/mcount-dyn.S
@@ -115,3 +115,125 @@ ftrace_call:
 	RESTORE_ABI_STATE
 	ret
 ENDPROC(ftrace_caller)
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+	.macro SAVE_ALL
+	addi	sp, sp, -(PT_SIZE_ON_STACK+16)
+	sd	s0, (PT_SIZE_ON_STACK)(sp)
+	sd	ra, (PT_SIZE_ON_STACK+8)(sp)
+	addi	s0, sp, (PT_SIZE_ON_STACK+16)
+
+	sd x1,  PT_RA(sp)
+	sd x2,  PT_SP(sp)
+	sd x3,  PT_GP(sp)
+	sd x4,  PT_TP(sp)
+	sd x5,  PT_T0(sp)
+	sd x6,  PT_T1(sp)
+	sd x7,  PT_T2(sp)
+	sd x8,  PT_S0(sp)
+	sd x9,  PT_S1(sp)
+	sd x10, PT_A0(sp)
+	sd x11, PT_A1(sp)
+	sd x12, PT_A2(sp)
+	sd x13, PT_A3(sp)
+	sd x14, PT_A4(sp)
+	sd x15, PT_A5(sp)
+	sd x16, PT_A6(sp)
+	sd x17, PT_A7(sp)
+	sd x18, PT_S2(sp)
+	sd x19, PT_S3(sp)
+	sd x20, PT_S4(sp)
+	sd x21, PT_S5(sp)
+	sd x22, PT_S6(sp)
+	sd x23, PT_S7(sp)
+	sd x24, PT_S8(sp)
+	sd x25, PT_S9(sp)
+	sd x26, PT_S10(sp)
+	sd x27, PT_S11(sp)
+	sd x28, PT_T3(sp)
+	sd x29, PT_T4(sp)
+	sd x30, PT_T5(sp)
+	sd x31, PT_T6(sp)
+	.endm
+
+	.macro RESTORE_ALL
+	ld x1,  PT_RA(sp)
+	ld x2,  PT_SP(sp)
+	ld x3,  PT_GP(sp)
+	ld x4,  PT_TP(sp)
+	ld x5,  PT_T0(sp)
+	ld x6,  PT_T1(sp)
+	ld x7,  PT_T2(sp)
+	ld x8,  PT_S0(sp)
+	ld x9,  PT_S1(sp)
+	ld x10, PT_A0(sp)
+	ld x11, PT_A1(sp)
+	ld x12, PT_A2(sp)
+	ld x13, PT_A3(sp)
+	ld x14, PT_A4(sp)
+	ld x15, PT_A5(sp)
+	ld x16, PT_A6(sp)
+	ld x17, PT_A7(sp)
+	ld x18, PT_S2(sp)
+	ld x19, PT_S3(sp)
+	ld x20, PT_S4(sp)
+	ld x21, PT_S5(sp)
+	ld x22, PT_S6(sp)
+	ld x23, PT_S7(sp)
+	ld x24, PT_S8(sp)
+	ld x25, PT_S9(sp)
+	ld x26, PT_S10(sp)
+	ld x27, PT_S11(sp)
+	ld x28, PT_T3(sp)
+	ld x29, PT_T4(sp)
+	ld x30, PT_T5(sp)
+	ld x31, PT_T6(sp)
+
+	ld	s0, (PT_SIZE_ON_STACK)(sp)
+	ld	ra, (PT_SIZE_ON_STACK+8)(sp)
+	addi	sp, sp, (PT_SIZE_ON_STACK+16)
+	.endm
+
+	.macro RESTORE_GRAPH_REG_ARGS
+	ld	a0, PT_T0(sp)
+	ld	a1, PT_T1(sp)
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
+	ld	a2, PT_T2(sp)
+#endif
+	.endm
+
+/*
+ * Most of the contents are the same as ftrace_caller.
+ */
+ENTRY(ftrace_regs_caller)
+	/*
+	 * a3: the address of all registers in the stack
+	 */
+	ld	a1, -8(s0)
+	addi	a0, ra, -MCOUNT_INSN_SIZE
+	la	t5, function_trace_op
+	ld	a2, 0(t5)
+	addi	a3, sp, -(PT_SIZE_ON_STACK+16)
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	addi	t0, s0, -8
+	mv	t1, a0
+#ifdef HAVE_FUNCTION_GRAPH_FP_TEST
+	ld	t2, -16(s0)
+#endif
+#endif
+	SAVE_ALL
+
+ftrace_regs_call:
+	.global ftrace_regs_call
+	call	ftrace_stub
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	RESTORE_GRAPH_REG_ARGS
+	call	ftrace_graph_caller
+#endif
+
+	RESTORE_ALL
+	ret
+ENDPROC(ftrace_regs_caller)
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_REGS */
-- 
cgit v1.2.3-59-g8ed1b


From 8d235b174af5d0af35ff206c15041fc2b02a0993 Mon Sep 17 00:00:00 2001
From: Andrea Parri <parri.andrea@gmail.com>
Date: Tue, 27 Feb 2018 03:24:11 +0100
Subject: riscv/barrier: Define __smp_{store_release,load_acquire}

Introduce __smp_{store_release,load_acquire}, and rely on the generic
definitions for smp_{store_release,load_acquire}. This avoids the use
of full ("rw,rw") fences on SMP.

Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/barrier.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/include/asm/barrier.h b/arch/riscv/include/asm/barrier.h
index 5510366d169a..d4628e4b3a5e 100644
--- a/arch/riscv/include/asm/barrier.h
+++ b/arch/riscv/include/asm/barrier.h
@@ -38,6 +38,21 @@
 #define __smp_rmb()	RISCV_FENCE(r,r)
 #define __smp_wmb()	RISCV_FENCE(w,w)
 
+#define __smp_store_release(p, v)					\
+do {									\
+	compiletime_assert_atomic_type(*p);				\
+	RISCV_FENCE(rw,w);						\
+	WRITE_ONCE(*p, v);						\
+} while (0)
+
+#define __smp_load_acquire(p)						\
+({									\
+	typeof(*p) ___p1 = READ_ONCE(*p);				\
+	compiletime_assert_atomic_type(*p);				\
+	RISCV_FENCE(r,rw);						\
+	___p1;								\
+})
+
 /*
  * This is a very specific barrier: it's currently only used in two places in
  * the kernel, both in the scheduler.  See include/linux/spinlock.h for the two
-- 
cgit v1.2.3-59-g8ed1b


From 0123f4d76ca63b7b895f40089be0ce4809e392d8 Mon Sep 17 00:00:00 2001
From: Andrea Parri <parri.andrea@gmail.com>
Date: Fri, 9 Mar 2018 13:13:20 +0100
Subject: riscv/spinlock: Strengthen implementations with fences

Current implementations map locking operations using .rl and .aq
annotations.  However, this mapping is unsound w.r.t. the kernel
memory consistency model (LKMM) [1]:

Referring to the "unlock-lock-read-ordering" test reported below,
Daniel wrote:

  "I think an RCpc interpretation of .aq and .rl would in fact
   allow the two normal loads in P1 to be reordered [...]

   The intuition would be that the amoswap.w.aq can forward from
   the amoswap.w.rl while that's still in the store buffer, and
   then the lw x3,0(x4) can also perform while the amoswap.w.rl
   is still in the store buffer, all before the l1 x1,0(x2)
   executes.  That's not forbidden unless the amoswaps are RCsc,
   unless I'm missing something.

   Likewise even if the unlock()/lock() is between two stores.
   A control dependency might originate from the load part of
   the amoswap.w.aq, but there still would have to be something
   to ensure that this load part in fact performs after the store
   part of the amoswap.w.rl performs globally, and that's not
   automatic under RCpc."

Simulation of the RISC-V memory consistency model confirmed this
expectation.

In order to "synchronize" LKMM and RISC-V's implementation, this
commit strengthens the implementations of the locking operations
by replacing .rl and .aq with the use of ("lightweigth") fences,
resp., "fence rw,  w" and "fence r , rw".

C unlock-lock-read-ordering

{}
/* s initially owned by P1 */

P0(int *x, int *y)
{
        WRITE_ONCE(*x, 1);
        smp_wmb();
        WRITE_ONCE(*y, 1);
}

P1(int *x, int *y, spinlock_t *s)
{
        int r0;
        int r1;

        r0 = READ_ONCE(*y);
        spin_unlock(s);
        spin_lock(s);
        r1 = READ_ONCE(*x);
}

exists (1:r0=1 /\ 1:r1=0)

[1] https://marc.info/?l=linux-kernel&m=151930201102853&w=2
    https://groups.google.com/a/groups.riscv.org/forum/#!topic/isa-dev/hKywNHBkAXM
    https://marc.info/?l=linux-kernel&m=151633436614259&w=2

Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
Cc: Palmer Dabbelt <palmer@sifive.com>
Cc: Albert Ou <albert@sifive.com>
Cc: Daniel Lustig <dlustig@nvidia.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Jade Alglave <j.alglave@ucl.ac.uk>
Cc: Luc Maranget <luc.maranget@inria.fr>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Akira Yokosawa <akiyks@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-riscv@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/fence.h    | 12 ++++++++++++
 arch/riscv/include/asm/spinlock.h | 29 +++++++++++++++--------------
 2 files changed, 27 insertions(+), 14 deletions(-)
 create mode 100644 arch/riscv/include/asm/fence.h

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/include/asm/fence.h b/arch/riscv/include/asm/fence.h
new file mode 100644
index 000000000000..2b443a3a487f
--- /dev/null
+++ b/arch/riscv/include/asm/fence.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_RISCV_FENCE_H
+#define _ASM_RISCV_FENCE_H
+
+#ifdef CONFIG_SMP
+#define RISCV_ACQUIRE_BARRIER		"\tfence r , rw\n"
+#define RISCV_RELEASE_BARRIER		"\tfence rw,  w\n"
+#else
+#define RISCV_ACQUIRE_BARRIER
+#define RISCV_RELEASE_BARRIER
+#endif
+
+#endif	/* _ASM_RISCV_FENCE_H */
diff --git a/arch/riscv/include/asm/spinlock.h b/arch/riscv/include/asm/spinlock.h
index 2fd27e8ef1fd..8eb26d1ede81 100644
--- a/arch/riscv/include/asm/spinlock.h
+++ b/arch/riscv/include/asm/spinlock.h
@@ -17,6 +17,7 @@
 
 #include <linux/kernel.h>
 #include <asm/current.h>
+#include <asm/fence.h>
 
 /*
  * Simple spin lock operations.  These provide no fairness guarantees.
@@ -28,10 +29,7 @@
 
 static inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
-	__asm__ __volatile__ (
-		"amoswap.w.rl x0, x0, %0"
-		: "=A" (lock->lock)
-		:: "memory");
+	smp_store_release(&lock->lock, 0);
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lock)
@@ -39,7 +37,8 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
 	int tmp = 1, busy;
 
 	__asm__ __volatile__ (
-		"amoswap.w.aq %0, %2, %1"
+		"	amoswap.w %0, %2, %1\n"
+		RISCV_ACQUIRE_BARRIER
 		: "=r" (busy), "+A" (lock->lock)
 		: "r" (tmp)
 		: "memory");
@@ -68,8 +67,9 @@ static inline void arch_read_lock(arch_rwlock_t *lock)
 		"1:	lr.w	%1, %0\n"
 		"	bltz	%1, 1b\n"
 		"	addi	%1, %1, 1\n"
-		"	sc.w.aq	%1, %1, %0\n"
+		"	sc.w	%1, %1, %0\n"
 		"	bnez	%1, 1b\n"
+		RISCV_ACQUIRE_BARRIER
 		: "+A" (lock->lock), "=&r" (tmp)
 		:: "memory");
 }
@@ -82,8 +82,9 @@ static inline void arch_write_lock(arch_rwlock_t *lock)
 		"1:	lr.w	%1, %0\n"
 		"	bnez	%1, 1b\n"
 		"	li	%1, -1\n"
-		"	sc.w.aq	%1, %1, %0\n"
+		"	sc.w	%1, %1, %0\n"
 		"	bnez	%1, 1b\n"
+		RISCV_ACQUIRE_BARRIER
 		: "+A" (lock->lock), "=&r" (tmp)
 		:: "memory");
 }
@@ -96,8 +97,9 @@ static inline int arch_read_trylock(arch_rwlock_t *lock)
 		"1:	lr.w	%1, %0\n"
 		"	bltz	%1, 1f\n"
 		"	addi	%1, %1, 1\n"
-		"	sc.w.aq	%1, %1, %0\n"
+		"	sc.w	%1, %1, %0\n"
 		"	bnez	%1, 1b\n"
+		RISCV_ACQUIRE_BARRIER
 		"1:\n"
 		: "+A" (lock->lock), "=&r" (busy)
 		:: "memory");
@@ -113,8 +115,9 @@ static inline int arch_write_trylock(arch_rwlock_t *lock)
 		"1:	lr.w	%1, %0\n"
 		"	bnez	%1, 1f\n"
 		"	li	%1, -1\n"
-		"	sc.w.aq	%1, %1, %0\n"
+		"	sc.w	%1, %1, %0\n"
 		"	bnez	%1, 1b\n"
+		RISCV_ACQUIRE_BARRIER
 		"1:\n"
 		: "+A" (lock->lock), "=&r" (busy)
 		:: "memory");
@@ -125,7 +128,8 @@ static inline int arch_write_trylock(arch_rwlock_t *lock)
 static inline void arch_read_unlock(arch_rwlock_t *lock)
 {
 	__asm__ __volatile__(
-		"amoadd.w.rl x0, %1, %0"
+		RISCV_RELEASE_BARRIER
+		"	amoadd.w x0, %1, %0\n"
 		: "+A" (lock->lock)
 		: "r" (-1)
 		: "memory");
@@ -133,10 +137,7 @@ static inline void arch_read_unlock(arch_rwlock_t *lock)
 
 static inline void arch_write_unlock(arch_rwlock_t *lock)
 {
-	__asm__ __volatile__ (
-		"amoswap.w.rl x0, x0, %0"
-		: "=A" (lock->lock)
-		:: "memory");
+	smp_store_release(&lock->lock, 0);
 }
 
 #endif /* _ASM_RISCV_SPINLOCK_H */
-- 
cgit v1.2.3-59-g8ed1b


From 5ce6c1f3535fa8d2134468547377b7b737042834 Mon Sep 17 00:00:00 2001
From: Andrea Parri <parri.andrea@gmail.com>
Date: Fri, 9 Mar 2018 13:13:40 +0100
Subject: riscv/atomic: Strengthen implementations with fences

Atomics present the same issue with locking: release and acquire
variants need to be strengthened to meet the constraints defined
by the Linux-kernel memory consistency model [1].

Atomics present a further issue: implementations of atomics such
as atomic_cmpxchg() and atomic_add_unless() rely on LR/SC pairs,
which do not give full-ordering with .aqrl; for example, current
implementations allow the "lr-sc-aqrl-pair-vs-full-barrier" test
below to end up with the state indicated in the "exists" clause.

In order to "synchronize" LKMM and RISC-V's implementation, this
commit strengthens the implementations of the atomics operations
by replacing .rl and .aq with the use of ("lightweigth") fences,
and by replacing .aqrl LR/SC pairs in sequences such as:

  0:      lr.w.aqrl  %0, %addr
          bne        %0, %old, 1f
          ...
          sc.w.aqrl  %1, %new, %addr
          bnez       %1, 0b
  1:

with sequences of the form:

  0:      lr.w       %0, %addr
          bne        %0, %old, 1f
          ...
          sc.w.rl    %1, %new, %addr   /* SC-release   */
          bnez       %1, 0b
          fence      rw, rw            /* "full" fence */
  1:

following Daniel's suggestion.

These modifications were validated with simulation of the RISC-V
memory consistency model.

C lr-sc-aqrl-pair-vs-full-barrier

{}

P0(int *x, int *y, atomic_t *u)
{
	int r0;
	int r1;

	WRITE_ONCE(*x, 1);
	r0 = atomic_cmpxchg(u, 0, 1);
	r1 = READ_ONCE(*y);
}

P1(int *x, int *y, atomic_t *v)
{
	int r0;
	int r1;

	WRITE_ONCE(*y, 1);
	r0 = atomic_cmpxchg(v, 0, 1);
	r1 = READ_ONCE(*x);
}

exists (u=1 /\ v=1 /\ 0:r1=0 /\ 1:r1=0)

[1] https://marc.info/?l=linux-kernel&m=151930201102853&w=2
    https://groups.google.com/a/groups.riscv.org/forum/#!topic/isa-dev/hKywNHBkAXM
    https://marc.info/?l=linux-kernel&m=151633436614259&w=2

Suggested-by: Daniel Lustig <dlustig@nvidia.com>
Signed-off-by: Andrea Parri <parri.andrea@gmail.com>
Cc: Palmer Dabbelt <palmer@sifive.com>
Cc: Albert Ou <albert@sifive.com>
Cc: Daniel Lustig <dlustig@nvidia.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Jade Alglave <j.alglave@ucl.ac.uk>
Cc: Luc Maranget <luc.maranget@inria.fr>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Akira Yokosawa <akiyks@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-riscv@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/atomic.h  | 417 +++++++++++++++++++++++++--------------
 arch/riscv/include/asm/cmpxchg.h | 391 +++++++++++++++++++++++++++++-------
 2 files changed, 588 insertions(+), 220 deletions(-)

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index e65d1cd89e28..855115ace98c 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -24,6 +24,20 @@
 #include <asm/barrier.h>
 
 #define ATOMIC_INIT(i)	{ (i) }
+
+#define __atomic_op_acquire(op, args...)				\
+({									\
+	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
+	__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory");	\
+	__ret;								\
+})
+
+#define __atomic_op_release(op, args...)				\
+({									\
+	__asm__ __volatile__(RISCV_RELEASE_BARRIER "" ::: "memory");	\
+	op##_relaxed(args);						\
+})
+
 static __always_inline int atomic_read(const atomic_t *v)
 {
 	return READ_ONCE(v->counter);
@@ -50,22 +64,23 @@ static __always_inline void atomic64_set(atomic64_t *v, long i)
  * have the AQ or RL bits set.  These don't return anything, so there's only
  * one version to worry about.
  */
-#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)				\
-static __always_inline void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)	\
-{											\
-	__asm__ __volatile__ (								\
-		"amo" #asm_op "." #asm_type " zero, %1, %0"				\
-		: "+A" (v->counter)							\
-		: "r" (I)								\
-		: "memory");								\
-}
+#define ATOMIC_OP(op, asm_op, I, asm_type, c_type, prefix)		\
+static __always_inline							\
+void atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)		\
+{									\
+	__asm__ __volatile__ (						\
+		"	amo" #asm_op "." #asm_type " zero, %1, %0"	\
+		: "+A" (v->counter)					\
+		: "r" (I)						\
+		: "memory");						\
+}									\
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, I)			\
+#define ATOMIC_OPS(op, asm_op, I)					\
         ATOMIC_OP (op, asm_op, I, w,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, I)			\
-        ATOMIC_OP (op, asm_op, I, w,  int,   )	\
+#define ATOMIC_OPS(op, asm_op, I)					\
+        ATOMIC_OP (op, asm_op, I, w,  int,   )				\
         ATOMIC_OP (op, asm_op, I, d, long, 64)
 #endif
 
@@ -79,75 +94,115 @@ ATOMIC_OPS(xor, xor,  i)
 #undef ATOMIC_OPS
 
 /*
- * Atomic ops that have ordered, relaxed, acquire, and relese variants.
+ * Atomic ops that have ordered, relaxed, acquire, and release variants.
  * There's two flavors of these: the arithmatic ops have both fetch and return
  * versions, while the logical ops only have fetch versions.
  */
-#define ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, asm_type, c_type, prefix)				\
-static __always_inline c_type atomic##prefix##_fetch_##op##c_or(c_type i, atomic##prefix##_t *v)	\
-{													\
-	register c_type ret;										\
-	__asm__ __volatile__ (										\
-		"amo" #asm_op "." #asm_type #asm_or " %1, %2, %0"					\
-		: "+A" (v->counter), "=r" (ret)								\
-		: "r" (I)										\
-		: "memory");										\
-	return ret;											\
+#define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix)	\
+static __always_inline							\
+c_type atomic##prefix##_fetch_##op##_relaxed(c_type i,			\
+					     atomic##prefix##_t *v)	\
+{									\
+	register c_type ret;						\
+	__asm__ __volatile__ (						\
+		"	amo" #asm_op "." #asm_type " %1, %2, %0"	\
+		: "+A" (v->counter), "=r" (ret)				\
+		: "r" (I)						\
+		: "memory");						\
+	return ret;							\
+}									\
+static __always_inline							\
+c_type atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v)	\
+{									\
+	register c_type ret;						\
+	__asm__ __volatile__ (						\
+		"	amo" #asm_op "." #asm_type ".aqrl  %1, %2, %0"	\
+		: "+A" (v->counter), "=r" (ret)				\
+		: "r" (I)						\
+		: "memory");						\
+	return ret;							\
 }
 
-#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, asm_type, c_type, prefix)			\
-static __always_inline c_type atomic##prefix##_##op##_return##c_or(c_type i, atomic##prefix##_t *v)	\
-{													\
-        return atomic##prefix##_fetch_##op##c_or(i, v) c_op I;						\
+#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix)	\
+static __always_inline							\
+c_type atomic##prefix##_##op##_return_relaxed(c_type i,			\
+					      atomic##prefix##_t *v)	\
+{									\
+        return atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I;	\
+}									\
+static __always_inline							\
+c_type atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v)	\
+{									\
+        return atomic##prefix##_fetch_##op(i, v) c_op I;		\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )
+#define ATOMIC_OPS(op, asm_op, c_op, I)					\
+        ATOMIC_FETCH_OP( op, asm_op,       I, w,  int,   )		\
+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, w,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, c_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_FETCH_OP (op, asm_op,       I, asm_or, c_or, d, long, 64)	\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_or, c_or, d, long, 64)
+#define ATOMIC_OPS(op, asm_op, c_op, I)					\
+        ATOMIC_FETCH_OP( op, asm_op,       I, w,  int,   )		\
+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, w,  int,   )		\
+        ATOMIC_FETCH_OP( op, asm_op,       I, d, long, 64)		\
+        ATOMIC_OP_RETURN(op, asm_op, c_op, I, d, long, 64)
 #endif
 
-ATOMIC_OPS(add, add, +,  i,      , _relaxed)
-ATOMIC_OPS(add, add, +,  i, .aq  , _acquire)
-ATOMIC_OPS(add, add, +,  i, .rl  , _release)
-ATOMIC_OPS(add, add, +,  i, .aqrl,         )
+ATOMIC_OPS(add, add, +,  i)
+ATOMIC_OPS(sub, add, +, -i)
+
+#define atomic_add_return_relaxed	atomic_add_return_relaxed
+#define atomic_sub_return_relaxed	atomic_sub_return_relaxed
+#define atomic_add_return		atomic_add_return
+#define atomic_sub_return		atomic_sub_return
 
-ATOMIC_OPS(sub, add, +, -i,      , _relaxed)
-ATOMIC_OPS(sub, add, +, -i, .aq  , _acquire)
-ATOMIC_OPS(sub, add, +, -i, .rl  , _release)
-ATOMIC_OPS(sub, add, +, -i, .aqrl,         )
+#define atomic_fetch_add_relaxed	atomic_fetch_add_relaxed
+#define atomic_fetch_sub_relaxed	atomic_fetch_sub_relaxed
+#define atomic_fetch_add		atomic_fetch_add
+#define atomic_fetch_sub		atomic_fetch_sub
+
+#ifndef CONFIG_GENERIC_ATOMIC64
+#define atomic64_add_return_relaxed	atomic64_add_return_relaxed
+#define atomic64_sub_return_relaxed	atomic64_sub_return_relaxed
+#define atomic64_add_return		atomic64_add_return
+#define atomic64_sub_return		atomic64_sub_return
+
+#define atomic64_fetch_add_relaxed	atomic64_fetch_add_relaxed
+#define atomic64_fetch_sub_relaxed	atomic64_fetch_sub_relaxed
+#define atomic64_fetch_add		atomic64_fetch_add
+#define atomic64_fetch_sub		atomic64_fetch_sub
+#endif
 
 #undef ATOMIC_OPS
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )
+#define ATOMIC_OPS(op, asm_op, I)					\
+        ATOMIC_FETCH_OP(op, asm_op, I, w,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, I, asm_or, c_or)				\
-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, w,  int,   )	\
-        ATOMIC_FETCH_OP(op, asm_op, I, asm_or, c_or, d, long, 64)
+#define ATOMIC_OPS(op, asm_op, I)					\
+        ATOMIC_FETCH_OP(op, asm_op, I, w,  int,   )			\
+        ATOMIC_FETCH_OP(op, asm_op, I, d, long, 64)
 #endif
 
-ATOMIC_OPS(and, and, i,      , _relaxed)
-ATOMIC_OPS(and, and, i, .aq  , _acquire)
-ATOMIC_OPS(and, and, i, .rl  , _release)
-ATOMIC_OPS(and, and, i, .aqrl,         )
+ATOMIC_OPS(and, and, i)
+ATOMIC_OPS( or,  or, i)
+ATOMIC_OPS(xor, xor, i)
 
-ATOMIC_OPS( or,  or, i,      , _relaxed)
-ATOMIC_OPS( or,  or, i, .aq  , _acquire)
-ATOMIC_OPS( or,  or, i, .rl  , _release)
-ATOMIC_OPS( or,  or, i, .aqrl,         )
+#define atomic_fetch_and_relaxed	atomic_fetch_and_relaxed
+#define atomic_fetch_or_relaxed		atomic_fetch_or_relaxed
+#define atomic_fetch_xor_relaxed	atomic_fetch_xor_relaxed
+#define atomic_fetch_and		atomic_fetch_and
+#define atomic_fetch_or			atomic_fetch_or
+#define atomic_fetch_xor		atomic_fetch_xor
 
-ATOMIC_OPS(xor, xor, i,      , _relaxed)
-ATOMIC_OPS(xor, xor, i, .aq  , _acquire)
-ATOMIC_OPS(xor, xor, i, .rl  , _release)
-ATOMIC_OPS(xor, xor, i, .aqrl,         )
+#ifndef CONFIG_GENERIC_ATOMIC64
+#define atomic64_fetch_and_relaxed	atomic64_fetch_and_relaxed
+#define atomic64_fetch_or_relaxed	atomic64_fetch_or_relaxed
+#define atomic64_fetch_xor_relaxed	atomic64_fetch_xor_relaxed
+#define atomic64_fetch_and		atomic64_fetch_and
+#define atomic64_fetch_or		atomic64_fetch_or
+#define atomic64_fetch_xor		atomic64_fetch_xor
+#endif
 
 #undef ATOMIC_OPS
 
@@ -157,22 +212,24 @@ ATOMIC_OPS(xor, xor, i, .aqrl,         )
 /*
  * The extra atomic operations that are constructed from one of the core
  * AMO-based operations above (aside from sub, which is easier to fit above).
- * These are required to perform a barrier, but they're OK this way because
- * atomic_*_return is also required to perform a barrier.
+ * These are required to perform a full barrier, but they're OK this way
+ * because atomic_*_return is also required to perform a full barrier.
+ *
  */
-#define ATOMIC_OP(op, func_op, comp_op, I, c_type, prefix)			\
-static __always_inline bool atomic##prefix##_##op(c_type i, atomic##prefix##_t *v) \
-{										\
-	return atomic##prefix##_##func_op##_return(i, v) comp_op I;		\
+#define ATOMIC_OP(op, func_op, comp_op, I, c_type, prefix)		\
+static __always_inline							\
+bool atomic##prefix##_##op(c_type i, atomic##prefix##_t *v)		\
+{									\
+	return atomic##prefix##_##func_op##_return(i, v) comp_op I;	\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, func_op, comp_op, I)			\
-        ATOMIC_OP (op, func_op, comp_op, I,  int,   )
+#define ATOMIC_OPS(op, func_op, comp_op, I)				\
+        ATOMIC_OP(op, func_op, comp_op, I,  int,   )
 #else
-#define ATOMIC_OPS(op, func_op, comp_op, I)			\
-        ATOMIC_OP (op, func_op, comp_op, I,  int,   )		\
-        ATOMIC_OP (op, func_op, comp_op, I, long, 64)
+#define ATOMIC_OPS(op, func_op, comp_op, I)				\
+        ATOMIC_OP(op, func_op, comp_op, I,  int,   )			\
+        ATOMIC_OP(op, func_op, comp_op, I, long, 64)
 #endif
 
 ATOMIC_OPS(add_and_test, add, ==, 0)
@@ -182,51 +239,87 @@ ATOMIC_OPS(add_negative, add,  <, 0)
 #undef ATOMIC_OP
 #undef ATOMIC_OPS
 
-#define ATOMIC_OP(op, func_op, I, c_type, prefix)				\
-static __always_inline void atomic##prefix##_##op(atomic##prefix##_t *v)	\
-{										\
-	atomic##prefix##_##func_op(I, v);					\
+#define ATOMIC_OP(op, func_op, I, c_type, prefix)			\
+static __always_inline							\
+void atomic##prefix##_##op(atomic##prefix##_t *v)			\
+{									\
+	atomic##prefix##_##func_op(I, v);				\
 }
 
-#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)					\
-static __always_inline c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v)	\
-{											\
-	return atomic##prefix##_fetch_##func_op(I, v);					\
+#define ATOMIC_FETCH_OP(op, func_op, I, c_type, prefix)			\
+static __always_inline							\
+c_type atomic##prefix##_fetch_##op##_relaxed(atomic##prefix##_t *v)	\
+{									\
+	return atomic##prefix##_fetch_##func_op##_relaxed(I, v);	\
+}									\
+static __always_inline							\
+c_type atomic##prefix##_fetch_##op(atomic##prefix##_t *v)		\
+{									\
+	return atomic##prefix##_fetch_##func_op(I, v);			\
 }
 
-#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, c_type, prefix)				\
-static __always_inline c_type atomic##prefix##_##op##_return(atomic##prefix##_t *v)	\
-{											\
-        return atomic##prefix##_fetch_##op(v) c_op I;					\
+#define ATOMIC_OP_RETURN(op, asm_op, c_op, I, c_type, prefix)		\
+static __always_inline							\
+c_type atomic##prefix##_##op##_return_relaxed(atomic##prefix##_t *v)	\
+{									\
+        return atomic##prefix##_fetch_##op##_relaxed(v) c_op I;		\
+}									\
+static __always_inline							\
+c_type atomic##prefix##_##op##_return(atomic##prefix##_t *v)		\
+{									\
+        return atomic##prefix##_fetch_##op(v) c_op I;			\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(op, asm_op, c_op, I)						\
-        ATOMIC_OP       (op, asm_op,       I,  int,   )				\
-        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )				\
+#define ATOMIC_OPS(op, asm_op, c_op, I)					\
+        ATOMIC_OP(       op, asm_op,       I,  int,   )			\
+        ATOMIC_FETCH_OP( op, asm_op,       I,  int,   )			\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )
 #else
-#define ATOMIC_OPS(op, asm_op, c_op, I)						\
-        ATOMIC_OP       (op, asm_op,       I,  int,   )				\
-        ATOMIC_FETCH_OP (op, asm_op,       I,  int,   )				\
-        ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )				\
-        ATOMIC_OP       (op, asm_op,       I, long, 64)				\
-        ATOMIC_FETCH_OP (op, asm_op,       I, long, 64)				\
+#define ATOMIC_OPS(op, asm_op, c_op, I)					\
+        ATOMIC_OP(       op, asm_op,       I,  int,   )			\
+        ATOMIC_FETCH_OP( op, asm_op,       I,  int,   )			\
+        ATOMIC_OP_RETURN(op, asm_op, c_op, I,  int,   )			\
+        ATOMIC_OP(       op, asm_op,       I, long, 64)			\
+        ATOMIC_FETCH_OP( op, asm_op,       I, long, 64)			\
         ATOMIC_OP_RETURN(op, asm_op, c_op, I, long, 64)
 #endif
 
 ATOMIC_OPS(inc, add, +,  1)
 ATOMIC_OPS(dec, add, +, -1)
 
+#define atomic_inc_return_relaxed	atomic_inc_return_relaxed
+#define atomic_dec_return_relaxed	atomic_dec_return_relaxed
+#define atomic_inc_return		atomic_inc_return
+#define atomic_dec_return		atomic_dec_return
+
+#define atomic_fetch_inc_relaxed	atomic_fetch_inc_relaxed
+#define atomic_fetch_dec_relaxed	atomic_fetch_dec_relaxed
+#define atomic_fetch_inc		atomic_fetch_inc
+#define atomic_fetch_dec		atomic_fetch_dec
+
+#ifndef CONFIG_GENERIC_ATOMIC64
+#define atomic64_inc_return_relaxed	atomic64_inc_return_relaxed
+#define atomic64_dec_return_relaxed	atomic64_dec_return_relaxed
+#define atomic64_inc_return		atomic64_inc_return
+#define atomic64_dec_return		atomic64_dec_return
+
+#define atomic64_fetch_inc_relaxed	atomic64_fetch_inc_relaxed
+#define atomic64_fetch_dec_relaxed	atomic64_fetch_dec_relaxed
+#define atomic64_fetch_inc		atomic64_fetch_inc
+#define atomic64_fetch_dec		atomic64_fetch_dec
+#endif
+
 #undef ATOMIC_OPS
 #undef ATOMIC_OP
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_OP_RETURN
 
-#define ATOMIC_OP(op, func_op, comp_op, I, prefix)				\
-static __always_inline bool atomic##prefix##_##op(atomic##prefix##_t *v)	\
-{										\
-	return atomic##prefix##_##func_op##_return(v) comp_op I;		\
+#define ATOMIC_OP(op, func_op, comp_op, I, prefix)			\
+static __always_inline							\
+bool atomic##prefix##_##op(atomic##prefix##_t *v)			\
+{									\
+	return atomic##prefix##_##func_op##_return(v) comp_op I;	\
 }
 
 ATOMIC_OP(inc_and_test, inc, ==, 0,   )
@@ -238,19 +331,19 @@ ATOMIC_OP(dec_and_test, dec, ==, 0, 64)
 
 #undef ATOMIC_OP
 
-/* This is required to provide a barrier on success. */
+/* This is required to provide a full barrier on success. */
 static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
        int prev, rc;
 
 	__asm__ __volatile__ (
-		"0:\n\t"
-		"lr.w.aqrl  %[p],  %[c]\n\t"
-		"beq        %[p],  %[u], 1f\n\t"
-		"add       %[rc],  %[p], %[a]\n\t"
-		"sc.w.aqrl %[rc], %[rc], %[c]\n\t"
-		"bnez      %[rc], 0b\n\t"
-		"1:"
+		"0:	lr.w     %[p],  %[c]\n"
+		"	beq      %[p],  %[u], 1f\n"
+		"	add      %[rc], %[p], %[a]\n"
+		"	sc.w.rl  %[rc], %[rc], %[c]\n"
+		"	bnez     %[rc], 0b\n"
+		"	fence    rw, rw\n"
+		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [a]"r" (a), [u]"r" (u)
 		: "memory");
@@ -263,13 +356,13 @@ static __always_inline long __atomic64_add_unless(atomic64_t *v, long a, long u)
        long prev, rc;
 
 	__asm__ __volatile__ (
-		"0:\n\t"
-		"lr.d.aqrl  %[p],  %[c]\n\t"
-		"beq        %[p],  %[u], 1f\n\t"
-		"add       %[rc],  %[p], %[a]\n\t"
-		"sc.d.aqrl %[rc], %[rc], %[c]\n\t"
-		"bnez      %[rc], 0b\n\t"
-		"1:"
+		"0:	lr.d     %[p],  %[c]\n"
+		"	beq      %[p],  %[u], 1f\n"
+		"	add      %[rc], %[p], %[a]\n"
+		"	sc.d.rl  %[rc], %[rc], %[c]\n"
+		"	bnez     %[rc], 0b\n"
+		"	fence    rw, rw\n"
+		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [a]"r" (a), [u]"r" (u)
 		: "memory");
@@ -300,37 +393,63 @@ static __always_inline long atomic64_inc_not_zero(atomic64_t *v)
 
 /*
  * atomic_{cmp,}xchg is required to have exactly the same ordering semantics as
- * {cmp,}xchg and the operations that return, so they need a barrier.
- */
-/*
- * FIXME: atomic_cmpxchg_{acquire,release,relaxed} are all implemented by
- * assigning the same barrier to both the LR and SC operations, but that might
- * not make any sense.  We're waiting on a memory model specification to
- * determine exactly what the right thing to do is here.
+ * {cmp,}xchg and the operations that return, so they need a full barrier.
  */
-#define ATOMIC_OP(c_t, prefix, c_or, size, asm_or)						\
-static __always_inline c_t atomic##prefix##_cmpxchg##c_or(atomic##prefix##_t *v, c_t o, c_t n) 	\
-{												\
-	return __cmpxchg(&(v->counter), o, n, size, asm_or, asm_or);				\
-}												\
-static __always_inline c_t atomic##prefix##_xchg##c_or(atomic##prefix##_t *v, c_t n) 		\
-{												\
-	return __xchg(n, &(v->counter), size, asm_or);						\
+#define ATOMIC_OP(c_t, prefix, size)					\
+static __always_inline							\
+c_t atomic##prefix##_xchg_relaxed(atomic##prefix##_t *v, c_t n)		\
+{									\
+	return __xchg_relaxed(&(v->counter), n, size);			\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_xchg_acquire(atomic##prefix##_t *v, c_t n)		\
+{									\
+	return __xchg_acquire(&(v->counter), n, size);			\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_xchg_release(atomic##prefix##_t *v, c_t n)		\
+{									\
+	return __xchg_release(&(v->counter), n, size);			\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_xchg(atomic##prefix##_t *v, c_t n)			\
+{									\
+	return __xchg(&(v->counter), n, size);				\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_cmpxchg_relaxed(atomic##prefix##_t *v,		\
+				     c_t o, c_t n)			\
+{									\
+	return __cmpxchg_relaxed(&(v->counter), o, n, size);		\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_cmpxchg_acquire(atomic##prefix##_t *v,		\
+				     c_t o, c_t n)			\
+{									\
+	return __cmpxchg_acquire(&(v->counter), o, n, size);		\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_cmpxchg_release(atomic##prefix##_t *v,		\
+				     c_t o, c_t n)			\
+{									\
+	return __cmpxchg_release(&(v->counter), o, n, size);		\
+}									\
+static __always_inline							\
+c_t atomic##prefix##_cmpxchg(atomic##prefix##_t *v, c_t o, c_t n)	\
+{									\
+	return __cmpxchg(&(v->counter), o, n, size);			\
 }
 
 #ifdef CONFIG_GENERIC_ATOMIC64
-#define ATOMIC_OPS(c_or, asm_or)			\
-	ATOMIC_OP( int,   , c_or, 4, asm_or)
+#define ATOMIC_OPS()							\
+	ATOMIC_OP( int,   , 4)
 #else
-#define ATOMIC_OPS(c_or, asm_or)			\
-	ATOMIC_OP( int,   , c_or, 4, asm_or)		\
-	ATOMIC_OP(long, 64, c_or, 8, asm_or)
+#define ATOMIC_OPS()							\
+	ATOMIC_OP( int,   , 4)						\
+	ATOMIC_OP(long, 64, 8)
 #endif
 
-ATOMIC_OPS(        , .aqrl)
-ATOMIC_OPS(_acquire,   .aq)
-ATOMIC_OPS(_release,   .rl)
-ATOMIC_OPS(_relaxed,      )
+ATOMIC_OPS()
 
 #undef ATOMIC_OPS
 #undef ATOMIC_OP
@@ -340,13 +459,13 @@ static __always_inline int atomic_sub_if_positive(atomic_t *v, int offset)
        int prev, rc;
 
 	__asm__ __volatile__ (
-		"0:\n\t"
-		"lr.w.aqrl  %[p],  %[c]\n\t"
-		"sub       %[rc],  %[p], %[o]\n\t"
-		"bltz      %[rc],    1f\n\t"
-		"sc.w.aqrl %[rc], %[rc], %[c]\n\t"
-		"bnez      %[rc],    0b\n\t"
-		"1:"
+		"0:	lr.w     %[p],  %[c]\n"
+		"	sub      %[rc], %[p], %[o]\n"
+		"	bltz     %[rc], 1f\n"
+		"	sc.w.rl  %[rc], %[rc], %[c]\n"
+		"	bnez     %[rc], 0b\n"
+		"	fence    rw, rw\n"
+		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [o]"r" (offset)
 		: "memory");
@@ -361,13 +480,13 @@ static __always_inline long atomic64_sub_if_positive(atomic64_t *v, int offset)
        long prev, rc;
 
 	__asm__ __volatile__ (
-		"0:\n\t"
-		"lr.d.aqrl  %[p],  %[c]\n\t"
-		"sub       %[rc],  %[p], %[o]\n\t"
-		"bltz      %[rc],    1f\n\t"
-		"sc.d.aqrl %[rc], %[rc], %[c]\n\t"
-		"bnez      %[rc],    0b\n\t"
-		"1:"
+		"0:	lr.d     %[p],  %[c]\n"
+		"	sub      %[rc], %[p], %[o]\n"
+		"	bltz     %[rc], 1f\n"
+		"	sc.d.rl  %[rc], %[rc], %[c]\n"
+		"	bnez     %[rc], 0b\n"
+		"	fence    rw, rw\n"
+		"1:\n"
 		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
 		: [o]"r" (offset)
 		: "memory");
diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
index db249dbc7b97..c12833f7b6bd 100644
--- a/arch/riscv/include/asm/cmpxchg.h
+++ b/arch/riscv/include/asm/cmpxchg.h
@@ -17,45 +17,153 @@
 #include <linux/bug.h>
 
 #include <asm/barrier.h>
+#include <asm/fence.h>
 
-#define __xchg(new, ptr, size, asm_or)				\
-({								\
-	__typeof__(ptr) __ptr = (ptr);				\
-	__typeof__(new) __new = (new);				\
-	__typeof__(*(ptr)) __ret;				\
-	switch (size) {						\
-	case 4:							\
-		__asm__ __volatile__ (				\
-			"amoswap.w" #asm_or " %0, %2, %1"	\
-			: "=r" (__ret), "+A" (*__ptr)		\
-			: "r" (__new)				\
-			: "memory");				\
-		break;						\
-	case 8:							\
-		__asm__ __volatile__ (				\
-			"amoswap.d" #asm_or " %0, %2, %1"	\
-			: "=r" (__ret), "+A" (*__ptr)		\
-			: "r" (__new)				\
-			: "memory");				\
-		break;						\
-	default:						\
-		BUILD_BUG();					\
-	}							\
-	__ret;							\
-})
-
-#define xchg(ptr, x)    (__xchg((x), (ptr), sizeof(*(ptr)), .aqrl))
-
-#define xchg32(ptr, x)				\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);	\
-	xchg((ptr), (x));			\
-})
-
-#define xchg64(ptr, x)				\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);	\
-	xchg((ptr), (x));			\
+#define __xchg_relaxed(ptr, new, size)					\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(new) __new = (new);					\
+	__typeof__(*(ptr)) __ret;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.w %0, %2, %1\n"			\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.d %0, %2, %1\n"			\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define xchg_relaxed(ptr, x)						\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg_relaxed((ptr),			\
+					    _x_, sizeof(*(ptr)));	\
+})
+
+#define __xchg_acquire(ptr, new, size)					\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(new) __new = (new);					\
+	__typeof__(*(ptr)) __ret;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.w %0, %2, %1\n"			\
+			RISCV_ACQUIRE_BARRIER				\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.d %0, %2, %1\n"			\
+			RISCV_ACQUIRE_BARRIER				\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define xchg_acquire(ptr, x)						\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg_acquire((ptr),			\
+					    _x_, sizeof(*(ptr)));	\
+})
+
+#define __xchg_release(ptr, new, size)					\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(new) __new = (new);					\
+	__typeof__(*(ptr)) __ret;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			RISCV_RELEASE_BARRIER				\
+			"	amoswap.w %0, %2, %1\n"			\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			RISCV_RELEASE_BARRIER				\
+			"	amoswap.d %0, %2, %1\n"			\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define xchg_release(ptr, x)						\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg_release((ptr),			\
+					    _x_, sizeof(*(ptr)));	\
+})
+
+#define __xchg(ptr, new, size)						\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(new) __new = (new);					\
+	__typeof__(*(ptr)) __ret;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.w.aqrl %0, %2, %1\n"		\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"	amoswap.d.aqrl %0, %2, %1\n"		\
+			: "=r" (__ret), "+A" (*__ptr)			\
+			: "r" (__new)					\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define xchg(ptr, x)							\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg((ptr), _x_, sizeof(*(ptr)));	\
+})
+
+#define xchg32(ptr, x)							\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
+	xchg((ptr), (x));						\
+})
+
+#define xchg64(ptr, x)							\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	xchg((ptr), (x));						\
 })
 
 /*
@@ -63,7 +171,51 @@
  * store NEW in MEM.  Return the initial value in MEM.  Success is
  * indicated by comparing RETURN with OLD.
  */
-#define __cmpxchg(ptr, old, new, size, lrb, scb)			\
+#define __cmpxchg_relaxed(ptr, old, new, size)				\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	__typeof__(*(ptr)) __ret;					\
+	register unsigned int __rc;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"0:	lr.w %0, %2\n"				\
+			"	bne  %0, %z3, 1f\n"			\
+			"	sc.w %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"0:	lr.d %0, %2\n"				\
+			"	bne %0, %z3, 1f\n"			\
+			"	sc.d %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define cmpxchg_relaxed(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),			\
+					_o_, _n_, sizeof(*(ptr)));	\
+})
+
+#define __cmpxchg_acquire(ptr, old, new, size)				\
 ({									\
 	__typeof__(ptr) __ptr = (ptr);					\
 	__typeof__(*(ptr)) __old = (old);				\
@@ -73,24 +225,24 @@
 	switch (size) {							\
 	case 4:								\
 		__asm__ __volatile__ (					\
-		"0:"							\
-			"lr.w" #scb " %0, %2\n"				\
-			"bne         %0, %z3, 1f\n"			\
-			"sc.w" #lrb " %1, %z4, %2\n"			\
-			"bnez        %1, 0b\n"				\
-		"1:"							\
+			"0:	lr.w %0, %2\n"				\
+			"	bne  %0, %z3, 1f\n"			\
+			"	sc.w %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			RISCV_ACQUIRE_BARRIER				\
+			"1:\n"						\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" (__old), "rJ" (__new)			\
 			: "memory");					\
 		break;							\
 	case 8:								\
 		__asm__ __volatile__ (					\
-		"0:"							\
-			"lr.d" #scb " %0, %2\n"				\
-			"bne         %0, %z3, 1f\n"			\
-			"sc.d" #lrb " %1, %z4, %2\n"			\
-			"bnez        %1, 0b\n"				\
-		"1:"							\
+			"0:	lr.d %0, %2\n"				\
+			"	bne %0, %z3, 1f\n"			\
+			"	sc.d %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			RISCV_ACQUIRE_BARRIER				\
+			"1:\n"						\
 			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
 			: "rJ" (__old), "rJ" (__new)			\
 			: "memory");					\
@@ -101,34 +253,131 @@
 	__ret;								\
 })
 
-#define cmpxchg(ptr, o, n) \
-	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr)), .aqrl, .aqrl))
+#define cmpxchg_acquire(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_acquire((ptr),			\
+					_o_, _n_, sizeof(*(ptr)));	\
+})
 
-#define cmpxchg_local(ptr, o, n) \
-	(__cmpxchg((ptr), (o), (n), sizeof(*(ptr)), , ))
+#define __cmpxchg_release(ptr, old, new, size)				\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	__typeof__(*(ptr)) __ret;					\
+	register unsigned int __rc;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			RISCV_RELEASE_BARRIER				\
+			"0:	lr.w %0, %2\n"				\
+			"	bne  %0, %z3, 1f\n"			\
+			"	sc.w %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			RISCV_RELEASE_BARRIER				\
+			"0:	lr.d %0, %2\n"				\
+			"	bne %0, %z3, 1f\n"			\
+			"	sc.d %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
+
+#define cmpxchg_release(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_release((ptr),			\
+					_o_, _n_, sizeof(*(ptr)));	\
+})
+
+#define __cmpxchg(ptr, old, new, size)					\
+({									\
+	__typeof__(ptr) __ptr = (ptr);					\
+	__typeof__(*(ptr)) __old = (old);				\
+	__typeof__(*(ptr)) __new = (new);				\
+	__typeof__(*(ptr)) __ret;					\
+	register unsigned int __rc;					\
+	switch (size) {							\
+	case 4:								\
+		__asm__ __volatile__ (					\
+			"0:	lr.w %0, %2\n"				\
+			"	bne  %0, %z3, 1f\n"			\
+			"	sc.w.rl %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"	fence rw, rw\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	case 8:								\
+		__asm__ __volatile__ (					\
+			"0:	lr.d %0, %2\n"				\
+			"	bne %0, %z3, 1f\n"			\
+			"	sc.d.rl %1, %z4, %2\n"			\
+			"	bnez %1, 0b\n"				\
+			"	fence rw, rw\n"				\
+			"1:\n"						\
+			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
+			: "rJ" (__old), "rJ" (__new)			\
+			: "memory");					\
+		break;							\
+	default:							\
+		BUILD_BUG();						\
+	}								\
+	__ret;								\
+})
 
-#define cmpxchg32(ptr, o, n)			\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);	\
-	cmpxchg((ptr), (o), (n));		\
+#define cmpxchg(ptr, o, n)						\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg((ptr),				\
+				       _o_, _n_, sizeof(*(ptr)));	\
 })
 
-#define cmpxchg32_local(ptr, o, n)		\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 4);	\
-	cmpxchg_local((ptr), (o), (n));		\
+#define cmpxchg_local(ptr, o, n)					\
+	(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
+
+#define cmpxchg32(ptr, o, n)						\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
+	cmpxchg((ptr), (o), (n));					\
 })
 
-#define cmpxchg64(ptr, o, n)			\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);	\
-	cmpxchg((ptr), (o), (n));		\
+#define cmpxchg32_local(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
+	cmpxchg_relaxed((ptr), (o), (n))				\
 })
 
-#define cmpxchg64_local(ptr, o, n)		\
-({						\
-	BUILD_BUG_ON(sizeof(*(ptr)) != 8);	\
-	cmpxchg_local((ptr), (o), (n));		\
+#define cmpxchg64(ptr, o, n)						\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg((ptr), (o), (n));					\
+})
+
+#define cmpxchg64_local(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg_relaxed((ptr), (o), (n));				\
 })
 
 #endif /* _ASM_RISCV_CMPXCHG_H */
-- 
cgit v1.2.3-59-g8ed1b


From ab1ef68e54019937cf859f2c86c9ead6f3e62f19 Mon Sep 17 00:00:00 2001
From: Zong Li <zong@andestech.com>
Date: Thu, 15 Mar 2018 16:50:41 +0800
Subject: RISC-V: Add sections of PLT and GOT for kernel module

The address of external symbols will locate more than 32-bit offset
in 64-bit kernel with sv39 or sv48 virtual addressing.

Module loader emits the GOT and PLT entries for data symbols and
function symbols respectively.

The PLT entry is a trampoline code for jumping to the 64-bit
real address. The GOT entry is just the data symbol address.

Signed-off-by: Zong Li <zong@andestech.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/Kconfig                  |   5 ++
 arch/riscv/Makefile                 |   5 ++
 arch/riscv/include/asm/module.h     | 103 ++++++++++++++++++++++++++
 arch/riscv/kernel/Makefile          |   1 +
 arch/riscv/kernel/module-sections.c | 139 ++++++++++++++++++++++++++++++++++++
 arch/riscv/kernel/module.lds        |   7 ++
 6 files changed, 260 insertions(+)
 create mode 100644 arch/riscv/include/asm/module.h
 create mode 100644 arch/riscv/kernel/module-sections.c
 create mode 100644 arch/riscv/kernel/module.lds

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 04807c7f64cc..90ff52059794 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -131,6 +131,10 @@ choice
 		bool "medium any code model"
 endchoice
 
+config MODULE_SECTIONS
+	bool
+	select HAVE_MOD_ARCH_SPECIFIC
+
 choice
 	prompt "Maximum Physical Memory"
 	default MAXPHYSMEM_2GB if 32BIT
@@ -141,6 +145,7 @@ choice
 		bool "2GiB"
 	config MAXPHYSMEM_128GB
 		depends on 64BIT && CMODEL_MEDANY
+		select MODULE_SECTIONS if MODULES
 		bool "128GiB"
 endchoice
 
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 6719dd30ec5b..c72d408c05c0 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -56,6 +56,11 @@ endif
 ifeq ($(CONFIG_CMODEL_MEDANY),y)
 	KBUILD_CFLAGS += -mcmodel=medany
 endif
+ifeq ($(CONFIG_MODULE_SECTIONS),y)
+	KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/riscv/kernel/module.lds
+endif
+
+KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-relax)
 
 # GCC versions that support the "-mstrict-align" option default to allowing
 # unaligned accesses.  While unaligned accesses are explicitly allowed in the
diff --git a/arch/riscv/include/asm/module.h b/arch/riscv/include/asm/module.h
new file mode 100644
index 000000000000..e61d73f82d4d
--- /dev/null
+++ b/arch/riscv/include/asm/module.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2017 Andes Technology Corporation */
+
+#ifndef _ASM_RISCV_MODULE_H
+#define _ASM_RISCV_MODULE_H
+
+#include <asm-generic/module.h>
+
+#define MODULE_ARCH_VERMAGIC    "riscv"
+
+u64 module_emit_got_entry(struct module *mod, u64 val);
+u64 module_emit_plt_entry(struct module *mod, u64 val);
+
+#ifdef CONFIG_MODULE_SECTIONS
+struct mod_section {
+	struct elf64_shdr *shdr;
+	int num_entries;
+	int max_entries;
+};
+
+struct mod_arch_specific {
+	struct mod_section got;
+	struct mod_section plt;
+};
+
+struct got_entry {
+	u64 symbol_addr;	/* the real variable address */
+};
+
+static inline struct got_entry emit_got_entry(u64 val)
+{
+	return (struct got_entry) {val};
+}
+
+static inline struct got_entry *get_got_entry(u64 val,
+					      const struct mod_section *sec)
+{
+	struct got_entry *got = (struct got_entry *)sec->shdr->sh_addr;
+	int i;
+	for (i = 0; i < sec->num_entries; i++) {
+		if (got[i].symbol_addr == val)
+			return &got[i];
+	}
+	return NULL;
+}
+
+struct plt_entry {
+	/*
+	 * Trampoline code to real target address. The return address
+	 * should be the original (pc+4) before entring plt entry.
+	 * For 8 byte alignment of symbol_addr,
+	 * don't pack structure to remove the padding.
+	 */
+	u32 insn_auipc;		/* auipc t0, 0x0                       */
+	u32 insn_ld;		/* ld    t1, 0x10(t0)                  */
+	u32 insn_jr;		/* jr    t1                            */
+	u64 symbol_addr;	/* the real jump target address        */
+};
+
+#define OPC_AUIPC  0x0017
+#define OPC_LD     0x3003
+#define OPC_JALR   0x0067
+#define REG_T0     0x5
+#define REG_T1     0x6
+#define IMM_OFFSET 0x10
+
+static inline struct plt_entry emit_plt_entry(u64 val)
+{
+	/*
+	 * U-Type encoding:
+	 * +------------+----------+----------+
+	 * | imm[31:12] | rd[11:7] | opc[6:0] |
+	 * +------------+----------+----------+
+	 *
+	 * I-Type encoding:
+	 * +------------+------------+--------+----------+----------+
+	 * | imm[31:20] | rs1[19:15] | funct3 | rd[11:7] | opc[6:0] |
+	 * +------------+------------+--------+----------+----------+
+	 *
+	 */
+	return (struct plt_entry) {
+		OPC_AUIPC | (REG_T0 << 7),
+		OPC_LD | (IMM_OFFSET << 20) | (REG_T0 << 15) | (REG_T1 << 7),
+		OPC_JALR | (REG_T1 << 15),
+		val
+	};
+}
+
+static inline struct plt_entry *get_plt_entry(u64 val,
+					      const struct mod_section *sec)
+{
+	struct plt_entry *plt = (struct plt_entry *)sec->shdr->sh_addr;
+	int i;
+	for (i = 0; i < sec->num_entries; i++) {
+		if (plt[i].symbol_addr == val)
+			return &plt[i];
+	}
+	return NULL;
+}
+
+#endif /* CONFIG_MODULE_SECTIONS */
+
+#endif /* _ASM_RISCV_MODULE_H */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 196f62ffc428..d355e3c18278 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -34,6 +34,7 @@ CFLAGS_setup.o := -mcmodel=medany
 obj-$(CONFIG_SMP)		+= smpboot.o
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_MODULES)		+= module.o
+obj-$(CONFIG_MODULE_SECTIONS)	+= module-sections.o
 obj-$(CONFIG_FUNCTION_TRACER)	+= mcount.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 
diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
new file mode 100644
index 000000000000..94ba1551eac3
--- /dev/null
+++ b/arch/riscv/kernel/module-sections.c
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Copyright (C) 2014-2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
+ *
+ * Copyright (C) 2018 Andes Technology Corporation <zong@andestech.com>
+ */
+
+#include <linux/elf.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+u64 module_emit_got_entry(struct module *mod, u64 val)
+{
+	struct mod_section *got_sec = &mod->arch.got;
+	int i = got_sec->num_entries;
+	struct got_entry *got = get_got_entry(val, got_sec);
+
+	if (got)
+		return (u64)got;
+
+	/* There is no duplicate entry, create a new one */
+	got = (struct got_entry *)got_sec->shdr->sh_addr;
+	got[i] = emit_got_entry(val);
+
+	got_sec->num_entries++;
+	BUG_ON(got_sec->num_entries > got_sec->max_entries);
+
+	return (u64)&got[i];
+}
+
+u64 module_emit_plt_entry(struct module *mod, u64 val)
+{
+	struct mod_section *plt_sec = &mod->arch.plt;
+	struct plt_entry *plt = get_plt_entry(val, plt_sec);
+	int i = plt_sec->num_entries;
+
+	if (plt)
+		return (u64)plt;
+
+	/* There is no duplicate entry, create a new one */
+	plt = (struct plt_entry *)plt_sec->shdr->sh_addr;
+	plt[i] = emit_plt_entry(val);
+
+	plt_sec->num_entries++;
+	BUG_ON(plt_sec->num_entries > plt_sec->max_entries);
+
+	return (u64)&plt[i];
+}
+
+static int is_rela_equal(const Elf64_Rela *x, const Elf64_Rela *y)
+{
+	return x->r_info == y->r_info && x->r_addend == y->r_addend;
+}
+
+static bool duplicate_rela(const Elf64_Rela *rela, int idx)
+{
+	int i;
+	for (i = 0; i < idx; i++) {
+		if (is_rela_equal(&rela[i], &rela[idx]))
+			return true;
+	}
+	return false;
+}
+
+static void count_max_entries(Elf64_Rela *relas, int num,
+			      unsigned int *plts, unsigned int *gots)
+{
+	unsigned int type, i;
+
+	for (i = 0; i < num; i++) {
+		type = ELF64_R_TYPE(relas[i].r_info);
+		if (type == R_RISCV_CALL_PLT) {
+			if (!duplicate_rela(relas, i))
+				(*plts)++;
+		} else if (type == R_RISCV_GOT_HI20) {
+			if (!duplicate_rela(relas, i))
+				(*gots)++;
+		}
+	}
+}
+
+int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+			      char *secstrings, struct module *mod)
+{
+	unsigned int num_plts = 0;
+	unsigned int num_gots = 0;
+	int i;
+
+	/*
+	 * Find the empty .got and .plt sections.
+	 */
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		if (!strcmp(secstrings + sechdrs[i].sh_name, ".plt"))
+			mod->arch.plt.shdr = sechdrs + i;
+		else if (!strcmp(secstrings + sechdrs[i].sh_name, ".got"))
+			mod->arch.got.shdr = sechdrs + i;
+	}
+
+	if (!mod->arch.plt.shdr) {
+		pr_err("%s: module PLT section(s) missing\n", mod->name);
+		return -ENOEXEC;
+	}
+	if (!mod->arch.got.shdr) {
+		pr_err("%s: module GOT section(s) missing\n", mod->name);
+		return -ENOEXEC;
+	}
+
+	/* Calculate the maxinum number of entries */
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		Elf64_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
+		int num_rela = sechdrs[i].sh_size / sizeof(Elf64_Rela);
+		Elf64_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
+
+		if (sechdrs[i].sh_type != SHT_RELA)
+			continue;
+
+		/* ignore relocations that operate on non-exec sections */
+		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
+			continue;
+
+		count_max_entries(relas, num_rela, &num_plts, &num_gots);
+	}
+
+	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
+	mod->arch.plt.shdr->sh_flags = SHF_EXECINSTR | SHF_ALLOC;
+	mod->arch.plt.shdr->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.plt.shdr->sh_size = (num_plts + 1) * sizeof(struct plt_entry);
+	mod->arch.plt.num_entries = 0;
+	mod->arch.plt.max_entries = num_plts;
+
+	mod->arch.got.shdr->sh_type = SHT_NOBITS;
+	mod->arch.got.shdr->sh_flags = SHF_ALLOC;
+	mod->arch.got.shdr->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.got.shdr->sh_size = (num_gots + 1) * sizeof(struct got_entry);
+	mod->arch.got.num_entries = 0;
+	mod->arch.got.max_entries = num_gots;
+
+	return 0;
+}
diff --git a/arch/riscv/kernel/module.lds b/arch/riscv/kernel/module.lds
new file mode 100644
index 000000000000..7ef580e62883
--- /dev/null
+++ b/arch/riscv/kernel/module.lds
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2017 Andes Technology Corporation */
+
+SECTIONS {
+	.plt (NOLOAD) : { BYTE(0) }
+	.got (NOLOAD) : { BYTE(0) }
+}
-- 
cgit v1.2.3-59-g8ed1b


From b8bde0ef12bd43f013d879973a1900930bfb95ee Mon Sep 17 00:00:00 2001
From: Zong Li <zong@andestech.com>
Date: Thu, 15 Mar 2018 16:50:42 +0800
Subject: RISC-V: Add section of GOT.PLT for kernel module

Separate the function symbol address from .plt to .got.plt section.

The original plt entry has trampoline code with symbol address,
there is a 32-bit padding bwtween jar instruction and symbol address.

Extract the symbol address to .got.plt to reduce the module size.

Signed-off-by: Zong Li <zong@andestech.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/asm/module.h     | 40 +++++++++++++++++++++++--------------
 arch/riscv/kernel/module-sections.c | 21 +++++++++++++++++--
 arch/riscv/kernel/module.lds        |  1 +
 3 files changed, 45 insertions(+), 17 deletions(-)

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/include/asm/module.h b/arch/riscv/include/asm/module.h
index e61d73f82d4d..349df33808c4 100644
--- a/arch/riscv/include/asm/module.h
+++ b/arch/riscv/include/asm/module.h
@@ -21,6 +21,7 @@ struct mod_section {
 struct mod_arch_specific {
 	struct mod_section got;
 	struct mod_section plt;
+	struct mod_section got_plt;
 };
 
 struct got_entry {
@@ -48,13 +49,10 @@ struct plt_entry {
 	/*
 	 * Trampoline code to real target address. The return address
 	 * should be the original (pc+4) before entring plt entry.
-	 * For 8 byte alignment of symbol_addr,
-	 * don't pack structure to remove the padding.
 	 */
 	u32 insn_auipc;		/* auipc t0, 0x0                       */
 	u32 insn_ld;		/* ld    t1, 0x10(t0)                  */
 	u32 insn_jr;		/* jr    t1                            */
-	u64 symbol_addr;	/* the real jump target address        */
 };
 
 #define OPC_AUIPC  0x0017
@@ -62,9 +60,8 @@ struct plt_entry {
 #define OPC_JALR   0x0067
 #define REG_T0     0x5
 #define REG_T1     0x6
-#define IMM_OFFSET 0x10
 
-static inline struct plt_entry emit_plt_entry(u64 val)
+static inline struct plt_entry emit_plt_entry(u64 val, u64 plt, u64 got_plt)
 {
 	/*
 	 * U-Type encoding:
@@ -78,24 +75,37 @@ static inline struct plt_entry emit_plt_entry(u64 val)
 	 * +------------+------------+--------+----------+----------+
 	 *
 	 */
+	u64 offset = got_plt - plt;
+	u32 hi20 = (offset + 0x800) & 0xfffff000;
+	u32 lo12 = (offset - hi20);
 	return (struct plt_entry) {
-		OPC_AUIPC | (REG_T0 << 7),
-		OPC_LD | (IMM_OFFSET << 20) | (REG_T0 << 15) | (REG_T1 << 7),
-		OPC_JALR | (REG_T1 << 15),
-		val
+		OPC_AUIPC | (REG_T0 << 7) | hi20,
+		OPC_LD | (lo12 << 20) | (REG_T0 << 15) | (REG_T1 << 7),
+		OPC_JALR | (REG_T1 << 15)
 	};
 }
 
-static inline struct plt_entry *get_plt_entry(u64 val,
-					      const struct mod_section *sec)
+static inline int get_got_plt_idx(u64 val, const struct mod_section *sec)
 {
-	struct plt_entry *plt = (struct plt_entry *)sec->shdr->sh_addr;
+	struct got_entry *got_plt = (struct got_entry *)sec->shdr->sh_addr;
 	int i;
 	for (i = 0; i < sec->num_entries; i++) {
-		if (plt[i].symbol_addr == val)
-			return &plt[i];
+		if (got_plt[i].symbol_addr == val)
+			return i;
 	}
-	return NULL;
+	return -1;
+}
+
+static inline struct plt_entry *get_plt_entry(u64 val,
+				      const struct mod_section *sec_plt,
+				      const struct mod_section *sec_got_plt)
+{
+	struct plt_entry *plt = (struct plt_entry *)sec_plt->shdr->sh_addr;
+	int got_plt_idx = get_got_plt_idx(val, sec_got_plt);
+	if (got_plt_idx >= 0)
+		return plt + got_plt_idx;
+	else
+		return NULL;
 }
 
 #endif /* CONFIG_MODULE_SECTIONS */
diff --git a/arch/riscv/kernel/module-sections.c b/arch/riscv/kernel/module-sections.c
index 94ba1551eac3..bbbd26e19bfd 100644
--- a/arch/riscv/kernel/module-sections.c
+++ b/arch/riscv/kernel/module-sections.c
@@ -30,18 +30,23 @@ u64 module_emit_got_entry(struct module *mod, u64 val)
 
 u64 module_emit_plt_entry(struct module *mod, u64 val)
 {
+	struct mod_section *got_plt_sec = &mod->arch.got_plt;
+	struct got_entry *got_plt;
 	struct mod_section *plt_sec = &mod->arch.plt;
-	struct plt_entry *plt = get_plt_entry(val, plt_sec);
+	struct plt_entry *plt = get_plt_entry(val, plt_sec, got_plt_sec);
 	int i = plt_sec->num_entries;
 
 	if (plt)
 		return (u64)plt;
 
 	/* There is no duplicate entry, create a new one */
+	got_plt = (struct got_entry *)got_plt_sec->shdr->sh_addr;
+	got_plt[i] = emit_got_entry(val);
 	plt = (struct plt_entry *)plt_sec->shdr->sh_addr;
-	plt[i] = emit_plt_entry(val);
+	plt[i] = emit_plt_entry(val, (u64)&plt[i], (u64)&got_plt[i]);
 
 	plt_sec->num_entries++;
+	got_plt_sec->num_entries++;
 	BUG_ON(plt_sec->num_entries > plt_sec->max_entries);
 
 	return (u64)&plt[i];
@@ -94,6 +99,8 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			mod->arch.plt.shdr = sechdrs + i;
 		else if (!strcmp(secstrings + sechdrs[i].sh_name, ".got"))
 			mod->arch.got.shdr = sechdrs + i;
+		else if (!strcmp(secstrings + sechdrs[i].sh_name, ".got.plt"))
+			mod->arch.got_plt.shdr = sechdrs + i;
 	}
 
 	if (!mod->arch.plt.shdr) {
@@ -104,6 +111,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 		pr_err("%s: module GOT section(s) missing\n", mod->name);
 		return -ENOEXEC;
 	}
+	if (!mod->arch.got_plt.shdr) {
+		pr_err("%s: module GOT.PLT section(s) missing\n", mod->name);
+		return -ENOEXEC;
+	}
 
 	/* Calculate the maxinum number of entries */
 	for (i = 0; i < ehdr->e_shnum; i++) {
@@ -135,5 +146,11 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 	mod->arch.got.num_entries = 0;
 	mod->arch.got.max_entries = num_gots;
 
+	mod->arch.got_plt.shdr->sh_type = SHT_NOBITS;
+	mod->arch.got_plt.shdr->sh_flags = SHF_ALLOC;
+	mod->arch.got_plt.shdr->sh_addralign = L1_CACHE_BYTES;
+	mod->arch.got_plt.shdr->sh_size = (num_plts + 1) * sizeof(struct got_entry);
+	mod->arch.got_plt.num_entries = 0;
+	mod->arch.got_plt.max_entries = num_plts;
 	return 0;
 }
diff --git a/arch/riscv/kernel/module.lds b/arch/riscv/kernel/module.lds
index 7ef580e62883..295ecfb341a2 100644
--- a/arch/riscv/kernel/module.lds
+++ b/arch/riscv/kernel/module.lds
@@ -4,4 +4,5 @@
 SECTIONS {
 	.plt (NOLOAD) : { BYTE(0) }
 	.got (NOLOAD) : { BYTE(0) }
+	.got.plt (NOLOAD) : { BYTE(0) }
 }
-- 
cgit v1.2.3-59-g8ed1b


From e21d54219c7a698b10d5f1e6a1023ebde284cd7b Mon Sep 17 00:00:00 2001
From: Zong Li <zong@andestech.com>
Date: Thu, 15 Mar 2018 16:50:51 +0800
Subject: RISC-V: Add definition of relocation types

Signed-off-by: Zong Li <zong@andestech.com>
Signed-off-by: Palmer Dabbelt <palmer@sifive.com>
---
 arch/riscv/include/uapi/asm/elf.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/riscv/include')

diff --git a/arch/riscv/include/uapi/asm/elf.h b/arch/riscv/include/uapi/asm/elf.h
index a510edfa8226..5cae4c30cd8e 100644
--- a/arch/riscv/include/uapi/asm/elf.h
+++ b/arch/riscv/include/uapi/asm/elf.h
@@ -79,5 +79,12 @@ typedef union __riscv_fp_state elf_fpregset_t;
 #define R_RISCV_TPREL_I		49
 #define R_RISCV_TPREL_S		50
 #define R_RISCV_RELAX		51
+#define R_RISCV_SUB6		52
+#define R_RISCV_SET6		53
+#define R_RISCV_SET8		54
+#define R_RISCV_SET16		55
+#define R_RISCV_SET32		56
+#define R_RISCV_32_PCREL	57
+
 
 #endif /* _UAPI_ASM_ELF_H */
-- 
cgit v1.2.3-59-g8ed1b