-rw-r--r--  Documentation/features/sched/membarrier-sync-core/arch-support.txt  18
-rw-r--r--  Documentation/scheduler/index.rst                                     1
-rw-r--r--  Documentation/scheduler/membarrier.rst                               39
-rw-r--r--  MAINTAINERS                                                           4
-rw-r--r--  arch/riscv/Kconfig                                                    4
-rw-r--r--  arch/riscv/include/asm/membarrier.h                                  50
-rw-r--r--  arch/riscv/include/asm/sync_core.h                                   29
-rw-r--r--  arch/riscv/mm/context.c                                               2
-rw-r--r--  include/linux/sync_core.h                                            16
-rw-r--r--  init/Kconfig                                                          3
-rw-r--r--  kernel/sched/core.c                                                  16
-rw-r--r--  kernel/sched/membarrier.c                                            13
12 files changed, 185 insertions(+), 10 deletions(-)
diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index d96b778b87ed..7425d2b994a3 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -10,6 +10,22 @@
# Rely on implicit context synchronization as a result of exception return
# when returning from IPI handler, and when returning to user-space.
#
+# * riscv
+#
+# riscv uses xRET to return from interrupt and to return to user-space.
+#
+# Given that xRET is not core serializing, we rely on FENCE.I for providing
+# core serialization:
+#
+# - by calling sync_core_before_usermode() on return from interrupt (cf.
+# ipi_sync_core()),
+#
+# - via switch_mm() and sync_core_before_usermode() (respectively, for
+# uthread->uthread and kthread->uthread transitions) before returning
+# to user-space.
+#
+# The serialization in switch_mm() is activated by prepare_sync_core_cmd().
+#
# * x86
#
# x86-32 uses IRET as return from interrupt, which takes care of the IPI.
@@ -43,7 +59,7 @@
| openrisc: | TODO |
| parisc: | TODO |
| powerpc: | ok |
- | riscv: | TODO |
+ | riscv: | ok |
| s390: | ok |
| sh: | TODO |
| sparc: | TODO |
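
For context, not part of this patch: the SYNC_CORE guarantee exists for user-space code patching, e.g. JITs. After rewriting code, a thread must ensure every other thread of the process executes a core serializing instruction before it can branch into the new code. A minimal user-space sketch, assuming a kernel with this series applied; the mapping size and the elided code emission are illustrative only:

#include <linux/membarrier.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* Register once per process, before first use of SYNC_CORE. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
		return 1;

	void *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE | PROT_EXEC,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* ... emit machine code into buf ... */

	/* Publish the new instructions on the writing side. */
	__builtin___clear_cache(buf, (char *)buf + 4096);

	/*
	 * Guarantee that every thread of this process executes a core
	 * serializing instruction (FENCE.I on riscv) before running
	 * user code again: by IPI for currently running threads,
	 * deferred to switch_mm()/sync_core_before_usermode() for the
	 * others, as described above.
	 */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
		return 1;

	/* Any thread may now safely branch into buf. */
	return 0;
}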
diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst
index 3170747226f6..43bd8a145b7a 100644
--- a/Documentation/scheduler/index.rst
+++ b/Documentation/scheduler/index.rst
@@ -7,6 +7,7 @@ Scheduler
completion
+ membarrier
sched-arch
sched-bwc
sched-deadline
diff --git a/Documentation/scheduler/membarrier.rst b/Documentation/scheduler/membarrier.rst
new file mode 100644
index 000000000000..2387804b1c63
--- /dev/null
+++ b/Documentation/scheduler/membarrier.rst
@@ -0,0 +1,39 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+membarrier() System Call
+========================
+
+MEMBARRIER_CMD_{PRIVATE,GLOBAL}_EXPEDITED - Architecture requirements
+=====================================================================
+
+Memory barriers before updating rq->curr
+----------------------------------------
+
+The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED
+require each architecture to have a full memory barrier after coming from
+user-space, before updating rq->curr. This barrier is implied by the sequence
+rq_lock(); smp_mb__after_spinlock() in __schedule(). The barrier matches a full
+barrier in the proximity of the membarrier system call exit, cf.
+membarrier_{private,global}_expedited().
+
+Memory barriers after updating rq->curr
+---------------------------------------
+
+The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED
+require each architecture to have a full memory barrier after updating rq->curr,
+before returning to user-space. The schemes providing this barrier on the various
+architectures are as follows.
+
+ - alpha, arc, arm, hexagon, mips rely on the full barrier implied by
+ spin_unlock() in finish_lock_switch().
+
+ - arm64 relies on the full barrier implied by switch_to().
+
+ - powerpc, riscv, s390, sparc, x86 rely on the full barrier implied by
+ switch_mm(), if mm is not NULL; they rely on the full barrier implied
+ by mmdrop(), otherwise. On powerpc and riscv, switch_mm() relies on
+ membarrier_arch_switch_mm().
+
+The barrier matches a full barrier in the proximity of the membarrier system call
+entry, cf. membarrier_{private,global}_expedited().
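+
For illustration, not part of this patch: the barrier pairing documented above is what makes the expedited commands usable as the slow side of an asymmetric fence, where hot-path threads rely on ordinary program order plus the scheduler-side barriers. A minimal sketch using only documented UAPI commands:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, unsigned int flags, int cpu_id)
{
	return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
	/* QUERY returns a bitmask of the commands this kernel supports. */
	int supported = membarrier(MEMBARRIER_CMD_QUERY, 0, 0);

	if (supported < 0 || !(supported & MEMBARRIER_CMD_PRIVATE_EXPEDITED))
		return 1;

	/* Register once, before the first expedited barrier. */
	membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0);

	/*
	 * Acts as a full memory barrier on every thread of this process:
	 * threads running in user-space get an IPI, the rest are ordered
	 * by the scheduler-side barriers documented above.
	 */
	membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0);
	return 0;
}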
diff --git a/MAINTAINERS b/MAINTAINERS
index 8d1052fa6a69..cc80968ec355 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14039,7 +14039,9 @@ M: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
M: "Paul E. McKenney" <paulmck@kernel.org>
L: linux-kernel@vger.kernel.org
S: Supported
-F: arch/powerpc/include/asm/membarrier.h
+F: Documentation/scheduler/membarrier.rst
+F: arch/*/include/asm/membarrier.h
+F: arch/*/include/asm/sync_core.h
F: include/uapi/linux/membarrier.h
F: kernel/sched/membarrier.c
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index b49016bb5077..85c899d0133a 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -27,14 +27,18 @@ config RISCV
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+ select ARCH_HAS_MEMBARRIER_CALLBACKS
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MMIOWB
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API
+ select ARCH_HAS_PREPARE_SYNC_CORE_CMD
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_DIRECT_MAP if MMU
select ARCH_HAS_SET_MEMORY if MMU
select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
+ select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
select ARCH_HAS_SYSCALL_WRAPPER
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/riscv/include/asm/membarrier.h b/arch/riscv/include/asm/membarrier.h
new file mode 100644
index 000000000000..47b240d0d596
--- /dev/null
+++ b/arch/riscv/include/asm/membarrier.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_RISCV_MEMBARRIER_H
+#define _ASM_RISCV_MEMBARRIER_H
+
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ /*
+ * Only need the full barrier when switching between processes.
+ * Barrier when switching from kernel to userspace is not
+ * required here, given that it is implied by mmdrop(). Barrier
+ * when switching from userspace to kernel is not needed after
+ * store to rq->curr.
+ */
+ if (IS_ENABLED(CONFIG_SMP) &&
+ likely(!(atomic_read(&next->membarrier_state) &
+ (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+ MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
+ return;
+
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ *
+ * This barrier is also needed for the SYNC_CORE command when
+ * switching between processes; in particular, on a transition
+ * from a thread belonging to another mm to a thread belonging
+ * to the mm for which a membarrier SYNC_CORE is done on CPU0:
+ *
+ * - [CPU0] sets all bits in the mm icache_stale_mask (in
+ * prepare_sync_core_cmd());
+ *
+ * - [CPU1] stores to rq->curr (by the scheduler);
+ *
+ * - [CPU0] loads rq->curr within membarrier and observes
+ * cpu_rq(1)->curr->mm != mm, so the IPI is skipped on
+ * CPU1; this means membarrier relies on switch_mm() to
+ * issue the sync-core;
+ *
+ * - [CPU1] switch_mm() loads icache_stale_mask; if the bit
+ * is zero, switch_mm() may incorrectly skip the sync-core.
+ *
+ * Matches a full barrier in the proximity of the membarrier
+ * system call entry.
+ */
+ smp_mb();
+}
+
+#endif /* _ASM_RISCV_MEMBARRIER_H */
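
The race spelled out in the comment above is the classic store-buffering pattern: each side must order its own store before its own load, or both the IPI and the deferred FENCE.I can be skipped. A user-space analogue with C11 atomics, where x stands in for icache_stale_mask and y for rq->curr (names and setup are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int x, y;		/* both initially 0 */
static int r0, r1;

/* CPU0 side (membarrier): store to x, full fence, load y. */
static void *side0(void *arg)
{
	atomic_store_explicit(&x, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() at syscall entry */
	r0 = atomic_load_explicit(&y, memory_order_relaxed);
	return NULL;
}

/* CPU1 side (scheduler): store to y, full fence, load x. */
static void *side1(void *arg)
{
	atomic_store_explicit(&y, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() in membarrier_arch_switch_mm() */
	r1 = atomic_load_explicit(&x, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, side0, NULL);
	pthread_create(&t1, NULL, side1, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	/*
	 * With both fences, r0 == 0 && r1 == 0 is forbidden; remove
	 * either fence and that outcome (IPI skipped AND sync-core
	 * skipped) becomes possible.
	 */
	printf("r0=%d r1=%d\n", r0, r1);
	return 0;
}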
diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h
new file mode 100644
index 000000000000..9153016da8f1
--- /dev/null
+++ b/arch/riscv/include/asm/sync_core.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_SYNC_CORE_H
+#define _ASM_RISCV_SYNC_CORE_H
+
+/*
+ * RISC-V implements return to user-space through an xRET instruction,
+ * which is not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+ asm volatile ("fence.i" ::: "memory");
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Ensure the next switch_mm() on every CPU issues a core serializing
+ * instruction for the given @mm.
+ */
+static inline void prepare_sync_core_cmd(struct mm_struct *mm)
+{
+ cpumask_setall(&mm->context.icache_stale_mask);
+}
+#else
+static inline void prepare_sync_core_cmd(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SMP */
+
+#endif /* _ASM_RISCV_SYNC_CORE_H */
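
The mask set by prepare_sync_core_cmd() is consumed on the next switch_mm() for that mm. A simplified, hypothetical sketch of the consumer side, modeled on the pre-existing flush_icache_deferred() in arch/riscv/mm/context.c (not changed by this patch; details abridged):

static void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu)
{
#ifdef CONFIG_SMP
	if (cpumask_test_and_clear_cpu(cpu, &mm->context.icache_stale_mask)) {
		/*
		 * Ensure the remote hart's stores (the new user-space
		 * instructions) are visible before flushing: pairs with
		 * the full barrier on the membarrier side.
		 */
		smp_mb();
		/* FENCE.I: the deferred core serializing instruction. */
		local_flush_icache_all();
	}
#endif
}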
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index 217fd4de6134..ba8eb3944687 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -323,6 +323,8 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
if (unlikely(prev == next))
return;
+ membarrier_arch_switch_mm(prev, next, task);
+
/*
* Mark the current MM context as inactive, and the next as
* active. This is at least used by the icache flushing
diff --git a/include/linux/sync_core.h b/include/linux/sync_core.h
index 013da4b8b327..67bb9794b875 100644
--- a/include/linux/sync_core.h
+++ b/include/linux/sync_core.h
@@ -17,5 +17,19 @@ static inline void sync_core_before_usermode(void)
}
#endif
-#endif /* _LINUX_SYNC_CORE_H */
+#ifdef CONFIG_ARCH_HAS_PREPARE_SYNC_CORE_CMD
+#include <asm/sync_core.h>
+#else
+/*
+ * This is a dummy prepare_sync_core_cmd() implementation that can be used on
+ * all architectures which provide unconditional core serializing instructions
+ * in switch_mm().
+ * If your architecture doesn't provide such core serializing instructions in
+ * switch_mm(), you may need to write your own functions.
+ */
+static inline void prepare_sync_core_cmd(struct mm_struct *mm)
+{
+}
+#endif
+#endif /* _LINUX_SYNC_CORE_H */
diff --git a/init/Kconfig b/init/Kconfig
index 8df18f3a9748..c3994b92333d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1970,6 +1970,9 @@ source "kernel/Kconfig.locks"
config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
bool
+config ARCH_HAS_PREPARE_SYNC_CORE_CMD
+ bool
+
config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
bool
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..e4a87bcf28d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6638,7 +6638,9 @@ static void __sched notrace __schedule(unsigned int sched_mode)
* if (signal_pending_state()) if (p->state & @state)
*
* Also, the membarrier system call requires a full memory barrier
- * after coming from user-space, before storing to rq->curr.
+ * after coming from user-space, before storing to rq->curr; this
+ * barrier matches a full barrier in the proximity of the membarrier
+ * system call exit.
*/
rq_lock(rq, &rf);
smp_mb__after_spinlock();
@@ -6709,12 +6711,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
*
* Here are the schemes providing that barrier on the
* various architectures:
- * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
- * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
+ * RISC-V. switch_mm() relies on membarrier_arch_switch_mm()
+ * on PowerPC and on RISC-V.
* - finish_lock_switch() for weakly-ordered
* architectures where spin_unlock is a full barrier,
* - switch_to() for arm64 (weakly-ordered, spin_unlock
* is a RELEASE barrier),
+ *
+ * The barrier matches a full barrier in the proximity of
+ * the membarrier system call entry.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
*/
++*switch_count;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 2ad881d07752..703e8d80a576 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -251,7 +251,7 @@ static int membarrier_global_expedited(void)
return 0;
/*
- * Matches memory barriers around rq->curr modification in
+ * Matches memory barriers after rq->curr modification in
* scheduler.
*/
smp_mb(); /* system call entry is not a mb. */
@@ -300,7 +300,7 @@ static int membarrier_global_expedited(void)
/*
* Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers around
+ * waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */
@@ -320,6 +320,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
ipi_func = ipi_sync_core;
+ prepare_sync_core_cmd(mm);
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
if (!IS_ENABLED(CONFIG_RSEQ))
return -EINVAL;
@@ -339,8 +340,12 @@ static int membarrier_private_expedited(int flags, int cpu_id)
return 0;
/*
- * Matches memory barriers around rq->curr modification in
+ * Matches memory barriers after rq->curr modification in
* scheduler.
+ *
+ * On RISC-V, this barrier pairing is also needed for the
+ * SYNC_CORE command when switching between processes, cf.
+ * the inline comments in membarrier_arch_switch_mm().
*/
smp_mb(); /* system call entry is not a mb. */
@@ -415,7 +420,7 @@ out:
/*
* Memory barrier on the caller thread _after_ we finished
- * waiting for the last IPI. Matches memory barriers around
+ * waiting for the last IPI. Matches memory barriers before
* rq->curr modification in scheduler.
*/
smp_mb(); /* exit from system call is not a mb */