diff options
Diffstat (limited to 'tools')
107 files changed, 3647 insertions, 331 deletions
diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 568854b14d0a..52c6262e6bfd 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -201,6 +201,21 @@ static inline int insn_offset_immediate(struct insn *insn) return insn_offset_displacement(insn) + insn->displacement.nbytes; } +/** + * for_each_insn_prefix() -- Iterate prefixes in the instruction + * @insn: Pointer to struct insn. + * @idx: Index storage. + * @prefix: Prefix byte. + * + * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix + * and the index is stored in @idx (note that this @idx is just for a cursor, + * do not change it.) + * Since prefixes.nbytes can be bigger than 4 if some prefixes + * are repeated, it cannot be used for looping over the prefixes. + */ +#define for_each_insn_prefix(insn, idx, prefix) \ + for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) + #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c index 8ab142ff5eac..2afb7d5b1aca 100644 --- a/tools/bpf/bpftool/btf.c +++ b/tools/bpf/bpftool/btf.c @@ -693,6 +693,7 @@ build_btf_type_table(struct btf_attach_table *tab, enum bpf_obj_type type, obj_node = calloc(1, sizeof(*obj_node)); if (!obj_node) { p_err("failed to allocate memory: %s", strerror(errno)); + err = -ENOMEM; goto err_free; } diff --git a/tools/bpf/bpftool/pids.c b/tools/bpf/bpftool/pids.c index df7d8ec76036..477e55d59c34 100644 --- a/tools/bpf/bpftool/pids.c +++ b/tools/bpf/bpftool/pids.c @@ -89,9 +89,9 @@ libbpf_print_none(__maybe_unused enum libbpf_print_level level, int build_obj_refs_table(struct obj_refs_table *table, enum bpf_obj_type type) { - char buf[4096]; - struct pid_iter_bpf *skel; struct pid_iter_entry *e; + char buf[4096 / sizeof(*e) * sizeof(*e)]; + struct pid_iter_bpf *skel; int err, ret, fd = -1, i; libbpf_print_fn_t default_print; diff --git a/tools/debugging/kernel-chktaint b/tools/debugging/kernel-chktaint index 2240cb56e6e5..607b2b280945 100755 --- a/tools/debugging/kernel-chktaint +++ b/tools/debugging/kernel-chktaint @@ -72,7 +72,7 @@ if [ `expr $T % 2` -eq 0 ]; then addout " " else addout "S" - echo " * SMP kernel oops on an officially SMP incapable processor (#2)" + echo " * kernel running on an out of specification system (#2)" fi T=`expr $T / 2` diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h index 2551e9b71167..e61d36cd4e50 100644 --- a/tools/include/nolibc/nolibc.h +++ b/tools/include/nolibc/nolibc.h @@ -107,7 +107,7 @@ static int errno; #endif /* errno codes all ensure that they will not conflict with a valid pointer - * because they all correspond to the highest addressable memry page. + * because they all correspond to the highest addressable memory page. */ #define MAX_ERRNO 4095 @@ -231,7 +231,7 @@ struct rusage { #define DT_SOCK 12 /* all the *at functions */ -#ifndef AT_FDWCD +#ifndef AT_FDCWD #define AT_FDCWD -100 #endif diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e6ceac3f7d62..556216dc9703 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3897,8 +3897,8 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ - FN(bpf_per_cpu_ptr), \ - FN(bpf_this_cpu_ptr), \ + FN(per_cpu_ptr), \ + FN(this_cpu_ptr), \ FN(redirect_peer), \ /* */ diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index d199a3694be8..b0bf56c5f120 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -742,7 +742,11 @@ class DebugfsProvider(Provider): The fields are all available KVM debugfs files """ - return self.walkdir(PATH_DEBUGFS_KVM)[2] + exempt_list = ['halt_poll_fail_ns', 'halt_poll_success_ns'] + fields = [field for field in self.walkdir(PATH_DEBUGFS_KVM)[2] + if field not in exempt_list] + + return fields def update_fields(self, fields_filter): """Refresh fields, applying fields_filter""" diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 5c6522c89af1..98537ff2679e 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -278,7 +278,7 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) err = ringbuf_process_ring(ring); if (err < 0) return err; - res += cnt; + res += err; } return cnt < 0 ? -errno : res; } diff --git a/tools/memory-model/Documentation/README b/tools/memory-model/Documentation/README new file mode 100644 index 000000000000..db90a26dbdf4 --- /dev/null +++ b/tools/memory-model/Documentation/README @@ -0,0 +1,76 @@ +It has been said that successful communication requires first identifying +what your audience knows and then building a bridge from their current +knowledge to what they need to know. Unfortunately, the expected +Linux-kernel memory model (LKMM) audience might be anywhere from novice +to expert both in kernel hacking and in understanding LKMM. + +This document therefore points out a number of places to start reading, +depending on what you know and what you would like to learn. Please note +that the documents later in this list assume that the reader understands +the material provided by documents earlier in this list. + +o You are new to Linux-kernel concurrency: simple.txt + +o You have some background in Linux-kernel concurrency, and would + like an overview of the types of low-level concurrency primitives + that the Linux kernel provides: ordering.txt + + Here, "low level" means atomic operations to single variables. + +o You are familiar with the Linux-kernel concurrency primitives + that you need, and just want to get started with LKMM litmus + tests: litmus-tests.txt + +o You are familiar with Linux-kernel concurrency, and would + like a detailed intuitive understanding of LKMM, including + situations involving more than two threads: recipes.txt + +o You would like a detailed understanding of what your compiler can + and cannot do to control dependencies: control-dependencies.txt + +o You are familiar with Linux-kernel concurrency and the use of + LKMM, and would like a quick reference: cheatsheet.txt + +o You are familiar with Linux-kernel concurrency and the use + of LKMM, and would like to learn about LKMM's requirements, + rationale, and implementation: explanation.txt + +o You are interested in the publications related to LKMM, including + hardware manuals, academic literature, standards-committee + working papers, and LWN articles: references.txt + + +==================== +DESCRIPTION OF FILES +==================== + +README + This file. + +cheatsheet.txt + Quick-reference guide to the Linux-kernel memory model. + +control-dependencies.txt + Guide to preventing compiler optimizations from destroying + your control dependencies. + +explanation.txt + Detailed description of the memory model. + +litmus-tests.txt + The format, features, capabilities, and limitations of the litmus + tests that LKMM can evaluate. + +ordering.txt + Overview of the Linux kernel's low-level memory-ordering + primitives by category. + +recipes.txt + Common memory-ordering patterns. + +references.txt + Background information. + +simple.txt + Starting point for someone new to Linux-kernel concurrency. + And also a reminder of the simpler approaches to concurrency! diff --git a/tools/memory-model/Documentation/control-dependencies.txt b/tools/memory-model/Documentation/control-dependencies.txt new file mode 100644 index 000000000000..8b743d20fe27 --- /dev/null +++ b/tools/memory-model/Documentation/control-dependencies.txt @@ -0,0 +1,258 @@ +CONTROL DEPENDENCIES +==================== + +A major difficulty with control dependencies is that current compilers +do not support them. One purpose of this document is therefore to +help you prevent your compiler from breaking your code. However, +control dependencies also pose other challenges, which leads to the +second purpose of this document, namely to help you to avoid breaking +your own code, even in the absence of help from your compiler. + +One such challenge is that control dependencies order only later stores. +Therefore, a load-load control dependency will not preserve ordering +unless a read memory barrier is provided. Consider the following code: + + q = READ_ONCE(a); + if (q) + p = READ_ONCE(b); + +This is not guaranteed to provide any ordering because some types of CPUs +are permitted to predict the result of the load from "b". This prediction +can cause other CPUs to see this load as having happened before the load +from "a". This means that an explicit read barrier is required, for example +as follows: + + q = READ_ONCE(a); + if (q) { + smp_rmb(); + p = READ_ONCE(b); + } + +However, stores are not speculated. This means that ordering is +(usually) guaranteed for load-store control dependencies, as in the +following example: + + q = READ_ONCE(a); + if (q) + WRITE_ONCE(b, 1); + +Control dependencies can pair with each other and with other types +of ordering. But please note that neither the READ_ONCE() nor the +WRITE_ONCE() are optional. Without the READ_ONCE(), the compiler might +fuse the load from "a" with other loads. Without the WRITE_ONCE(), +the compiler might fuse the store to "b" with other stores. Worse yet, +the compiler might convert the store into a load and a check followed +by a store, and this compiler-generated load would not be ordered by +the control dependency. + +Furthermore, if the compiler is able to prove that the value of variable +"a" is always non-zero, it would be well within its rights to optimize +the original example by eliminating the "if" statement as follows: + + q = a; + b = 1; /* BUG: Compiler and CPU can both reorder!!! */ + +So don't leave out either the READ_ONCE() or the WRITE_ONCE(). +In particular, although READ_ONCE() does force the compiler to emit a +load, it does *not* force the compiler to actually use the loaded value. + +It is tempting to try use control dependencies to enforce ordering on +identical stores on both branches of the "if" statement as follows: + + q = READ_ONCE(a); + if (q) { + barrier(); + WRITE_ONCE(b, 1); + do_something(); + } else { + barrier(); + WRITE_ONCE(b, 1); + do_something_else(); + } + +Unfortunately, current compilers will transform this as follows at high +optimization levels: + + q = READ_ONCE(a); + barrier(); + WRITE_ONCE(b, 1); /* BUG: No ordering vs. load from a!!! */ + if (q) { + /* WRITE_ONCE(b, 1); -- moved up, BUG!!! */ + do_something(); + } else { + /* WRITE_ONCE(b, 1); -- moved up, BUG!!! */ + do_something_else(); + } + +Now there is no conditional between the load from "a" and the store to +"b", which means that the CPU is within its rights to reorder them: The +conditional is absolutely required, and must be present in the final +assembly code, after all of the compiler and link-time optimizations +have been applied. Therefore, if you need ordering in this example, +you must use explicit memory ordering, for example, smp_store_release(): + + q = READ_ONCE(a); + if (q) { + smp_store_release(&b, 1); + do_something(); + } else { + smp_store_release(&b, 1); + do_something_else(); + } + +Without explicit memory ordering, control-dependency-based ordering is +guaranteed only when the stores differ, for example: + + q = READ_ONCE(a); + if (q) { + WRITE_ONCE(b, 1); + do_something(); + } else { + WRITE_ONCE(b, 2); + do_something_else(); + } + +The initial READ_ONCE() is still required to prevent the compiler from +knowing too much about the value of "a". + +But please note that you need to be careful what you do with the local +variable "q", otherwise the compiler might be able to guess the value +and again remove the conditional branch that is absolutely required to +preserve ordering. For example: + + q = READ_ONCE(a); + if (q % MAX) { + WRITE_ONCE(b, 1); + do_something(); + } else { + WRITE_ONCE(b, 2); + do_something_else(); + } + +If MAX is compile-time defined to be 1, then the compiler knows that +(q % MAX) must be equal to zero, regardless of the value of "q". +The compiler is therefore within its rights to transform the above code +into the following: + + q = READ_ONCE(a); + WRITE_ONCE(b, 2); + do_something_else(); + +Given this transformation, the CPU is not required to respect the ordering +between the load from variable "a" and the store to variable "b". It is +tempting to add a barrier(), but this does not help. The conditional +is gone, and the barrier won't bring it back. Therefore, if you need +to relying on control dependencies to produce this ordering, you should +make sure that MAX is greater than one, perhaps as follows: + + q = READ_ONCE(a); + BUILD_BUG_ON(MAX <= 1); /* Order load from a with store to b. */ + if (q % MAX) { + WRITE_ONCE(b, 1); + do_something(); + } else { + WRITE_ONCE(b, 2); + do_something_else(); + } + +Please note once again that each leg of the "if" statement absolutely +must store different values to "b". As in previous examples, if the two +values were identical, the compiler could pull this store outside of the +"if" statement, destroying the control dependency's ordering properties. + +You must also be careful avoid relying too much on boolean short-circuit +evaluation. Consider this example: + + q = READ_ONCE(a); + if (q || 1 > 0) + WRITE_ONCE(b, 1); + +Because the first condition cannot fault and the second condition is +always true, the compiler can transform this example as follows, again +destroying the control dependency's ordering: + + q = READ_ONCE(a); + WRITE_ONCE(b, 1); + +This is yet another example showing the importance of preventing the +compiler from out-guessing your code. Again, although READ_ONCE() really +does force the compiler to emit code for a given load, the compiler is +within its rights to discard the loaded value. + +In addition, control dependencies apply only to the then-clause and +else-clause of the "if" statement in question. In particular, they do +not necessarily order the code following the entire "if" statement: + + q = READ_ONCE(a); + if (q) { + WRITE_ONCE(b, 1); + } else { + WRITE_ONCE(b, 2); + } + WRITE_ONCE(c, 1); /* BUG: No ordering against the read from "a". */ + +It is tempting to argue that there in fact is ordering because the +compiler cannot reorder volatile accesses and also cannot reorder +the writes to "b" with the condition. Unfortunately for this line +of reasoning, the compiler might compile the two writes to "b" as +conditional-move instructions, as in this fanciful pseudo-assembly +language: + + ld r1,a + cmp r1,$0 + cmov,ne r4,$1 + cmov,eq r4,$2 + st r4,b + st $1,c + +The control dependencies would then extend only to the pair of cmov +instructions and the store depending on them. This means that a weakly +ordered CPU would have no dependency of any sort between the load from +"a" and the store to "c". In short, control dependencies provide ordering +only to the stores in the then-clause and else-clause of the "if" statement +in question (including functions invoked by those two clauses), and not +to code following that "if" statement. + + +In summary: + + (*) Control dependencies can order prior loads against later stores. + However, they do *not* guarantee any other sort of ordering: + Not prior loads against later loads, nor prior stores against + later anything. If you need these other forms of ordering, use + smp_load_acquire(), smp_store_release(), or, in the case of prior + stores and later loads, smp_mb(). + + (*) If both legs of the "if" statement contain identical stores to + the same variable, then you must explicitly order those stores, + either by preceding both of them with smp_mb() or by using + smp_store_release(). Please note that it is *not* sufficient to use + barrier() at beginning and end of each leg of the "if" statement + because, as shown by the example above, optimizing compilers can + destroy the control dependency while respecting the letter of the + barrier() law. + + (*) Control dependencies require at least one run-time conditional + between the prior load and the subsequent store, and this + conditional must involve the prior load. If the compiler is able + to optimize the conditional away, it will have also optimized + away the ordering. Careful use of READ_ONCE() and WRITE_ONCE() + can help to preserve the needed conditional. + + (*) Control dependencies require that the compiler avoid reordering the + dependency into nonexistence. Careful use of READ_ONCE() or + atomic{,64}_read() can help to preserve your control dependency. + + (*) Control dependencies apply only to the then-clause and else-clause + of the "if" statement containing the control dependency, including + any functions that these two clauses call. Control dependencies + do *not* apply to code beyond the end of that "if" statement. + + (*) Control dependencies pair normally with other types of barriers. + + (*) Control dependencies do *not* provide multicopy atomicity. If you + need all the CPUs to agree on the ordering of a given store against + all other accesses, use smp_mb(). + + (*) Compilers do not understand control dependencies. It is therefore + your job to ensure that they do not break your code. diff --git a/tools/memory-model/Documentation/glossary.txt b/tools/memory-model/Documentation/glossary.txt new file mode 100644 index 000000000000..79acb75d56ea --- /dev/null +++ b/tools/memory-model/Documentation/glossary.txt @@ -0,0 +1,172 @@ +This document contains brief definitions of LKMM-related terms. Like most +glossaries, it is not intended to be read front to back (except perhaps +as a way of confirming a diagnosis of OCD), but rather to be searched +for specific terms. + + +Address Dependency: When the address of a later memory access is computed + based on the value returned by an earlier load, an "address + dependency" extends from that load extending to the later access. + Address dependencies are quite common in RCU read-side critical + sections: + + 1 rcu_read_lock(); + 2 p = rcu_dereference(gp); + 3 do_something(p->a); + 4 rcu_read_unlock(); + + In this case, because the address of "p->a" on line 3 is computed + from the value returned by the rcu_dereference() on line 2, the + address dependency extends from that rcu_dereference() to that + "p->a". In rare cases, optimizing compilers can destroy address + dependencies. Please see Documentation/RCU/rcu_dereference.txt + for more information. + + See also "Control Dependency" and "Data Dependency". + +Acquire: With respect to a lock, acquiring that lock, for example, + using spin_lock(). With respect to a non-lock shared variable, + a special operation that includes a load and which orders that + load before later memory references running on that same CPU. + An example special acquire operation is smp_load_acquire(), + but atomic_read_acquire() and atomic_xchg_acquire() also include + acquire loads. + + When an acquire load returns the value stored by a release store + to that same variable, then all operations preceding that store + happen before any operations following that load acquire. + + See also "Relaxed" and "Release". + +Coherence (co): When one CPU's store to a given variable overwrites + either the value from another CPU's store or some later value, + there is said to be a coherence link from the second CPU to + the first. + + It is also possible to have a coherence link within a CPU, which + is a "coherence internal" (coi) link. The term "coherence + external" (coe) link is used when it is necessary to exclude + the coi case. + + See also "From-reads" and "Reads-from". + +Control Dependency: When a later store's execution depends on a test + of a value computed from a value returned by an earlier load, + a "control dependency" extends from that load to that store. + For example: + + 1 if (READ_ONCE(x)) + 2 WRITE_ONCE(y, 1); + + Here, the control dependency extends from the READ_ONCE() on + line 1 to the WRITE_ONCE() on line 2. Control dependencies are + fragile, and can be easily destroyed by optimizing compilers. + Please see control-dependencies.txt for more information. + + See also "Address Dependency" and "Data Dependency". + +Cycle: Memory-barrier pairing is restricted to a pair of CPUs, as the + name suggests. And in a great many cases, a pair of CPUs is all + that is required. In other cases, the notion of pairing must be + extended to additional CPUs, and the result is called a "cycle". + In a cycle, each CPU's ordering interacts with that of the next: + + CPU 0 CPU 1 CPU 2 + WRITE_ONCE(x, 1); WRITE_ONCE(y, 1); WRITE_ONCE(z, 1); + smp_mb(); smp_mb(); smp_mb(); + r0 = READ_ONCE(y); r1 = READ_ONCE(z); r2 = READ_ONCE(x); + + CPU 0's smp_mb() interacts with that of CPU 1, which interacts + with that of CPU 2, which in turn interacts with that of CPU 0 + to complete the cycle. Because of the smp_mb() calls between + each pair of memory accesses, the outcome where r0, r1, and r2 + are all equal to zero is forbidden by LKMM. + + See also "Pairing". + +Data Dependency: When the data written by a later store is computed based + on the value returned by an earlier load, a "data dependency" + extends from that load to that later store. For example: + + 1 r1 = READ_ONCE(x); + 2 WRITE_ONCE(y, r1 + 1); + + In this case, the data dependency extends from the READ_ONCE() + on line 1 to the WRITE_ONCE() on line 2. Data dependencies are + fragile and can be easily destroyed by optimizing compilers. + Because optimizing compilers put a great deal of effort into + working out what values integer variables might have, this is + especially true in cases where the dependency is carried through + an integer. + + See also "Address Dependency" and "Control Dependency". + +From-Reads (fr): When one CPU's store to a given variable happened + too late to affect the value returned by another CPU's + load from that same variable, there is said to be a from-reads + link from the load to the store. + + It is also possible to have a from-reads link within a CPU, which + is a "from-reads internal" (fri) link. The term "from-reads + external" (fre) link is used when it is necessary to exclude + the fri case. + + See also "Coherence" and "Reads-from". + +Fully Ordered: An operation such as smp_mb() that orders all of + its CPU's prior accesses with all of that CPU's subsequent + accesses, or a marked access such as atomic_add_return() + that orders all of its CPU's prior accesses, itself, and + all of its CPU's subsequent accesses. + +Marked Access: An access to a variable that uses an special function or + macro such as "r1 = READ_ONCE(x)" or "smp_store_release(&a, 1)". + + See also "Unmarked Access". + +Pairing: "Memory-barrier pairing" reflects the fact that synchronizing + data between two CPUs requires that both CPUs their accesses. + Memory barriers thus tend to come in pairs, one executed by + one of the CPUs and the other by the other CPU. Of course, + pairing also occurs with other types of operations, so that a + smp_store_release() pairs with an smp_load_acquire() that reads + the value stored. + + See also "Cycle". + +Reads-From (rf): When one CPU's load returns the value stored by some other + CPU, there is said to be a reads-from link from the second + CPU's store to the first CPU's load. Reads-from links have the + nice property that time must advance from the store to the load, + which means that algorithms using reads-from links can use lighter + weight ordering and synchronization compared to algorithms using + coherence and from-reads links. + + It is also possible to have a reads-from link within a CPU, which + is a "reads-from internal" (rfi) link. The term "reads-from + external" (rfe) link is used when it is necessary to exclude + the rfi case. + + See also Coherence" and "From-reads". + +Relaxed: A marked access that does not imply ordering, for example, a + READ_ONCE(), WRITE_ONCE(), a non-value-returning read-modify-write + operation, or a value-returning read-modify-write operation whose + name ends in "_relaxed". + + See also "Acquire" and "Release". + +Release: With respect to a lock, releasing that lock, for example, + using spin_unlock(). With respect to a non-lock shared variable, + a special operation that includes a store and which orders that + store after earlier memory references that ran on that same CPU. + An example special release store is smp_store_release(), but + atomic_set_release() and atomic_cmpxchg_release() also include + release stores. + + See also "Acquire" and "Relaxed". + +Unmarked Access: An access to a variable that uses normal C-language + syntax, for example, "a = b[2]"; + + See also "Marked Access". diff --git a/tools/memory-model/Documentation/litmus-tests.txt b/tools/memory-model/Documentation/litmus-tests.txt index 2f840dcd15cf..8a9d5d2787f9 100644 --- a/tools/memory-model/Documentation/litmus-tests.txt +++ b/tools/memory-model/Documentation/litmus-tests.txt @@ -946,6 +946,23 @@ Limitations of the Linux-kernel memory model (LKMM) include: carrying a dependency, then the compiler can break that dependency by substituting a constant of that value. + Conversely, LKMM sometimes doesn't recognize that a particular + optimization is not allowed, and as a result, thinks that a + dependency is not present (because the optimization would break it). + The memory model misses some pretty obvious control dependencies + because of this limitation. A simple example is: + + r1 = READ_ONCE(x); + if (r1 == 0) + smp_mb(); + WRITE_ONCE(y, 1); + + There is a control dependency from the READ_ONCE to the WRITE_ONCE, + even when r1 is nonzero, but LKMM doesn't realize this and thinks + that the write may execute before the read if r1 != 0. (Yes, that + doesn't make sense if you think about it, but the memory model's + intelligence is limited.) + 2. Multiple access sizes for a single variable are not supported, and neither are misaligned or partially overlapping accesses. diff --git a/tools/memory-model/Documentation/ordering.txt b/tools/memory-model/Documentation/ordering.txt new file mode 100644 index 000000000000..9b0949d3f5ec --- /dev/null +++ b/tools/memory-model/Documentation/ordering.txt @@ -0,0 +1,556 @@ +This document gives an overview of the categories of memory-ordering +operations provided by the Linux-kernel memory model (LKMM). + + +Categories of Ordering +====================== + +This section lists LKMM's three top-level categories of memory-ordering +operations in decreasing order of strength: + +1. Barriers (also known as "fences"). A barrier orders some or + all of the CPU's prior operations against some or all of its + subsequent operations. + +2. Ordered memory accesses. These operations order themselves + against some or all of the CPU's prior accesses or some or all + of the CPU's subsequent accesses, depending on the subcategory + of the operation. + +3. Unordered accesses, as the name indicates, have no ordering + properties except to the extent that they interact with an + operation in the previous categories. This being the real world, + some of these "unordered" operations provide limited ordering + in some special situations. + +Each of the above categories is described in more detail by one of the +following sections. + + +Barriers +======== + +Each of the following categories of barriers is described in its own +subsection below: + +a. Full memory barriers. + +b. Read-modify-write (RMW) ordering augmentation barriers. + +c. Write memory barrier. + +d. Read memory barrier. + +e. Compiler barrier. + +Note well that many of these primitives generate absolutely no code +in kernels built with CONFIG_SMP=n. Therefore, if you are writing +a device driver, which must correctly order accesses to a physical +device even in kernels built with CONFIG_SMP=n, please use the +ordering primitives provided for that purpose. For example, instead of +smp_mb(), use mb(). See the "Linux Kernel Device Drivers" book or the +https://lwn.net/Articles/698014/ article for more information. + + +Full Memory Barriers +-------------------- + +The Linux-kernel primitives that provide full ordering include: + +o The smp_mb() full memory barrier. + +o Value-returning RMW atomic operations whose names do not end in + _acquire, _release, or _relaxed. + +o RCU's grace-period primitives. + +First, the smp_mb() full memory barrier orders all of the CPU's prior +accesses against all subsequent accesses from the viewpoint of all CPUs. +In other words, all CPUs will agree that any earlier action taken +by that CPU happened before any later action taken by that same CPU. +For example, consider the following: + + WRITE_ONCE(x, 1); + smp_mb(); // Order store to x before load from y. + r1 = READ_ONCE(y); + +All CPUs will agree that the store to "x" happened before the load +from "y", as indicated by the comment. And yes, please comment your +memory-ordering primitives. It is surprisingly hard to remember their +purpose after even a few months. + +Second, some RMW atomic operations provide full ordering. These +operations include value-returning RMW atomic operations (that is, those +with non-void return types) whose names do not end in _acquire, _release, +or _relaxed. Examples include atomic_add_return(), atomic_dec_and_test(), +cmpxchg(), and xchg(). Note that conditional RMW atomic operations such +as cmpxchg() are only guaranteed to provide ordering when they succeed. +When RMW atomic operations provide full ordering, they partition the +CPU's accesses into three groups: + +1. All code that executed prior to the RMW atomic operation. + +2. The RMW atomic operation itself. + +3. All code that executed after the RMW atomic operation. + +All CPUs will agree that any operation in a given partition happened +before any operation in a higher-numbered partition. + +In contrast, non-value-returning RMW atomic operations (that is, those +with void return types) do not guarantee any ordering whatsoever. Nor do +value-returning RMW atomic operations whose names end in _relaxed. +Examples of the former include atomic_inc() and atomic_dec(), +while examples of the latter include atomic_cmpxchg_relaxed() and +atomic_xchg_relaxed(). Similarly, value-returning non-RMW atomic +operations such as atomic_read() do not guarantee full ordering, and +are covered in the later section on unordered operations. + +Value-returning RMW atomic operations whose names end in _acquire or +_release provide limited ordering, and will be described later in this +document. + +Finally, RCU's grace-period primitives provide full ordering. These +primitives include synchronize_rcu(), synchronize_rcu_expedited(), +synchronize_srcu() and so on. However, these primitives have orders +of magnitude greater overhead than smp_mb(), atomic_xchg(), and so on. +Furthermore, RCU's grace-period primitives can only be invoked in +sleepable contexts. Therefore, RCU's grace-period primitives are +typically instead used to provide ordering against RCU read-side critical +sections, as documented in their comment headers. But of course if you +need a synchronize_rcu() to interact with readers, it costs you nothing +to also rely on its additional full-memory-barrier semantics. Just please +carefully comment this, otherwise your future self will hate you. + + +RMW Ordering Augmentation Barriers +---------------------------------- + +As noted in the previous section, non-value-returning RMW operations +such as atomic_inc() and atomic_dec() guarantee no ordering whatsoever. +Nevertheless, a number of popular CPU families, including x86, provide +full ordering for these primitives. One way to obtain full ordering on +all architectures is to add a call to smp_mb(): + + WRITE_ONCE(x, 1); + atomic_inc(&my_counter); + smp_mb(); // Inefficient on x86!!! + r1 = READ_ONCE(y); + +This works, but the added smp_mb() adds needless overhead for +x86, on which atomic_inc() provides full ordering all by itself. +The smp_mb__after_atomic() primitive can be used instead: + + WRITE_ONCE(x, 1); + atomic_inc(&my_counter); + smp_mb__after_atomic(); // Order store to x before load from y. + r1 = READ_ONCE(y); + +The smp_mb__after_atomic() primitive emits code only on CPUs whose +atomic_inc() implementations do not guarantee full ordering, thus +incurring no unnecessary overhead on x86. There are a number of +variations on the smp_mb__*() theme: + +o smp_mb__before_atomic(), which provides full ordering prior + to an unordered RMW atomic operation. + +o smp_mb__after_atomic(), which, as shown above, provides full + ordering subsequent to an unordered RMW atomic operation. + +o smp_mb__after_spinlock(), which provides full ordering subsequent + to a successful spinlock acquisition. Note that spin_lock() is + always successful but spin_trylock() might not be. + +o smp_mb__after_srcu_read_unlock(), which provides full ordering + subsequent to an srcu_read_unlock(). + +It is bad practice to place code between the smp__*() primitive and the +operation whose ordering that it is augmenting. The reason is that the +ordering of this intervening code will differ from one CPU architecture +to another. + + +Write Memory Barrier +-------------------- + +The Linux kernel's write memory barrier is smp_wmb(). If a CPU executes +the following code: + + WRITE_ONCE(x, 1); + smp_wmb(); + WRITE_ONCE(y, 1); + +Then any given CPU will see the write to "x" has having happened before +the write to "y". However, you are usually better off using a release +store, as described in the "Release Operations" section below. + +Note that smp_wmb() might fail to provide ordering for unmarked C-language +stores because profile-driven optimization could determine that the +value being overwritten is almost always equal to the new value. Such a +compiler might then reasonably decide to transform "x = 1" and "y = 1" +as follows: + + if (x != 1) + x = 1; + smp_wmb(); // BUG: does not order the reads!!! + if (y != 1) + y = 1; + +Therefore, if you need to use smp_wmb() with unmarked C-language writes, +you will need to make sure that none of the compilers used to build +the Linux kernel carry out this sort of transformation, both now and in +the future. + + +Read Memory Barrier +------------------- + +The Linux kernel's read memory barrier is smp_rmb(). If a CPU executes +the following code: + + r0 = READ_ONCE(y); + smp_rmb(); + r1 = READ_ONCE(x); + +Then any given CPU will see the read from "y" as having preceded the read from +"x". However, you are usually better off using an acquire load, as described +in the "Acquire Operations" section below. + +Compiler Barrier +---------------- + +The Linux kernel's compiler barrier is barrier(). This primitive +prohibits compiler code-motion optimizations that might move memory +references across the point in the code containing the barrier(), but +does not constrain hardware memory ordering. For example, this can be +used to prevent to compiler from moving code across an infinite loop: + + WRITE_ONCE(x, 1); + while (dontstop) + barrier(); + r1 = READ_ONCE(y); + +Without the barrier(), the compiler would be within its rights to move the +WRITE_ONCE() to follow the loop. This code motion could be problematic +in the case where an interrupt handler terminates the loop. Another way +to handle this is to use READ_ONCE() for the load of "dontstop". + +Note that the barriers discussed previously use barrier() or its low-level +equivalent in their implementations. + + +Ordered Memory Accesses +======================= + +The Linux kernel provides a wide variety of ordered memory accesses: + +a. Release operations. + +b. Acquire operations. + +c. RCU read-side ordering. + +d. Control dependencies. + +Each of the above categories has its own section below. + + +Release Operations +------------------ + +Release operations include smp_store_release(), atomic_set_release(), +rcu_assign_pointer(), and value-returning RMW operations whose names +end in _release. These operations order their own store against all +of the CPU's prior memory accesses. Release operations often provide +improved readability and performance compared to explicit barriers. +For example, use of smp_store_release() saves a line compared to the +smp_wmb() example above: + + WRITE_ONCE(x, 1); + smp_store_release(&y, 1); + +More important, smp_store_release() makes it easier to connect up the +different pieces of the concurrent algorithm. The variable stored to +by the smp_store_release(), in this case "y", will normally be used in +an acquire operation in other parts of the concurrent algorithm. + +To see the performance advantages, suppose that the above example read +from "x" instead of writing to it. Then an smp_wmb() could not guarantee +ordering, and an smp_mb() would be needed instead: + + r1 = READ_ONCE(x); + smp_mb(); + WRITE_ONCE(y, 1); + +But smp_mb() often incurs much higher overhead than does +smp_store_release(), which still provides the needed ordering of "x" +against "y". On x86, the version using smp_store_release() might compile +to a simple load instruction followed by a simple store instruction. +In contrast, the smp_mb() compiles to an expensive instruction that +provides the needed ordering. + +There is a wide variety of release operations: + +o Store operations, including not only the aforementioned + smp_store_release(), but also atomic_set_release(), and + atomic_long_set_release(). + +o RCU's rcu_assign_pointer() operation. This is the same as + smp_store_release() except that: (1) It takes the pointer to + be assigned to instead of a pointer to that pointer, (2) It + is intended to be used in conjunction with rcu_dereference() + and similar rather than smp_load_acquire(), and (3) It checks + for an RCU-protected pointer in "sparse" runs. + +o Value-returning RMW operations whose names end in _release, + such as atomic_fetch_add_release() and cmpxchg_release(). + Note that release ordering is guaranteed only against the + memory-store portion of the RMW operation, and not against the + memory-load portion. Note also that conditional operations such + as cmpxchg_release() are only guaranteed to provide ordering + when they succeed. + +As mentioned earlier, release operations are often paired with acquire +operations, which are the subject of the next section. + + +Acquire Operations +------------------ + +Acquire operations include smp_load_acquire(), atomic_read_acquire(), +and value-returning RMW operations whose names end in _acquire. These +operations order their own load against all of the CPU's subsequent +memory accesses. Acquire operations often provide improved performance +and readability compared to explicit barriers. For example, use of +smp_load_acquire() saves a line compared to the smp_rmb() example above: + + r0 = smp_load_acquire(&y); + r1 = READ_ONCE(x); + +As with smp_store_release(), this also makes it easier to connect +the different pieces of the concurrent algorithm by looking for the +smp_store_release() that stores to "y". In addition, smp_load_acquire() +improves upon smp_rmb() by ordering against subsequent stores as well +as against subsequent loads. + +There are a couple of categories of acquire operations: + +o Load operations, including not only the aforementioned + smp_load_acquire(), but also atomic_read_acquire(), and + atomic64_read_acquire(). + +o Value-returning RMW operations whose names end in _acquire, + such as atomic_xchg_acquire() and atomic_cmpxchg_acquire(). + Note that acquire ordering is guaranteed only against the + memory-load portion of the RMW operation, and not against the + memory-store portion. Note also that conditional operations + such as atomic_cmpxchg_acquire() are only guaranteed to provide + ordering when they succeed. + +Symmetry being what it is, acquire operations are often paired with the +release operations covered earlier. For example, consider the following +example, where task0() and task1() execute concurrently: + + void task0(void) + { + WRITE_ONCE(x, 1); + smp_store_release(&y, 1); + } + + void task1(void) + { + r0 = smp_load_acquire(&y); + r1 = READ_ONCE(x); + } + +If "x" and "y" are both initially zero, then either r0's final value +will be zero or r1's final value will be one, thus providing the required +ordering. + + +RCU Read-Side Ordering +---------------------- + +This category includes read-side markers such as rcu_read_lock() +and rcu_read_unlock() as well as pointer-traversal primitives such as +rcu_dereference() and srcu_dereference(). + +Compared to locking primitives and RMW atomic operations, markers +for RCU read-side critical sections incur very low overhead because +they interact only with the corresponding grace-period primitives. +For example, the rcu_read_lock() and rcu_read_unlock() markers interact +with synchronize_rcu(), synchronize_rcu_expedited(), and call_rcu(). +The way this works is that if a given call to synchronize_rcu() cannot +prove that it started before a given call to rcu_read_lock(), then +that synchronize_rcu() must block until the matching rcu_read_unlock() +is reached. For more information, please see the synchronize_rcu() +docbook header comment and the material in Documentation/RCU. + +RCU's pointer-traversal primitives, including rcu_dereference() and +srcu_dereference(), order their load (which must be a pointer) against any +of the CPU's subsequent memory accesses whose address has been calculated +from the value loaded. There is said to be an *address dependency* +from the value returned by the rcu_dereference() or srcu_dereference() +to that subsequent memory access. + +A call to rcu_dereference() for a given RCU-protected pointer is +usually paired with a call to a call to rcu_assign_pointer() for that +same pointer in much the same way that a call to smp_load_acquire() is +paired with a call to smp_store_release(). Calls to rcu_dereference() +and rcu_assign_pointer are often buried in other APIs, for example, +the RCU list API members defined in include/linux/rculist.h. For more +information, please see the docbook headers in that file, the most +recent LWN article on the RCU API (https://lwn.net/Articles/777036/), +and of course the material in Documentation/RCU. + +If the pointer value is manipulated between the rcu_dereference() +that returned it and a later dereference(), please read +Documentation/RCU/rcu_dereference.rst. It can also be quite helpful to +review uses in the Linux kernel. + + +Control Dependencies +-------------------- + +A control dependency extends from a marked load (READ_ONCE() or stronger) +through an "if" condition to a marked store (WRITE_ONCE() or stronger) +that is executed only by one of the legs of that "if" statement. +Control dependencies are so named because they are mediated by +control-flow instructions such as comparisons and conditional branches. + +In short, you can use a control dependency to enforce ordering between +an READ_ONCE() and a WRITE_ONCE() when there is an "if" condition +between them. The canonical example is as follows: + + q = READ_ONCE(a); + if (q) + WRITE_ONCE(b, 1); + +In this case, all CPUs would see the read from "a" as happening before +the write to "b". + +However, control dependencies are easily destroyed by compiler +optimizations, so any use of control dependencies must take into account +all of the compilers used to build the Linux kernel. Please see the +"control-dependencies.txt" file for more information. + + +Unordered Accesses +================== + +Each of these two categories of unordered accesses has a section below: + +a. Unordered marked operations. + +b. Unmarked C-language accesses. + + +Unordered Marked Operations +--------------------------- + +Unordered operations to different variables are just that, unordered. +However, if a group of CPUs apply these operations to a single variable, +all the CPUs will agree on the operation order. Of course, the ordering +of unordered marked accesses can also be constrained using the mechanisms +described earlier in this document. + +These operations come in three categories: + +o Marked writes, such as WRITE_ONCE() and atomic_set(). These + primitives required the compiler to emit the corresponding store + instructions in the expected execution order, thus suppressing + a number of destructive optimizations. However, they provide no + hardware ordering guarantees, and in fact many CPUs will happily + reorder marked writes with each other or with other unordered + operations, unless these operations are to the same variable. + +o Marked reads, such as READ_ONCE() and atomic_read(). These + primitives required the compiler to emit the corresponding load + instructions in the expected execution order, thus suppressing + a number of destructive optimizations. However, they provide no + hardware ordering guarantees, and in fact many CPUs will happily + reorder marked reads with each other or with other unordered + operations, unless these operations are to the same variable. + +o Unordered RMW atomic operations. These are non-value-returning + RMW atomic operations whose names do not end in _acquire or + _release, and also value-returning RMW operations whose names + end in _relaxed. Examples include atomic_add(), atomic_or(), + and atomic64_fetch_xor_relaxed(). These operations do carry + out the specified RMW operation atomically, for example, five + concurrent atomic_inc() operations applied to a given variable + will reliably increase the value of that variable by five. + However, many CPUs will happily reorder these operations with + each other or with other unordered operations. + + This category of operations can be efficiently ordered using + smp_mb__before_atomic() and smp_mb__after_atomic(), as was + discussed in the "RMW Ordering Augmentation Barriers" section. + +In short, these operations can be freely reordered unless they are all +operating on a single variable or unless they are constrained by one of +the operations called out earlier in this document. + + +Unmarked C-Language Accesses +---------------------------- + +Unmarked C-language accesses are normal variable accesses to normal +variables, that is, to variables that are not "volatile" and are not +C11 atomic variables. These operations provide no ordering guarantees, +and further do not guarantee "atomic" access. For example, the compiler +might (and sometimes does) split a plain C-language store into multiple +smaller stores. A load from that same variable running on some other +CPU while such a store is executing might see a value that is a mashup +of the old value and the new value. + +Unmarked C-language accesses are unordered, and are also subject to +any number of compiler optimizations, many of which can break your +concurrent code. It is possible to used unmarked C-language accesses for +shared variables that are subject to concurrent access, but great care +is required on an ongoing basis. The compiler-constraining barrier() +primitive can be helpful, as can the various ordering primitives discussed +in this document. It nevertheless bears repeating that use of unmarked +C-language accesses requires careful attention to not just your code, +but to all the compilers that might be used to build it. Such compilers +might replace a series of loads with a single load, and might replace +a series of stores with a single store. Some compilers will even split +a single store into multiple smaller stores. + +But there are some ways of using unmarked C-language accesses for shared +variables without such worries: + +o Guard all accesses to a given variable by a particular lock, + so that there are never concurrent conflicting accesses to + that variable. (There are "conflicting accesses" when + (1) at least one of the concurrent accesses to a variable is an + unmarked C-language access and (2) when at least one of those + accesses is a write, whether marked or not.) + +o As above, but using other synchronization primitives such + as reader-writer locks or sequence locks. + +o Use locking or other means to ensure that all concurrent accesses + to a given variable are reads. + +o Restrict use of a given variable to statistics or heuristics + where the occasional bogus value can be tolerated. + +o Declare the accessed variables as C11 atomics. + https://lwn.net/Articles/691128/ + +o Declare the accessed variables as "volatile". + +If you need to live more dangerously, please do take the time to +understand the compilers. One place to start is these two LWN +articles: + +Who's afraid of a big bad optimizing compiler? + https://lwn.net/Articles/793253 +Calibrating your fear of big bad optimizing compilers + https://lwn.net/Articles/799218 + +Used properly, unmarked C-language accesses can reduce overhead on +fastpaths. However, the price is great care and continual attention +to your compiler as new versions come out and as new optimizations +are enabled. diff --git a/tools/memory-model/README b/tools/memory-model/README index c8144d4aafa0..39d08d1f0443 100644 --- a/tools/memory-model/README +++ b/tools/memory-model/README @@ -161,26 +161,8 @@ running LKMM litmus tests. DESCRIPTION OF FILES ==================== -Documentation/cheatsheet.txt - Quick-reference guide to the Linux-kernel memory model. - -Documentation/explanation.txt - Describes the memory model in detail. - -Documentation/litmus-tests.txt - Describes the format, features, capabilities, and limitations - of the litmus tests that LKMM can evaluate. - -Documentation/recipes.txt - Lists common memory-ordering patterns. - -Documentation/references.txt - Provides background reading. - -Documentation/simple.txt - Starting point for someone new to Linux-kernel concurrency. - And also for those needing a reminder of the simpler approaches - to concurrency! +Documentation/README + Guide to the other documents in the Documentation/ directory. linux-kernel.bell Categorizes the relevant instructions, including memory diff --git a/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus index 967f9f2a6226..772544f03fb5 100644 --- a/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus +++ b/tools/memory-model/litmus-tests/CoRR+poonceonce+Once.litmus @@ -7,7 +7,9 @@ C CoRR+poonceonce+Once * reads from the same variable are ordered. *) -{} +{ + int x; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus index 4635739f3974..5faae98f7ffb 100644 --- a/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus +++ b/tools/memory-model/litmus-tests/CoRW+poonceonce+Once.litmus @@ -7,7 +7,9 @@ C CoRW+poonceonce+Once * a given variable and a later write to that same variable are ordered. *) -{} +{ + int x; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus b/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus index bb068c92d8da..77c9cc9f8dc6 100644 --- a/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus +++ b/tools/memory-model/litmus-tests/CoWR+poonceonce+Once.litmus @@ -7,7 +7,9 @@ C CoWR+poonceonce+Once * given variable and a later read from that same variable are ordered. *) -{} +{ + int x; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus b/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus index 0d9f0a958799..85ef746f511a 100644 --- a/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus +++ b/tools/memory-model/litmus-tests/CoWW+poonceonce.litmus @@ -7,7 +7,9 @@ C CoWW+poonceonce * writes to the same variable are ordered. *) -{} +{ + int x; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus b/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus index e729d2776e89..87aa900125ab 100644 --- a/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus +++ b/tools/memory-model/litmus-tests/IRIW+fencembonceonces+OnceOnce.litmus @@ -10,7 +10,10 @@ C IRIW+fencembonceonces+OnceOnce * process? This litmus test exercises LKMM's "propagation" rule. *) -{} +{ + int x; + int y; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus b/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus index 4b54dd6a6cd9..f84022dca555 100644 --- a/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus +++ b/tools/memory-model/litmus-tests/IRIW+poonceonces+OnceOnce.litmus @@ -10,7 +10,10 @@ C IRIW+poonceonces+OnceOnce * different process? *) -{} +{ + int x; + int y; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus b/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus index 094d58df7789..398f624daa77 100644 --- a/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus +++ b/tools/memory-model/litmus-tests/ISA2+pooncelock+pooncelock+pombonce.litmus @@ -7,7 +7,12 @@ C ISA2+pooncelock+pooncelock+pombonce * (in P0() and P1()) is visible to external process P2(). *) -{} +{ + spinlock_t mylock; + int x; + int y; + int z; +} P0(int *x, int *y, spinlock_t *mylock) { diff --git a/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus b/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus index b321aa6f4ea5..212a432ba16b 100644 --- a/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/ISA2+poonceonces.litmus @@ -9,7 +9,11 @@ C ISA2+poonceonces * of the smp_load_acquire() invocations are replaced by READ_ONCE()? *) -{} +{ + int x; + int y; + int z; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus b/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus index 025b0462ec9b..7afd85672ccd 100644 --- a/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus +++ b/tools/memory-model/litmus-tests/ISA2+pooncerelease+poacquirerelease+poacquireonce.litmus @@ -11,7 +11,11 @@ C ISA2+pooncerelease+poacquirerelease+poacquireonce * (AKA non-rf) link, so release-acquire is all that is needed. *) -{} +{ + int x; + int y; + int z; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus b/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus index 4727f5aaf03b..c8a93c7ee556 100644 --- a/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus +++ b/tools/memory-model/litmus-tests/LB+fencembonceonce+ctrlonceonce.litmus @@ -11,7 +11,10 @@ C LB+fencembonceonce+ctrlonceonce * another control dependency and order would still be maintained.) *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus b/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus index 07b9904b0e49..2fa029568fa1 100644 --- a/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus +++ b/tools/memory-model/litmus-tests/LB+poacquireonce+pooncerelease.litmus @@ -8,7 +8,10 @@ C LB+poacquireonce+pooncerelease * to the other? *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/LB+poonceonces.litmus b/tools/memory-model/litmus-tests/LB+poonceonces.litmus index 74c49cb3c37b..2107306e8625 100644 --- a/tools/memory-model/litmus-tests/LB+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/LB+poonceonces.litmus @@ -7,7 +7,10 @@ C LB+poonceonces * be prevented even with no explicit ordering? *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus b/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus index a273da9faa6d..c5c168d92973 100644 --- a/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus +++ b/tools/memory-model/litmus-tests/MP+fencewmbonceonce+fencermbonceonce.litmus @@ -8,23 +8,26 @@ C MP+fencewmbonceonce+fencermbonceonce * is usually better to use smp_store_release() and smp_load_acquire(). *) -{} +{ + int buf; + int flag; +} -P0(int *x, int *y) +P0(int *buf, int *flag) // Producer { - WRITE_ONCE(*x, 1); + WRITE_ONCE(*buf, 1); smp_wmb(); - WRITE_ONCE(*y, 1); + WRITE_ONCE(*flag, 1); } -P1(int *x, int *y) +P1(int *buf, int *flag) // Consumer { int r0; int r1; - r0 = READ_ONCE(*y); + r0 = READ_ONCE(*flag); smp_rmb(); - r1 = READ_ONCE(*x); + r1 = READ_ONCE(*buf); } -exists (1:r0=1 /\ 1:r1=0) +exists (1:r0=1 /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus b/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus index 97731b4bbdd8..20ff62649f1e 100644 --- a/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus +++ b/tools/memory-model/litmus-tests/MP+onceassign+derefonce.litmus @@ -10,25 +10,26 @@ C MP+onceassign+derefonce *) { -y=z; -z=0; + int *p=y; + int x; + int y=0; } -P0(int *x, int **y) +P0(int *x, int **p) // Producer { WRITE_ONCE(*x, 1); - rcu_assign_pointer(*y, x); + rcu_assign_pointer(*p, x); } -P1(int *x, int **y) +P1(int *x, int **p) // Consumer { int *r0; int r1; rcu_read_lock(); - r0 = rcu_dereference(*y); + r0 = rcu_dereference(*p); r1 = READ_ONCE(*r0); rcu_read_unlock(); } -exists (1:r0=x /\ 1:r1=0) +exists (1:r0=x /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus index 50f4d62bbf0e..153917ad5dc9 100644 --- a/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus +++ b/tools/memory-model/litmus-tests/MP+polockmbonce+poacquiresilsil.litmus @@ -11,9 +11,11 @@ C MP+polockmbonce+poacquiresilsil *) { + spinlock_t lo; + int x; } -P0(spinlock_t *lo, int *x) +P0(spinlock_t *lo, int *x) // Producer { spin_lock(lo); smp_mb__after_spinlock(); @@ -21,7 +23,7 @@ P0(spinlock_t *lo, int *x) spin_unlock(lo); } -P1(spinlock_t *lo, int *x) +P1(spinlock_t *lo, int *x) // Consumer { int r1; int r2; @@ -32,4 +34,4 @@ P1(spinlock_t *lo, int *x) r3 = spin_is_locked(lo); } -exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1) +exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus index abf81e7a0895..aad64397bb8c 100644 --- a/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus +++ b/tools/memory-model/litmus-tests/MP+polockonce+poacquiresilsil.litmus @@ -11,16 +11,18 @@ C MP+polockonce+poacquiresilsil *) { + spinlock_t lo; + int x; } -P0(spinlock_t *lo, int *x) +P0(spinlock_t *lo, int *x) // Producer { spin_lock(lo); WRITE_ONCE(*x, 1); spin_unlock(lo); } -P1(spinlock_t *lo, int *x) +P1(spinlock_t *lo, int *x) // Consumer { int r1; int r2; @@ -31,4 +33,4 @@ P1(spinlock_t *lo, int *x) r3 = spin_is_locked(lo); } -exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1) +exists (1:r1=1 /\ 1:r2=0 /\ 1:r3=1) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+polocks.litmus b/tools/memory-model/litmus-tests/MP+polocks.litmus index 712a4fcdf6ce..21cbca6f3be4 100644 --- a/tools/memory-model/litmus-tests/MP+polocks.litmus +++ b/tools/memory-model/litmus-tests/MP+polocks.litmus @@ -11,25 +11,29 @@ C MP+polocks * to see all prior accesses by those other CPUs. *) -{} +{ + spinlock_t mylock; + int buf; + int flag; +} -P0(int *x, int *y, spinlock_t *mylock) +P0(int *buf, int *flag, spinlock_t *mylock) // Producer { - WRITE_ONCE(*x, 1); + WRITE_ONCE(*buf, 1); spin_lock(mylock); - WRITE_ONCE(*y, 1); + WRITE_ONCE(*flag, 1); spin_unlock(mylock); } -P1(int *x, int *y, spinlock_t *mylock) +P1(int *buf, int *flag, spinlock_t *mylock) // Consumer { int r0; int r1; spin_lock(mylock); - r0 = READ_ONCE(*y); + r0 = READ_ONCE(*flag); spin_unlock(mylock); - r1 = READ_ONCE(*x); + r1 = READ_ONCE(*buf); } -exists (1:r0=1 /\ 1:r1=0) +exists (1:r0=1 /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+poonceonces.litmus b/tools/memory-model/litmus-tests/MP+poonceonces.litmus index 172f0145301c..9f9769d647c7 100644 --- a/tools/memory-model/litmus-tests/MP+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/MP+poonceonces.litmus @@ -7,21 +7,24 @@ C MP+poonceonces * no ordering at all? *) -{} +{ + int buf; + int flag; +} -P0(int *x, int *y) +P0(int *buf, int *flag) // Producer { - WRITE_ONCE(*x, 1); - WRITE_ONCE(*y, 1); + WRITE_ONCE(*buf, 1); + WRITE_ONCE(*flag, 1); } -P1(int *x, int *y) +P1(int *buf, int *flag) // Consumer { int r0; int r1; - r0 = READ_ONCE(*y); - r1 = READ_ONCE(*x); + r0 = READ_ONCE(*flag); + r1 = READ_ONCE(*buf); } -exists (1:r0=1 /\ 1:r1=0) +exists (1:r0=1 /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus b/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus index d52c68429722..cbe28e733443 100644 --- a/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus +++ b/tools/memory-model/litmus-tests/MP+pooncerelease+poacquireonce.litmus @@ -8,21 +8,24 @@ C MP+pooncerelease+poacquireonce * pattern. *) -{} +{ + int buf; + int flag; +} -P0(int *x, int *y) +P0(int *buf, int *flag) // Producer { - WRITE_ONCE(*x, 1); - smp_store_release(y, 1); + WRITE_ONCE(*buf, 1); + smp_store_release(flag, 1); } -P1(int *x, int *y) +P1(int *buf, int *flag) // Consumer { int r0; int r1; - r0 = smp_load_acquire(y); - r1 = READ_ONCE(*x); + r0 = smp_load_acquire(flag); + r1 = READ_ONCE(*buf); } -exists (1:r0=1 /\ 1:r1=0) +exists (1:r0=1 /\ 1:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/MP+porevlocks.litmus b/tools/memory-model/litmus-tests/MP+porevlocks.litmus index 72c9276b363e..012041bd4feb 100644 --- a/tools/memory-model/litmus-tests/MP+porevlocks.litmus +++ b/tools/memory-model/litmus-tests/MP+porevlocks.litmus @@ -11,25 +11,29 @@ C MP+porevlocks * see all prior accesses by those other CPUs. *) -{} +{ + spinlock_t mylock; + int buf; + int flag; +} -P0(int *x, int *y, spinlock_t *mylock) +P0(int *buf, int *flag, spinlock_t *mylock) // Consumer { int r0; int r1; - r0 = READ_ONCE(*y); + r0 = READ_ONCE(*flag); spin_lock(mylock); - r1 = READ_ONCE(*x); + r1 = READ_ONCE(*buf); spin_unlock(mylock); } -P1(int *x, int *y, spinlock_t *mylock) +P1(int *buf, int *flag, spinlock_t *mylock) // Producer { spin_lock(mylock); - WRITE_ONCE(*x, 1); + WRITE_ONCE(*buf, 1); spin_unlock(mylock); - WRITE_ONCE(*y, 1); + WRITE_ONCE(*flag, 1); } -exists (0:r0=1 /\ 0:r1=0) +exists (0:r0=1 /\ 0:r1=0) (* Bad outcome. *) diff --git a/tools/memory-model/litmus-tests/R+fencembonceonces.litmus b/tools/memory-model/litmus-tests/R+fencembonceonces.litmus index 222a0b850b4a..af9463b39b4a 100644 --- a/tools/memory-model/litmus-tests/R+fencembonceonces.litmus +++ b/tools/memory-model/litmus-tests/R+fencembonceonces.litmus @@ -9,7 +9,10 @@ C R+fencembonceonces * cause the resulting test to be allowed. *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/R+poonceonces.litmus b/tools/memory-model/litmus-tests/R+poonceonces.litmus index 5386f128a131..bcd5574e304a 100644 --- a/tools/memory-model/litmus-tests/R+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/R+poonceonces.litmus @@ -8,7 +8,10 @@ C R+poonceonces * store propagation delays. *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus b/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus index 18479823cd6c..c36341d1aed6 100644 --- a/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus +++ b/tools/memory-model/litmus-tests/S+fencewmbonceonce+poacquireonce.litmus @@ -7,7 +7,10 @@ C S+fencewmbonceonce+poacquireonce * store against a subsequent store? *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/S+poonceonces.litmus b/tools/memory-model/litmus-tests/S+poonceonces.litmus index 8c9c2f81a580..7775c23143a0 100644 --- a/tools/memory-model/litmus-tests/S+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/S+poonceonces.litmus @@ -9,7 +9,10 @@ C S+poonceonces * READ_ONCE(), is ordering preserved? *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus b/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus index ed5fff18d223..833cdfeb7c09 100644 --- a/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus +++ b/tools/memory-model/litmus-tests/SB+fencembonceonces.litmus @@ -9,7 +9,10 @@ C SB+fencembonceonces * suffice, but not much else.) *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/SB+poonceonces.litmus b/tools/memory-model/litmus-tests/SB+poonceonces.litmus index 10d550730b25..c92211ecbfdf 100644 --- a/tools/memory-model/litmus-tests/SB+poonceonces.litmus +++ b/tools/memory-model/litmus-tests/SB+poonceonces.litmus @@ -8,7 +8,10 @@ C SB+poonceonces * variable that the preceding process reads. *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus b/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus index 04a16603660b..84344b455eb7 100644 --- a/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus +++ b/tools/memory-model/litmus-tests/SB+rfionceonce-poonceonces.litmus @@ -6,7 +6,10 @@ C SB+rfionceonce-poonceonces * This litmus test demonstrates that LKMM is not fully multicopy atomic. *) -{} +{ + int x; + int y; +} P0(int *x, int *y) { diff --git a/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus b/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus index 6a2bc12a1af1..431494708611 100644 --- a/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus +++ b/tools/memory-model/litmus-tests/WRC+poonceonces+Once.litmus @@ -8,7 +8,10 @@ C WRC+poonceonces+Once * test has no ordering at all. *) -{} +{ + int x; + int y; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus b/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus index e9947250d7de..554999c64db5 100644 --- a/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus +++ b/tools/memory-model/litmus-tests/WRC+pooncerelease+fencermbonceonce+Once.litmus @@ -10,7 +10,10 @@ C WRC+pooncerelease+fencermbonceonce+Once * is A-cumulative in LKMM. *) -{} +{ + int x; + int y; +} P0(int *x) { diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus index 415248fb6699..265a95ffef13 100644 --- a/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus +++ b/tools/memory-model/litmus-tests/Z6.0+pooncelock+poonceLock+pombonce.litmus @@ -9,7 +9,12 @@ C Z6.0+pooncelock+poonceLock+pombonce * by CPUs not holding that lock. *) -{} +{ + spinlock_t mylock; + int x; + int y; + int z; +} P0(int *x, int *y, spinlock_t *mylock) { diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus index 10a2aa04cd07..0c9aea8e80df 100644 --- a/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus +++ b/tools/memory-model/litmus-tests/Z6.0+pooncelock+pooncelock+pombonce.litmus @@ -8,7 +8,12 @@ C Z6.0+pooncelock+pooncelock+pombonce * seen as ordered by a third process not holding that lock. *) -{} +{ + spinlock_t mylock; + int x; + int y; + int z; +} P0(int *x, int *y, spinlock_t *mylock) { diff --git a/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus b/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus index 88e70b87a683..661f9aaa5791 100644 --- a/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus +++ b/tools/memory-model/litmus-tests/Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus @@ -14,7 +14,11 @@ C Z6.0+pooncerelease+poacquirerelease+fencembonceonce * involving locking.) *) -{} +{ + int x; + int y; + int z; +} P0(int *x, int *y) { diff --git a/tools/power/cpupower/lib/cpupower.c b/tools/power/cpupower/lib/cpupower.c index 3656e697537e..3f7d0c0c5067 100644 --- a/tools/power/cpupower/lib/cpupower.c +++ b/tools/power/cpupower/lib/cpupower.c @@ -16,8 +16,8 @@ unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen) { - int fd; ssize_t numread; + int fd; fd = open(path, O_RDONLY); if (fd == -1) @@ -35,6 +35,27 @@ unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen) return (unsigned int) numread; } +unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen) +{ + ssize_t numwritten; + int fd; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + if (numwritten < 1) { + perror(path); + close(fd); + return -1; + } + + close(fd); + + return (unsigned int) numwritten; +} + /* * Detect whether a CPU is online * diff --git a/tools/power/cpupower/lib/cpupower_intern.h b/tools/power/cpupower/lib/cpupower_intern.h index 4887c76d23f8..ac1112b956ec 100644 --- a/tools/power/cpupower/lib/cpupower_intern.h +++ b/tools/power/cpupower/lib/cpupower_intern.h @@ -1,6 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define PATH_TO_CPU "/sys/devices/system/cpu/" + +#ifndef MAX_LINE_LEN #define MAX_LINE_LEN 4096 +#endif + #define SYSFS_PATH_MAX 255 unsigned int cpupower_read_sysfs(const char *path, char *buf, size_t buflen); +unsigned int cpupower_write_sysfs(const char *path, char *buf, size_t buflen); diff --git a/tools/power/cpupower/utils/cpupower-info.c b/tools/power/cpupower/utils/cpupower-info.c index 0ba61a2c4d81..06345b543786 100644 --- a/tools/power/cpupower/utils/cpupower-info.c +++ b/tools/power/cpupower/utils/cpupower-info.c @@ -101,7 +101,7 @@ int cmd_info(int argc, char **argv) } if (params.perf_bias) { - ret = msr_intel_get_perf_bias(cpu); + ret = cpupower_intel_get_perf_bias(cpu); if (ret < 0) { fprintf(stderr, _("Could not read perf-bias value[%d]\n"), ret); diff --git a/tools/power/cpupower/utils/cpupower-set.c b/tools/power/cpupower/utils/cpupower-set.c index 052044d7e012..180d5ba877e6 100644 --- a/tools/power/cpupower/utils/cpupower-set.c +++ b/tools/power/cpupower/utils/cpupower-set.c @@ -95,7 +95,7 @@ int cmd_set(int argc, char **argv) } if (params.perf_bias) { - ret = msr_intel_set_perf_bias(cpu, perf_bias); + ret = cpupower_intel_set_perf_bias(cpu, perf_bias); if (ret) { fprintf(stderr, _("Error setting perf-bias " "value on CPU %d\n"), cpu); diff --git a/tools/power/cpupower/utils/helpers/helpers.h b/tools/power/cpupower/utils/helpers/helpers.h index c258eeccd05f..37dac161f3fe 100644 --- a/tools/power/cpupower/utils/helpers/helpers.h +++ b/tools/power/cpupower/utils/helpers/helpers.h @@ -105,8 +105,8 @@ extern struct cpupower_cpu_info cpupower_cpu_info; extern int read_msr(int cpu, unsigned int idx, unsigned long long *val); extern int write_msr(int cpu, unsigned int idx, unsigned long long val); -extern int msr_intel_set_perf_bias(unsigned int cpu, unsigned int val); -extern int msr_intel_get_perf_bias(unsigned int cpu); +extern int cpupower_intel_set_perf_bias(unsigned int cpu, unsigned int val); +extern int cpupower_intel_get_perf_bias(unsigned int cpu); extern unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu); /* Read/Write msr ****************************/ @@ -150,9 +150,9 @@ static inline int read_msr(int cpu, unsigned int idx, unsigned long long *val) { return -1; }; static inline int write_msr(int cpu, unsigned int idx, unsigned long long val) { return -1; }; -static inline int msr_intel_set_perf_bias(unsigned int cpu, unsigned int val) +static inline int cpupower_intel_set_perf_bias(unsigned int cpu, unsigned int val) { return -1; }; -static inline int msr_intel_get_perf_bias(unsigned int cpu) +static inline int cpupower_intel_get_perf_bias(unsigned int cpu) { return -1; }; static inline unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu) { return 0; }; diff --git a/tools/power/cpupower/utils/helpers/misc.c b/tools/power/cpupower/utils/helpers/misc.c index f406adc40bad..e8f8f643a627 100644 --- a/tools/power/cpupower/utils/helpers/misc.c +++ b/tools/power/cpupower/utils/helpers/misc.c @@ -1,7 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 + +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> + #if defined(__i386__) || defined(__x86_64__) #include "helpers/helpers.h" +#include "helpers/sysfs.h" + +#include "cpupower_intern.h" #define MSR_AMD_HWCR 0xc0010015 @@ -40,4 +48,44 @@ int cpufreq_has_boost_support(unsigned int cpu, int *support, int *active, *support = *active = 1; return 0; } + +int cpupower_intel_get_perf_bias(unsigned int cpu) +{ + char linebuf[MAX_LINE_LEN]; + char path[SYSFS_PATH_MAX]; + unsigned long val; + char *endp; + + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_PERF_BIAS)) + return -1; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/power/energy_perf_bias", cpu); + + if (cpupower_read_sysfs(path, linebuf, MAX_LINE_LEN) == 0) + return -1; + + val = strtol(linebuf, &endp, 0); + if (endp == linebuf || errno == ERANGE) + return -1; + + return val; +} + +int cpupower_intel_set_perf_bias(unsigned int cpu, unsigned int val) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[3] = {}; + + if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_PERF_BIAS)) + return -1; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/power/energy_perf_bias", cpu); + snprintf(linebuf, sizeof(linebuf), "%d", val); + + if (cpupower_write_sysfs(path, linebuf, 3) <= 0) + return -1; + + return 0; +} + #endif /* #if defined(__i386__) || defined(__x86_64__) */ diff --git a/tools/power/cpupower/utils/helpers/msr.c b/tools/power/cpupower/utils/helpers/msr.c index ab9950748838..8b0b6be74bb8 100644 --- a/tools/power/cpupower/utils/helpers/msr.c +++ b/tools/power/cpupower/utils/helpers/msr.c @@ -11,7 +11,6 @@ /* Intel specific MSRs */ #define MSR_IA32_PERF_STATUS 0x198 #define MSR_IA32_MISC_ENABLES 0x1a0 -#define MSR_IA32_ENERGY_PERF_BIAS 0x1b0 #define MSR_NEHALEM_TURBO_RATIO_LIMIT 0x1ad /* @@ -73,33 +72,6 @@ int write_msr(int cpu, unsigned int idx, unsigned long long val) return -1; } -int msr_intel_get_perf_bias(unsigned int cpu) -{ - unsigned long long val; - int ret; - - if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_PERF_BIAS)) - return -1; - - ret = read_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &val); - if (ret) - return ret; - return val; -} - -int msr_intel_set_perf_bias(unsigned int cpu, unsigned int val) -{ - int ret; - - if (!(cpupower_cpu_info.caps & CPUPOWER_CAP_PERF_BIAS)) - return -1; - - ret = write_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, val); - if (ret) - return ret; - return 0; -} - unsigned long long msr_intel_get_turbo_ratio(unsigned int cpu) { unsigned long long val; diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f3a1746f7f45..389ea5209a83 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1831,6 +1831,25 @@ int get_mp(int cpu, struct msr_counter *mp, unsigned long long *counterp) return 0; } +int get_epb(int cpu) +{ + char path[128 + PATH_BYTES]; + int ret, epb = -1; + FILE *fp; + + sprintf(path, "/sys/devices/system/cpu/cpu%d/power/energy_perf_bias", cpu); + + fp = fopen_or_die(path, "r"); + + ret = fscanf(fp, "%d", &epb); + if (ret != 1) + err(1, "%s(%s)", __func__, path); + + fclose(fp); + + return epb; +} + void get_apic_id(struct thread_data *t) { unsigned int eax, ebx, ecx, edx; @@ -3917,9 +3936,8 @@ dump_sysfs_pstate_config(void) */ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) { - unsigned long long msr; char *epb_string; - int cpu; + int cpu, epb; if (!has_epb) return 0; @@ -3935,10 +3953,11 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -1; } - if (get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr)) + epb = get_epb(cpu); + if (epb < 0) return 0; - switch (msr & 0xF) { + switch (epb) { case ENERGY_PERF_BIAS_PERFORMANCE: epb_string = "performance"; break; @@ -3952,7 +3971,7 @@ int print_epb(struct thread_data *t, struct core_data *c, struct pkg_data *p) epb_string = "custom"; break; } - fprintf(outf, "cpu%d: MSR_IA32_ENERGY_PERF_BIAS: 0x%08llx (%s)\n", cpu, msr, epb_string); + fprintf(outf, "cpu%d: EPB: %d (%s)\n", cpu, epb, epb_string); return 0; } diff --git a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c index ff6c6661f075..5fd9e594079c 100644 --- a/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c +++ b/tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c @@ -91,6 +91,9 @@ unsigned int has_hwp_request_pkg; /* IA32_HWP_REQUEST_PKG */ unsigned int bdx_highest_ratio; +#define PATH_TO_CPU "/sys/devices/system/cpu/" +#define SYSFS_PATH_MAX 255 + /* * maintain compatibility with original implementation, but don't document it: */ @@ -721,6 +724,48 @@ int put_msr(int cpu, int offset, unsigned long long new_msr) return 0; } +static unsigned int read_sysfs(const char *path, char *buf, size_t buflen) +{ + ssize_t numread; + int fd; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static unsigned int write_sysfs(const char *path, char *buf, size_t buflen) +{ + ssize_t numwritten; + int fd; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + if (numwritten < 1) { + perror("write failed\n"); + close(fd); + return -1; + } + + close(fd); + + return (unsigned int) numwritten; +} + void print_hwp_cap(int cpu, struct msr_hwp_cap *cap, char *str) { if (cpu != -1) @@ -798,17 +843,61 @@ void write_hwp_request(int cpu, struct msr_hwp_request *hwp_req, unsigned int ms put_msr(cpu, msr_offset, msr); } +static int get_epb(int cpu) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[3]; + char *endp; + long val; + + if (!has_epb) + return -1; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/power/energy_perf_bias", cpu); + + if (!read_sysfs(path, linebuf, 3)) + return -1; + + val = strtol(linebuf, &endp, 0); + if (endp == linebuf || errno == ERANGE) + return -1; + + return (int)val; +} + +static int set_epb(int cpu, int val) +{ + char path[SYSFS_PATH_MAX]; + char linebuf[3]; + char *endp; + int ret; + + if (!has_epb) + return -1; + + snprintf(path, sizeof(path), PATH_TO_CPU "cpu%u/power/energy_perf_bias", cpu); + snprintf(linebuf, sizeof(linebuf), "%d", val); + + ret = write_sysfs(path, linebuf, 3); + if (ret <= 0) + return -1; + + val = strtol(linebuf, &endp, 0); + if (endp == linebuf || errno == ERANGE) + return -1; + + return (int)val; +} + int print_cpu_msrs(int cpu) { - unsigned long long msr; struct msr_hwp_request req; struct msr_hwp_cap cap; + int epb; - if (has_epb) { - get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); - - printf("cpu%d: EPB %u\n", cpu, (unsigned int) msr); - } + epb = get_epb(cpu); + if (epb >= 0) + printf("cpu%d: EPB %u\n", cpu, (unsigned int) epb); if (!has_hwp) return 0; @@ -1091,15 +1180,15 @@ int enable_hwp_on_cpu(int cpu) int update_cpu_msrs(int cpu) { unsigned long long msr; - + int epb; if (update_epb) { - get_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, &msr); - put_msr(cpu, MSR_IA32_ENERGY_PERF_BIAS, new_epb); + epb = get_epb(cpu); + set_epb(cpu, new_epb); if (verbose) printf("cpu%d: ENERGY_PERF_BIAS old: %d new: %d\n", - cpu, (unsigned int) msr, (unsigned int) new_epb); + cpu, epb, (unsigned int) new_epb); } if (update_turbo) { diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index cb16d2aac51c..54188ee16c48 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl @@ -2040,7 +2040,7 @@ sub reboot_to { if ($reboot_type eq "grub") { run_ssh "'(echo \"savedefault --default=$grub_number --once\" | grub --batch)'"; - } elsif ($reboot_type eq "grub2") { + } elsif (($reboot_type eq "grub2") or ($reboot_type eq "grub2bls")) { run_ssh "$grub_reboot $grub_number"; } elsif ($reboot_type eq "syslinux") { run_ssh "$syslinux --once \\\"$syslinux_label\\\" $syslinux_path"; diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c index 2ac0fff6dad8..9b185bf82da8 100644 --- a/tools/testing/nvdimm/test/nfit.c +++ b/tools/testing/nvdimm/test/nfit.c @@ -23,7 +23,6 @@ #include "nfit_test.h" #include "../watermark.h" -#include <asm/copy_mc_test.h> #include <asm/mce.h> /* @@ -3284,107 +3283,6 @@ static struct platform_driver nfit_test_driver = { .id_table = nfit_test_id, }; -static char copy_mc_buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); - -enum INJECT { - INJECT_NONE, - INJECT_SRC, - INJECT_DST, -}; - -static void copy_mc_test_init(char *dst, char *src, size_t size) -{ - size_t i; - - memset(dst, 0xff, size); - for (i = 0; i < size; i++) - src[i] = (char) i; -} - -static bool copy_mc_test_validate(unsigned char *dst, unsigned char *src, - size_t size, unsigned long rem) -{ - size_t i; - - for (i = 0; i < size - rem; i++) - if (dst[i] != (unsigned char) i) { - pr_info_once("%s:%d: offset: %zd got: %#x expect: %#x\n", - __func__, __LINE__, i, dst[i], - (unsigned char) i); - return false; - } - for (i = size - rem; i < size; i++) - if (dst[i] != 0xffU) { - pr_info_once("%s:%d: offset: %zd got: %#x expect: 0xff\n", - __func__, __LINE__, i, dst[i]); - return false; - } - return true; -} - -void copy_mc_test(void) -{ - char *inject_desc[] = { "none", "source", "destination" }; - enum INJECT inj; - - if (IS_ENABLED(CONFIG_COPY_MC_TEST)) { - pr_info("%s: run...\n", __func__); - } else { - pr_info("%s: disabled, skip.\n", __func__); - return; - } - - for (inj = INJECT_NONE; inj <= INJECT_DST; inj++) { - int i; - - pr_info("%s: inject: %s\n", __func__, inject_desc[inj]); - for (i = 0; i < 512; i++) { - unsigned long expect, rem; - void *src, *dst; - bool valid; - - switch (inj) { - case INJECT_NONE: - copy_mc_inject_src(NULL); - copy_mc_inject_dst(NULL); - dst = ©_mc_buf[2048]; - src = ©_mc_buf[1024 - i]; - expect = 0; - break; - case INJECT_SRC: - copy_mc_inject_src(©_mc_buf[1024]); - copy_mc_inject_dst(NULL); - dst = ©_mc_buf[2048]; - src = ©_mc_buf[1024 - i]; - expect = 512 - i; - break; - case INJECT_DST: - copy_mc_inject_src(NULL); - copy_mc_inject_dst(©_mc_buf[2048]); - dst = ©_mc_buf[2048 - i]; - src = ©_mc_buf[1024]; - expect = 512 - i; - break; - } - - copy_mc_test_init(dst, src, 512); - rem = copy_mc_fragile(dst, src, 512); - valid = copy_mc_test_validate(dst, src, 512, expect); - if (rem == expect && valid) - continue; - pr_info("%s: copy(%#lx, %#lx, %d) off: %d rem: %ld %s expect: %ld\n", - __func__, - ((unsigned long) dst) & ~PAGE_MASK, - ((unsigned long ) src) & ~PAGE_MASK, - 512, i, rem, valid ? "valid" : "bad", - expect); - } - } - - copy_mc_inject_src(NULL); - copy_mc_inject_dst(NULL); -} - static __init int nfit_test_init(void) { int rc, i; @@ -3393,7 +3291,6 @@ static __init int nfit_test_init(void) libnvdimm_test(); acpi_nfit_test(); device_dax_test(); - copy_mc_test(); dax_pmem_test(); dax_pmem_core_test(); #ifdef CONFIG_DEV_DAX_PMEM_COMPAT diff --git a/tools/testing/scatterlist/main.c b/tools/testing/scatterlist/main.c index f561aed7c657..8a577c95496e 100644 --- a/tools/testing/scatterlist/main.c +++ b/tools/testing/scatterlist/main.c @@ -50,7 +50,7 @@ static void fail(struct test *test, struct sg_table *st, const char *cond) int main(void) { - const unsigned int sgmax = SCATTERLIST_MAX_SEGMENT; + const unsigned int sgmax = UINT_MAX; struct test *test, tests[] = { { -EINVAL, 1, pfn(0), PAGE_SIZE, 0, 1 }, { 0, 1, pfn(0), PAGE_SIZE, PAGE_SIZE + 1, 1 }, diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index d9c283503159..e93f10386e76 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -50,12 +50,14 @@ TARGETS += openat2 TARGETS += rseq TARGETS += rtc TARGETS += seccomp +TARGETS += sgx TARGETS += sigaltstack TARGETS += size TARGETS += sparc64 TARGETS += splice TARGETS += static_keys TARGETS += sync +TARGETS += syscall_user_dispatch TARGETS += sysctl TARGETS += tc-testing TARGETS += timens diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 52414058a627..5861446d0777 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -456,10 +456,10 @@ static struct bpf_align_test tests[] = { */ {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"}, /* Checked s>=0 */ - {9, "R5=inv(id=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* packet pointer + nonnegative (4n+2) */ - {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, - {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, + {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine. * We checked the bounds, but it might have been able * to overflow if the packet pointer started in the @@ -467,7 +467,7 @@ static struct bpf_align_test tests[] = { * So we did not get a 'range' on R6, and the access * attempt will fail. */ - {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372034707292158,var_off=(0x2; 0x7fffffff7ffffffc)"}, + {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"}, } }, { diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c index c1650548433c..fddbc5db5d6a 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c @@ -217,9 +217,15 @@ void test_ringbuf(void) if (CHECK(err, "join_bg", "err %d\n", err)) goto cleanup; - if (CHECK(bg_ret != 1, "bg_ret", "epoll_wait result: %ld", bg_ret)) + if (CHECK(bg_ret <= 0, "bg_ret", "epoll_wait result: %ld", bg_ret)) goto cleanup; + /* due to timing variations, there could still be non-notified + * samples, so consume them here to collect all the samples + */ + err = ring_buffer__consume(ringbuf); + CHECK(err < 0, "rb_consume", "failed: %d\b", err); + /* 3 rounds, 2 samples each */ cnt = atomic_xchg(&sample_cnt, 0); CHECK(cnt != 6, "cnt", "exp %d samples, got %d\n", 6, cnt); diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c index 78e450609803..d37161e59bb2 100644 --- a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c @@ -81,7 +81,7 @@ void test_ringbuf_multi(void) /* poll for samples, should get 2 ringbufs back */ err = ring_buffer__poll(ringbuf, -1); - if (CHECK(err != 4, "poll_res", "expected 4 records, got %d\n", err)) + if (CHECK(err != 2, "poll_res", "expected 2 records, got %d\n", err)) goto cleanup; /* expect extra polling to return nothing */ diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index 43c9cda199b8..b99bb8ed3ed4 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -184,9 +184,7 @@ def bpftool_prog_list(expected=None, ns=""): def bpftool_map_list(expected=None, ns=""): _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) # Remove the base maps - for m in base_maps: - if m in maps: - maps.remove(m) + maps = [m for m in maps if m not in base_maps and m.get('name') not in base_map_names] if expected is not None: if len(maps) != expected: fail(True, "%d BPF maps loaded, expected %d" % @@ -716,13 +714,11 @@ def test_multi_prog(simdev, sim, obj, modename, modeid): fail(ret == 0, "Replaced one of programs without -force") check_extack(err, "XDP program already attached.", args) - if modename == "" or modename == "drv": - othermode = "" if modename == "drv" else "drv" - start_test("Test multi-attachment XDP - detach...") - ret, _, err = sim.unset_xdp(othermode, force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Removed program with a bad mode") - check_extack(err, "program loaded with different flags.", args) + start_test("Test multi-attachment XDP - remove without mode...") + ret, _, err = sim.unset_xdp("", force=True, + fail=False, include_stderr=True) + fail(ret == 0, "Removed program without a mode flag") + check_extack(err, "More than one program loaded, unset mode is ambiguous.", args) sim.unset_xdp("offload") xdp = sim.ip_link_show(xdp=True)["xdp"] @@ -772,6 +768,9 @@ ret, progs = bpftool("prog", fail=False) skip(ret != 0, "bpftool not installed") base_progs = progs _, base_maps = bpftool("map") +base_map_names = [ + 'pid_iter.rodata' # created on each bpftool invocation +] # Check netdevsim ret, out = cmd("modprobe netdevsim", fail=False) @@ -913,11 +912,18 @@ try: sim.tc_flush_filters() + start_test("Test TC offloads failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, + fail=False, include_stderr=True) + fail(ret == 0, "TC filter did not reject with TC offloads enabled") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + start_test("Test TC offloads work...") ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True, fail=False, include_stderr=True) fail(ret != 0, "TC filter did not load with TC offloads enabled") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") start_test("Test TC offload basics...") dfs = simdev.dfs_get_bound_progs(expected=1) @@ -941,6 +947,7 @@ try: start_test("Test disabling TC offloads is rejected while filters installed...") ret, _ = sim.set_ethtool_tc_offloads(False, fail=False) fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...") + sim.set_ethtool_tc_offloads(True) start_test("Test qdisc removal frees things...") sim.tc_flush_filters() @@ -999,18 +1006,8 @@ try: fail=False, include_stderr=True) fail(ret == 0, "Replaced XDP program with a program in different mode") check_extack(err, - "native and generic XDP can't be active at the same time.", + "Native and generic XDP can't be active at the same time.", args) - ret, _, err = sim.set_xdp(obj, "", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Replaced XDP program with a program in different mode") - check_extack(err, "program loaded with different flags.", args) - - start_test("Test XDP prog remove with bad flags...") - ret, _, err = sim.unset_xdp("", force=True, - fail=False, include_stderr=True) - fail(ret == 0, "Removed program with a bad mode") - check_extack(err, "program loaded with different flags.", args) start_test("Test MTU restrictions...") ret, _ = sim.set_mtu(9000, fail=False) @@ -1040,10 +1037,19 @@ try: offload = bpf_pinned("/sys/fs/bpf/offload") ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True) fail(ret == 0, "attached offloaded XDP program to drv") - check_extack(err, "using device-bound program without HW_MODE flag is not supported.", args) + check_extack(err, "Using device-bound program without HW_MODE flag is not supported.", args) rm("/sys/fs/bpf/offload") sim.wait_for_flush() + start_test("Test XDP load failure...") + sim.dfs["dev/bpf_bind_verifier_accept"] = 0 + ret, _, err = bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload", + dev=sim['ifname'], fail=False, include_stderr=True) + fail(ret == 0, "verifier should fail on load") + check_verifier_log(err, "[netdevsim] Hello from netdevsim!") + sim.dfs["dev/bpf_bind_verifier_accept"] = 1 + sim.wait_for_flush() + start_test("Test XDP offload...") _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True) ipl = sim.ip_link_show(xdp=True) @@ -1051,7 +1057,6 @@ try: progs = bpftool_prog_list(expected=1) prog = progs[0] fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID") - check_verifier_log(err, "[netdevsim] Hello from netdevsim!") start_test("Test XDP offload is device bound...") dfs = simdev.dfs_get_bound_progs(expected=1) diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c index 1c4b1939f5a8..bed53b561e04 100644 --- a/tools/testing/selftests/bpf/verifier/array_access.c +++ b/tools/testing/selftests/bpf/verifier/array_access.c @@ -68,7 +68,7 @@ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), - BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 0xffffffff, 1), + BPF_JMP32_IMM(BPF_JSGT, BPF_REG_1, 0xffffffff, 1), BPF_MOV32_IMM(BPF_REG_1, 0), BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES), BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c index dac40de3f868..57ed67b86074 100644 --- a/tools/testing/selftests/bpf/verifier/bounds.c +++ b/tools/testing/selftests/bpf/verifier/bounds.c @@ -703,3 +703,44 @@ .fixup_map_hash_8b = { 3 }, .result = ACCEPT, }, +{ + "bounds checks after 32-bit truncation. test 1", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + /* This used to reduce the max bound to 0x7fffffff */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), + BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0x7fffffff, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 3 }, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, +}, +{ + "bounds checks after 32-bit truncation. test 2", + .insns = { + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JSLT, BPF_REG_1, 1, 1), + BPF_JMP32_IMM(BPF_JSLT, BPF_REG_1, 0, 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_hash_8b = { 3 }, + .errstr_unpriv = "R0 leaks addr", + .result_unpriv = REJECT, + .result = ACCEPT, +}, diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index b3ece55a2da6..6f441dd9f33c 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -156,14 +156,23 @@ static void guest_code_move_memory_region(void) GUEST_SYNC(0); /* - * Spin until the memory region is moved to a misaligned address. This - * may or may not trigger MMIO, as the window where the memslot is - * invalid is quite small. + * Spin until the memory region starts getting moved to a + * misaligned address. + * Every region move may or may not trigger MMIO, as the + * window where the memslot is invalid is usually quite small. */ val = guest_spin_on_val(0); GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); - /* Spin until the memory region is realigned. */ + /* Spin until the misaligning memory region move completes. */ + val = guest_spin_on_val(MMIO_VAL); + GUEST_ASSERT_1(val == 1 || val == 0, val); + + /* Spin until the memory region starts to get re-aligned. */ + val = guest_spin_on_val(0); + GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val); + + /* Spin until the re-aligning memory region move completes. */ val = guest_spin_on_val(MMIO_VAL); GUEST_ASSERT_1(val == 1, val); diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh index fb5c55dd6df8..02b0b9ead40b 100755 --- a/tools/testing/selftests/net/fcnal-test.sh +++ b/tools/testing/selftests/net/fcnal-test.sh @@ -256,6 +256,28 @@ setup_cmd_nsb() fi } +setup_cmd_nsc() +{ + local cmd="$*" + local rc + + run_cmd_nsc ${cmd} + rc=$? + if [ $rc -ne 0 ]; then + # show user the command if not done so already + if [ "$VERBOSE" = "0" ]; then + echo "setup command: $cmd" + fi + echo "failed. stopping tests" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue" + read a + fi + exit $rc + fi +} + # set sysctl values in NS-A set_sysctl() { @@ -471,6 +493,36 @@ setup() sleep 1 } +setup_lla_only() +{ + # make sure we are starting with a clean slate + kill_procs + cleanup 2>/dev/null + + log_debug "Configuring network namespaces" + set -e + + create_ns ${NSA} "-" "-" + create_ns ${NSB} "-" "-" + create_ns ${NSC} "-" "-" + connect_ns ${NSA} ${NSA_DEV} "-" "-" \ + ${NSB} ${NSB_DEV} "-" "-" + connect_ns ${NSA} ${NSA_DEV2} "-" "-" \ + ${NSC} ${NSC_DEV} "-" "-" + + NSA_LINKIP6=$(get_linklocal ${NSA} ${NSA_DEV}) + NSB_LINKIP6=$(get_linklocal ${NSB} ${NSB_DEV}) + NSC_LINKIP6=$(get_linklocal ${NSC} ${NSC_DEV}) + + create_vrf ${NSA} ${VRF} ${VRF_TABLE} "-" "-" + ip -netns ${NSA} link set dev ${NSA_DEV} vrf ${VRF} + ip -netns ${NSA} link set dev ${NSA_DEV2} vrf ${VRF} + + set +e + + sleep 1 +} + ################################################################################ # IPv4 @@ -3787,10 +3839,53 @@ use_case_br() setup_cmd_nsb ip li del vlan100 2>/dev/null } +# VRF only. +# ns-A device is connected to both ns-B and ns-C on a single VRF but only has +# LLA on the interfaces +use_case_ping_lla_multi() +{ + setup_lla_only + # only want reply from ns-A + setup_cmd_nsb sysctl -qw net.ipv6.icmp.echo_ignore_multicast=1 + setup_cmd_nsc sysctl -qw net.ipv6.icmp.echo_ignore_multicast=1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Pre cycle, ping out ns-B" + + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Pre cycle, ping out ns-C" + + # cycle/flap the first ns-A interface + setup_cmd ip link set ${NSA_DEV} down + setup_cmd ip link set ${NSA_DEV} up + sleep 1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV}, ping out ns-B" + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV}, ping out ns-C" + + # cycle/flap the second ns-A interface + setup_cmd ip link set ${NSA_DEV2} down + setup_cmd ip link set ${NSA_DEV2} up + sleep 1 + + log_start + run_cmd_nsb ping -c1 -w1 ${MCAST}%${NSB_DEV} + log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-B" + run_cmd_nsc ping -c1 -w1 ${MCAST}%${NSC_DEV} + log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-C" +} + use_cases() { log_section "Use cases" + log_subsection "Device enslaved to bridge" use_case_br + log_subsection "Ping LLA with multiple interfaces" + use_case_ping_lla_multi } ################################################################################ diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index db3d4a8b5a4c..76a24052f4b4 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -113,6 +113,9 @@ static void do_poll(int fd, int timeout_ms) interrupted = true; break; } + + /* no events and more time to wait, do poll again */ + continue; } if (pfd.revents != POLLIN) error(1, errno, "poll: 0x%x expected 0x%x\n", diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh index 0e4c0b2eb7f0..80ae7f08b363 100755 --- a/tools/testing/selftests/rcutorture/bin/console-badness.sh +++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh @@ -13,4 +13,5 @@ egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' | grep -v 'ODEBUG: ' | grep -v 'This means that this is a DEBUG kernel and it is' | -grep -v 'Warning: unable to open an initial console' +grep -v 'Warning: unable to open an initial console' | +grep -v 'NOHZ tick-stop error: Non-RCU local softirq work is pending, handler' diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh index 51f3464b96d3..82663495fb38 100644 --- a/tools/testing/selftests/rcutorture/bin/functions.sh +++ b/tools/testing/selftests/rcutorture/bin/functions.sh @@ -169,6 +169,7 @@ identify_qemu () { # Output arguments for the qemu "-append" string based on CPU type # and the TORTURE_QEMU_INTERACTIVE environment variable. identify_qemu_append () { + echo debug_boot_weak_hash local console=ttyS0 case "$1" in qemu-system-x86_64|qemu-system-i386) diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh index 6e65c134e5f1..370406bbfeed 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh @@ -52,8 +52,7 @@ echo Results directory: $resdir/$ds KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM PATH=${KVM}/bin:$PATH; export PATH . functions.sh -cpus="`identify_qemu_vcpus`" -echo Using up to $cpus CPUs. +echo Using all `identify_qemu_vcpus` CPUs. # Each pass through this loop does one command-line argument. for gitbr in $@ @@ -74,7 +73,7 @@ do # Test the specified commit. git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1 echo git checkout return code: $? "(Commit $ntry: $i)" - kvm.sh --cpus $cpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 + kvm.sh --allcpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1 ret=$? echo kvm.sh return code $ret for commit $i from branch $gitbr diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh index aa745152a525..b582113178ac 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh @@ -32,7 +32,7 @@ sed -e 's/^\[[^]]*]//' < $i/console.log | awk ' /-scale: .* gps: .* batches:/ { ngps = $9; - nbatches = $11; + nbatches = 1; } /-scale: .*writer-duration/ { diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh index 6dc2b49b85ea..3cd03d01857c 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh @@ -206,7 +206,10 @@ do kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null` if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1 then - if test $kruntime -ge $seconds -o -f "$TORTURE_STOPFILE" + if test -n "$TORTURE_KCONFIG_GDB_ARG" + then + : + elif test $kruntime -ge $seconds || test -f "$TORTURE_STOPFILE" then break; fi @@ -223,6 +226,20 @@ do echo "ps -fp $killpid" >> $resdir/Warnings 2>&1 ps -fp $killpid >> $resdir/Warnings 2>&1 fi + # Reduce probability of PID reuse by allowing a one-minute buffer + if test $((kruntime + 60)) -lt $seconds && test -s "$resdir/../jitter_pids" + then + awk < "$resdir/../jitter_pids" ' + NF > 0 { + pidlist = pidlist " " $1; + n++; + } + END { + if (n > 0) { + print "kill " pidlist; + } + }' | sh + fi else echo ' ---' `date`: "Kernel done" fi diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 6eb1d3f6524d..45d07b7b69f5 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -58,7 +58,7 @@ usage () { echo " --datestamp string" echo " --defconfig string" echo " --dryrun sched|script" - echo " --duration minutes" + echo " --duration minutes | <seconds>s | <hours>h | <days>d" echo " --gdb" echo " --help" echo " --interactive" @@ -93,7 +93,7 @@ do TORTURE_BOOT_IMAGE="$2" shift ;; - --buildonly) + --buildonly|--build-only) TORTURE_BUILDONLY=1 ;; --configs|--config) @@ -128,8 +128,20 @@ do shift ;; --duration) - checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error' - dur=$(($2*60)) + checkarg --duration "(minutes)" $# "$2" '^[0-9][0-9]*\(s\|m\|h\|d\|\)$' '^error' + mult=60 + if echo "$2" | grep -q 's$' + then + mult=1 + elif echo "$2" | grep -q 'h$' + then + mult=3600 + elif echo "$2" | grep -q 'd$' + then + mult=86400 + fi + ts=`echo $2 | sed -e 's/[smhd]$//'` + dur=$(($ts*mult)) shift ;; --gdb) @@ -148,7 +160,7 @@ do jitter="$2" shift ;; - --kconfig) + --kconfig|--kconfigs) checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$' TORTURE_KCONFIG_ARG="$2" shift @@ -159,7 +171,7 @@ do --kcsan) TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_KCSAN_INTERRUPT_WATCHER=y"; export TORTURE_KCONFIG_KCSAN_ARG ;; - --kmake-arg) + --kmake-arg|--kmake-args) checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$' TORTURE_KMAKE_ARG="$2" shift @@ -459,8 +471,11 @@ function dump(first, pastlast, batchnum) print "if test -n \"$needqemurun\"" print "then" print "\techo ---- Starting kernels. `date` | tee -a " rd "log"; - for (j = 0; j < njitter; j++) + print "\techo > " rd "jitter_pids" + for (j = 0; j < njitter; j++) { print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&" + print "\techo $! >> " rd "jitter_pids" + } print "\twait" print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log"; print "else" diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index e03338091a06..263b1be50008 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -133,7 +133,7 @@ then then summary="$summary Warnings: $n_warn" fi - n_bugs=`egrep -c 'BUG|Oops:' $file` + n_bugs=`egrep -c '\bBUG|Oops:' $file` if test "$n_bugs" -ne 0 then summary="$summary Bugs: $n_bugs" diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t index 6c78022c8cd8..d6557c38dfe4 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t @@ -4,7 +4,8 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n #CHECK#CONFIG_TINY_SRCU=y CONFIG_RCU_TRACE=n -CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_DEBUG_ATOMIC_SLEEP=y #CHECK#CONFIG_PREEMPT_COUNT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u index c15ada821e45..6bc24e99862f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u +++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u @@ -4,7 +4,6 @@ CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n #CHECK#CONFIG_TINY_SRCU=y CONFIG_RCU_TRACE=n -CONFIG_DEBUG_LOCK_ALLOC=y -CONFIG_PROVE_LOCKING=y +CONFIG_DEBUG_LOCK_ALLOC=n CONFIG_DEBUG_OBJECTS_RCU_HEAD=n CONFIG_PREEMPT_COUNT=n diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 index 12e7661b86f5..34c8ff5a12f2 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 @@ -4,8 +4,8 @@ CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=y CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=n -CONFIG_DEBUG_LOCK_ALLOC=y -CONFIG_PROVE_LOCKING=y -#CHECK#CONFIG_PROVE_RCU=y +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +#CHECK#CONFIG_PROVE_RCU=n CONFIG_TASKS_TRACE_RCU_READ_MB=y CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 index b69ed6673c41..77541eeb4e9f 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 @@ -4,8 +4,8 @@ CONFIG_HOTPLUG_CPU=y CONFIG_PREEMPT_NONE=n CONFIG_PREEMPT_VOLUNTARY=n CONFIG_PREEMPT=y -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_LOCKING=n -#CHECK#CONFIG_PROVE_RCU=n +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +#CHECK#CONFIG_PROVE_RCU=y CONFIG_TASKS_TRACE_RCU_READ_MB=n CONFIG_RCU_EXPERT=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon index 87caa0e932c7..90942bb5bebc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -1,2 +1,5 @@ CONFIG_RCU_SCALE_TEST=y CONFIG_PRINTK_TIME=y +CONFIG_TASKS_RCU_GENERIC=y +CONFIG_TASKS_RCU=y +CONFIG_TASKS_TRACE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 new file mode 100644 index 000000000000..e6baa2fbaeb3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 @@ -0,0 +1,15 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot new file mode 100644 index 000000000000..af0aff1457a4 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot @@ -0,0 +1 @@ +rcuscale.scale_type=tasks-tracing diff --git a/tools/testing/selftests/sgx/.gitignore b/tools/testing/selftests/sgx/.gitignore new file mode 100644 index 000000000000..fbaf0bda9a92 --- /dev/null +++ b/tools/testing/selftests/sgx/.gitignore @@ -0,0 +1,2 @@ +test_sgx +test_encl.elf diff --git a/tools/testing/selftests/sgx/Makefile b/tools/testing/selftests/sgx/Makefile new file mode 100644 index 000000000000..7f12d55b97f8 --- /dev/null +++ b/tools/testing/selftests/sgx/Makefile @@ -0,0 +1,57 @@ +top_srcdir = ../../../.. + +include ../lib.mk + +.PHONY: all clean + +CAN_BUILD_X86_64 := $(shell ../x86/check_cc.sh $(CC) \ + ../x86/trivial_64bit_program.c) + +ifndef OBJCOPY +OBJCOPY := $(CROSS_COMPILE)objcopy +endif + +INCLUDES := -I$(top_srcdir)/tools/include +HOST_CFLAGS := -Wall -Werror -g $(INCLUDES) -fPIC -z noexecstack +ENCL_CFLAGS := -Wall -Werror -static -nostdlib -nostartfiles -fPIC \ + -fno-stack-protector -mrdrnd $(INCLUDES) + +TEST_CUSTOM_PROGS := $(OUTPUT)/test_sgx + +ifeq ($(CAN_BUILD_X86_64), 1) +all: $(TEST_CUSTOM_PROGS) $(OUTPUT)/test_encl.elf +endif + +$(OUTPUT)/test_sgx: $(OUTPUT)/main.o \ + $(OUTPUT)/load.o \ + $(OUTPUT)/sigstruct.o \ + $(OUTPUT)/call.o \ + $(OUTPUT)/sign_key.o + $(CC) $(HOST_CFLAGS) -o $@ $^ -lcrypto + +$(OUTPUT)/main.o: main.c + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/load.o: load.c + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/sigstruct.o: sigstruct.c + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/call.o: call.S + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/sign_key.o: sign_key.S + $(CC) $(HOST_CFLAGS) -c $< -o $@ + +$(OUTPUT)/test_encl.elf: test_encl.lds test_encl.c test_encl_bootstrap.S + $(CC) $(ENCL_CFLAGS) -T $^ -o $@ + +EXTRA_CLEAN := \ + $(OUTPUT)/test_encl.elf \ + $(OUTPUT)/load.o \ + $(OUTPUT)/call.o \ + $(OUTPUT)/main.o \ + $(OUTPUT)/sigstruct.o \ + $(OUTPUT)/test_sgx \ + $(OUTPUT)/test_sgx.o \ diff --git a/tools/testing/selftests/sgx/call.S b/tools/testing/selftests/sgx/call.S new file mode 100644 index 000000000000..4ecadc7490f4 --- /dev/null +++ b/tools/testing/selftests/sgx/call.S @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** +* Copyright(c) 2016-20 Intel Corporation. +*/ + + .text + + .global sgx_call_vdso +sgx_call_vdso: + .cfi_startproc + push %r15 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r15, 0 + push %r14 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r14, 0 + push %r13 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r13, 0 + push %r12 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r12, 0 + push %rbx + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rbx, 0 + push $0 + .cfi_adjust_cfa_offset 8 + push 0x38(%rsp) + .cfi_adjust_cfa_offset 8 + call *eenter(%rip) + add $0x10, %rsp + .cfi_adjust_cfa_offset -0x10 + pop %rbx + .cfi_adjust_cfa_offset -8 + pop %r12 + .cfi_adjust_cfa_offset -8 + pop %r13 + .cfi_adjust_cfa_offset -8 + pop %r14 + .cfi_adjust_cfa_offset -8 + pop %r15 + .cfi_adjust_cfa_offset -8 + ret + .cfi_endproc diff --git a/tools/testing/selftests/sgx/defines.h b/tools/testing/selftests/sgx/defines.h new file mode 100644 index 000000000000..592c1ccf4576 --- /dev/null +++ b/tools/testing/selftests/sgx/defines.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ + +#ifndef DEFINES_H +#define DEFINES_H + +#include <stdint.h> + +#define PAGE_SIZE 4096 +#define PAGE_MASK (~(PAGE_SIZE - 1)) + +#define __aligned(x) __attribute__((__aligned__(x))) +#define __packed __attribute__((packed)) + +#include "../../../../arch/x86/kernel/cpu/sgx/arch.h" +#include "../../../../arch/x86/include/asm/enclu.h" +#include "../../../../arch/x86/include/uapi/asm/sgx.h" + +#endif /* DEFINES_H */ diff --git a/tools/testing/selftests/sgx/load.c b/tools/testing/selftests/sgx/load.c new file mode 100644 index 000000000000..9d43b75aaa55 --- /dev/null +++ b/tools/testing/selftests/sgx/load.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <assert.h> +#include <elf.h> +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include "defines.h" +#include "main.h" + +void encl_delete(struct encl *encl) +{ + if (encl->encl_base) + munmap((void *)encl->encl_base, encl->encl_size); + + if (encl->bin) + munmap(encl->bin, encl->bin_size); + + if (encl->fd) + close(encl->fd); + + if (encl->segment_tbl) + free(encl->segment_tbl); + + memset(encl, 0, sizeof(*encl)); +} + +static bool encl_map_bin(const char *path, struct encl *encl) +{ + struct stat sb; + void *bin; + int ret; + int fd; + + fd = open(path, O_RDONLY); + if (fd == -1) { + perror("open()"); + return false; + } + + ret = stat(path, &sb); + if (ret) { + perror("stat()"); + goto err; + } + + bin = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (bin == MAP_FAILED) { + perror("mmap()"); + goto err; + } + + encl->bin = bin; + encl->bin_size = sb.st_size; + + close(fd); + return true; + +err: + close(fd); + return false; +} + +static bool encl_ioc_create(struct encl *encl) +{ + struct sgx_secs *secs = &encl->secs; + struct sgx_enclave_create ioc; + int rc; + + assert(encl->encl_base != 0); + + memset(secs, 0, sizeof(*secs)); + secs->ssa_frame_size = 1; + secs->attributes = SGX_ATTR_MODE64BIT; + secs->xfrm = 3; + secs->base = encl->encl_base; + secs->size = encl->encl_size; + + ioc.src = (unsigned long)secs; + rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_CREATE, &ioc); + if (rc) { + fprintf(stderr, "SGX_IOC_ENCLAVE_CREATE failed: errno=%d\n", + errno); + munmap((void *)secs->base, encl->encl_size); + return false; + } + + return true; +} + +static bool encl_ioc_add_pages(struct encl *encl, struct encl_segment *seg) +{ + struct sgx_enclave_add_pages ioc; + struct sgx_secinfo secinfo; + int rc; + + memset(&secinfo, 0, sizeof(secinfo)); + secinfo.flags = seg->flags; + + ioc.src = (uint64_t)encl->src + seg->offset; + ioc.offset = seg->offset; + ioc.length = seg->size; + ioc.secinfo = (unsigned long)&secinfo; + ioc.flags = SGX_PAGE_MEASURE; + + rc = ioctl(encl->fd, SGX_IOC_ENCLAVE_ADD_PAGES, &ioc); + if (rc < 0) { + fprintf(stderr, "SGX_IOC_ENCLAVE_ADD_PAGES failed: errno=%d.\n", + errno); + return false; + } + + return true; +} + +bool encl_load(const char *path, struct encl *encl) +{ + Elf64_Phdr *phdr_tbl; + off_t src_offset; + Elf64_Ehdr *ehdr; + int i, j; + int ret; + + memset(encl, 0, sizeof(*encl)); + + ret = open("/dev/sgx_enclave", O_RDWR); + if (ret < 0) { + fprintf(stderr, "Unable to open /dev/sgx_enclave\n"); + goto err; + } + + encl->fd = ret; + + if (!encl_map_bin(path, encl)) + goto err; + + ehdr = encl->bin; + phdr_tbl = encl->bin + ehdr->e_phoff; + + for (i = 0; i < ehdr->e_phnum; i++) { + Elf64_Phdr *phdr = &phdr_tbl[i]; + + if (phdr->p_type == PT_LOAD) + encl->nr_segments++; + } + + encl->segment_tbl = calloc(encl->nr_segments, + sizeof(struct encl_segment)); + if (!encl->segment_tbl) + goto err; + + for (i = 0, j = 0; i < ehdr->e_phnum; i++) { + Elf64_Phdr *phdr = &phdr_tbl[i]; + unsigned int flags = phdr->p_flags; + struct encl_segment *seg; + + if (phdr->p_type != PT_LOAD) + continue; + + seg = &encl->segment_tbl[j]; + + if (!!(flags & ~(PF_R | PF_W | PF_X))) { + fprintf(stderr, + "%d has invalid segment flags 0x%02x.\n", i, + phdr->p_flags); + goto err; + } + + if (j == 0 && flags != (PF_R | PF_W)) { + fprintf(stderr, + "TCS has invalid segment flags 0x%02x.\n", + phdr->p_flags); + goto err; + } + + if (j == 0) { + src_offset = phdr->p_offset & PAGE_MASK; + + seg->prot = PROT_READ | PROT_WRITE; + seg->flags = SGX_PAGE_TYPE_TCS << 8; + } else { + seg->prot = (phdr->p_flags & PF_R) ? PROT_READ : 0; + seg->prot |= (phdr->p_flags & PF_W) ? PROT_WRITE : 0; + seg->prot |= (phdr->p_flags & PF_X) ? PROT_EXEC : 0; + seg->flags = (SGX_PAGE_TYPE_REG << 8) | seg->prot; + } + + seg->offset = (phdr->p_offset & PAGE_MASK) - src_offset; + seg->size = (phdr->p_filesz + PAGE_SIZE - 1) & PAGE_MASK; + + printf("0x%016lx 0x%016lx 0x%02x\n", seg->offset, seg->size, + seg->prot); + + j++; + } + + assert(j == encl->nr_segments); + + encl->src = encl->bin + src_offset; + encl->src_size = encl->segment_tbl[j - 1].offset + + encl->segment_tbl[j - 1].size; + + for (encl->encl_size = 4096; encl->encl_size < encl->src_size; ) + encl->encl_size <<= 1; + + return true; + +err: + encl_delete(encl); + return false; +} + +static bool encl_map_area(struct encl *encl) +{ + size_t encl_size = encl->encl_size; + void *area; + + area = mmap(NULL, encl_size * 2, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (area == MAP_FAILED) { + perror("mmap"); + return false; + } + + encl->encl_base = ((uint64_t)area + encl_size - 1) & ~(encl_size - 1); + + munmap(area, encl->encl_base - (uint64_t)area); + munmap((void *)(encl->encl_base + encl_size), + (uint64_t)area + encl_size - encl->encl_base); + + return true; +} + +bool encl_build(struct encl *encl) +{ + struct sgx_enclave_init ioc; + int ret; + int i; + + if (!encl_map_area(encl)) + return false; + + if (!encl_ioc_create(encl)) + return false; + + /* + * Pages must be added before mapping VMAs because their permissions + * cap the VMA permissions. + */ + for (i = 0; i < encl->nr_segments; i++) { + struct encl_segment *seg = &encl->segment_tbl[i]; + + if (!encl_ioc_add_pages(encl, seg)) + return false; + } + + ioc.sigstruct = (uint64_t)&encl->sigstruct; + ret = ioctl(encl->fd, SGX_IOC_ENCLAVE_INIT, &ioc); + if (ret) { + fprintf(stderr, "SGX_IOC_ENCLAVE_INIT failed: errno=%d\n", + errno); + return false; + } + + return true; +} diff --git a/tools/testing/selftests/sgx/main.c b/tools/testing/selftests/sgx/main.c new file mode 100644 index 000000000000..724cec700926 --- /dev/null +++ b/tools/testing/selftests/sgx/main.c @@ -0,0 +1,246 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <elf.h> +#include <errno.h> +#include <fcntl.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/time.h> +#include <sys/types.h> +#include "defines.h" +#include "main.h" +#include "../kselftest.h" + +static const uint64_t MAGIC = 0x1122334455667788ULL; +vdso_sgx_enter_enclave_t eenter; + +struct vdso_symtab { + Elf64_Sym *elf_symtab; + const char *elf_symstrtab; + Elf64_Word *elf_hashtab; +}; + +static void *vdso_get_base_addr(char *envp[]) +{ + Elf64_auxv_t *auxv; + int i; + + for (i = 0; envp[i]; i++) + ; + + auxv = (Elf64_auxv_t *)&envp[i + 1]; + + for (i = 0; auxv[i].a_type != AT_NULL; i++) { + if (auxv[i].a_type == AT_SYSINFO_EHDR) + return (void *)auxv[i].a_un.a_val; + } + + return NULL; +} + +static Elf64_Dyn *vdso_get_dyntab(void *addr) +{ + Elf64_Ehdr *ehdr = addr; + Elf64_Phdr *phdrtab = addr + ehdr->e_phoff; + int i; + + for (i = 0; i < ehdr->e_phnum; i++) + if (phdrtab[i].p_type == PT_DYNAMIC) + return addr + phdrtab[i].p_offset; + + return NULL; +} + +static void *vdso_get_dyn(void *addr, Elf64_Dyn *dyntab, Elf64_Sxword tag) +{ + int i; + + for (i = 0; dyntab[i].d_tag != DT_NULL; i++) + if (dyntab[i].d_tag == tag) + return addr + dyntab[i].d_un.d_ptr; + + return NULL; +} + +static bool vdso_get_symtab(void *addr, struct vdso_symtab *symtab) +{ + Elf64_Dyn *dyntab = vdso_get_dyntab(addr); + + symtab->elf_symtab = vdso_get_dyn(addr, dyntab, DT_SYMTAB); + if (!symtab->elf_symtab) + return false; + + symtab->elf_symstrtab = vdso_get_dyn(addr, dyntab, DT_STRTAB); + if (!symtab->elf_symstrtab) + return false; + + symtab->elf_hashtab = vdso_get_dyn(addr, dyntab, DT_HASH); + if (!symtab->elf_hashtab) + return false; + + return true; +} + +static unsigned long elf_sym_hash(const char *name) +{ + unsigned long h = 0, high; + + while (*name) { + h = (h << 4) + *name++; + high = h & 0xf0000000; + + if (high) + h ^= high >> 24; + + h &= ~high; + } + + return h; +} + +static Elf64_Sym *vdso_symtab_get(struct vdso_symtab *symtab, const char *name) +{ + Elf64_Word bucketnum = symtab->elf_hashtab[0]; + Elf64_Word *buckettab = &symtab->elf_hashtab[2]; + Elf64_Word *chaintab = &symtab->elf_hashtab[2 + bucketnum]; + Elf64_Sym *sym; + Elf64_Word i; + + for (i = buckettab[elf_sym_hash(name) % bucketnum]; i != STN_UNDEF; + i = chaintab[i]) { + sym = &symtab->elf_symtab[i]; + if (!strcmp(name, &symtab->elf_symstrtab[sym->st_name])) + return sym; + } + + return NULL; +} + +bool report_results(struct sgx_enclave_run *run, int ret, uint64_t result, + const char *test) +{ + bool valid = true; + + if (ret) { + printf("FAIL: %s() returned: %d\n", test, ret); + valid = false; + } + + if (run->function != EEXIT) { + printf("FAIL: %s() function, expected: %u, got: %u\n", test, EEXIT, + run->function); + valid = false; + } + + if (result != MAGIC) { + printf("FAIL: %s(), expected: 0x%lx, got: 0x%lx\n", test, MAGIC, + result); + valid = false; + } + + if (run->user_data) { + printf("FAIL: %s() user data, expected: 0x0, got: 0x%llx\n", + test, run->user_data); + valid = false; + } + + return valid; +} + +static int user_handler(long rdi, long rsi, long rdx, long ursp, long r8, long r9, + struct sgx_enclave_run *run) +{ + run->user_data = 0; + return 0; +} + +int main(int argc, char *argv[], char *envp[]) +{ + struct sgx_enclave_run run; + struct vdso_symtab symtab; + Elf64_Sym *eenter_sym; + uint64_t result = 0; + struct encl encl; + unsigned int i; + void *addr; + int ret; + + memset(&run, 0, sizeof(run)); + + if (!encl_load("test_encl.elf", &encl)) { + encl_delete(&encl); + ksft_exit_skip("cannot load enclaves\n"); + } + + if (!encl_measure(&encl)) + goto err; + + if (!encl_build(&encl)) + goto err; + + /* + * An enclave consumer only must do this. + */ + for (i = 0; i < encl.nr_segments; i++) { + struct encl_segment *seg = &encl.segment_tbl[i]; + + addr = mmap((void *)encl.encl_base + seg->offset, seg->size, + seg->prot, MAP_SHARED | MAP_FIXED, encl.fd, 0); + if (addr == MAP_FAILED) { + fprintf(stderr, "mmap() failed, errno=%d.\n", errno); + exit(KSFT_FAIL); + } + } + + memset(&run, 0, sizeof(run)); + run.tcs = encl.encl_base; + + addr = vdso_get_base_addr(envp); + if (!addr) + goto err; + + if (!vdso_get_symtab(addr, &symtab)) + goto err; + + eenter_sym = vdso_symtab_get(&symtab, "__vdso_sgx_enter_enclave"); + if (!eenter_sym) + goto err; + + eenter = addr + eenter_sym->st_value; + + ret = sgx_call_vdso((void *)&MAGIC, &result, 0, EENTER, NULL, NULL, &run); + if (!report_results(&run, ret, result, "sgx_call_vdso")) + goto err; + + + /* Invoke the vDSO directly. */ + result = 0; + ret = eenter((unsigned long)&MAGIC, (unsigned long)&result, 0, EENTER, + 0, 0, &run); + if (!report_results(&run, ret, result, "eenter")) + goto err; + + /* And with an exit handler. */ + run.user_handler = (__u64)user_handler; + run.user_data = 0xdeadbeef; + ret = eenter((unsigned long)&MAGIC, (unsigned long)&result, 0, EENTER, + 0, 0, &run); + if (!report_results(&run, ret, result, "user_handler")) + goto err; + + printf("SUCCESS\n"); + encl_delete(&encl); + exit(KSFT_PASS); + +err: + encl_delete(&encl); + exit(KSFT_FAIL); +} diff --git a/tools/testing/selftests/sgx/main.h b/tools/testing/selftests/sgx/main.h new file mode 100644 index 000000000000..67211a708f04 --- /dev/null +++ b/tools/testing/selftests/sgx/main.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ + +#ifndef MAIN_H +#define MAIN_H + +struct encl_segment { + off_t offset; + size_t size; + unsigned int prot; + unsigned int flags; +}; + +struct encl { + int fd; + void *bin; + off_t bin_size; + void *src; + size_t src_size; + size_t encl_size; + off_t encl_base; + unsigned int nr_segments; + struct encl_segment *segment_tbl; + struct sgx_secs secs; + struct sgx_sigstruct sigstruct; +}; + +extern unsigned char sign_key[]; +extern unsigned char sign_key_end[]; + +void encl_delete(struct encl *ctx); +bool encl_load(const char *path, struct encl *encl); +bool encl_measure(struct encl *encl); +bool encl_build(struct encl *encl); + +int sgx_call_vdso(void *rdi, void *rsi, long rdx, u32 function, void *r8, void *r9, + struct sgx_enclave_run *run); + +#endif /* MAIN_H */ diff --git a/tools/testing/selftests/sgx/sign_key.S b/tools/testing/selftests/sgx/sign_key.S new file mode 100644 index 000000000000..e4fbe948444a --- /dev/null +++ b/tools/testing/selftests/sgx/sign_key.S @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/** +* Copyright(c) 2016-20 Intel Corporation. +*/ + + .section ".rodata", "a" + +sign_key: + .globl sign_key + .incbin "sign_key.pem" +sign_key_end: + .globl sign_key_end diff --git a/tools/testing/selftests/sgx/sign_key.pem b/tools/testing/selftests/sgx/sign_key.pem new file mode 100644 index 000000000000..d76f21f19187 --- /dev/null +++ b/tools/testing/selftests/sgx/sign_key.pem @@ -0,0 +1,39 @@ +-----BEGIN RSA PRIVATE KEY----- +MIIG4wIBAAKCAYEApalGbq7Q+usM91CPtksu3D+b0Prc8gAFL6grM3mg85A5Bx8V +cfMXPgtrw8EYFwQxDAvzZWwl+9VfOX0ECrFRBkOHcOiG0SnADN8+FLj1UiNUQwbp +S6OzhNWuRcSbGraSOyUlVlV0yMQSvewyzGklOaXBe30AJqzIBc8QfdSxKuP8rs0Z +ga6k/Bl73osrYKByILJTUUeZqjLERsE6GebsdzbWgKn8qVqng4ZS4yMNg6LeRlH3 ++9CIPgg4jwpSLHcp7dq2qTIB9a0tGe9ayp+5FbucpB6U7ePold0EeRN6RlJGDF9k +L93v8P5ykz5G5gYZ2g0K1X2sHIWV4huxPgv5PXgdyQYbK+6olqj0d5rjYuwX57Ul +k6SroPS1U6UbdCjG5txM+BNGU0VpD0ZhrIRw0leQdnNcCO9sTJuInZrgYacSVJ7u +mtB+uCt+uzUesc+l+xPRYA+9e14lLkZp7AAmo9FvL816XDI09deehJ3i/LmHKCRN +tuqC5TprRjFwUr6dAgEDAoIBgG5w2Z8fNfycs0+LCnmHdJLVEotR6KFVWMpwHMz7 +wKJgJgS/Y6FMuilc8oKAuroCy11dTO5IGVKOP3uorVx2NgQtBPXwWeDGgAiU1A3Q +o4wXjYIEm4fCd63jyYPYZ2ckYXzDbjmOTdstYdPyzIhGGNEZK6eoqsRzMAPfYFPj +IMdCqHSIu6vJw1K7p+myHOsVoWshjODaZnF3LYSA0WaZ8vokjwBxUxuRxQJZjJds +s60XPtmL+qfgWtQFewoG4XL6GuD8FcXccynRRtzrLtFNPIl9BQfWfjBBhTC1/Te1 +0Z6XbZvpdUTD9OfLB7SbR2OUFNpKQgriO0iYVdbW3cr7uu38Zwp4W1TX73DPjoi6 +KNooP6SGWd4mRJW2+dUmSYS4QNG8eVVZswKcploEIXlAKRsOe4kzJJ1iETugIe85 +uX8nd1WYEp65xwoRUg8hqng0MeyveVbXqNKuJG6tzNDt9kgFYo+hmC/oouAW2Dtc +T9jdRAwKJXqA2Eg6OkgXCEv+kwKBwQDYaQiFMlFhsmLlqI+EzCUh7c941/cL7m6U +7j98+8ngl0HgCEcrc10iJVCKakQW3YbPzAx3XkKTaGjWazvvrFarXIGlOud64B8a +iWyQ7VdlnmZnNEdk+C83tI91OQeaTKqRLDGzKh29Ry/jL8Pcbazt+kDgxa0H7qJp +roADUanLQuNkYubpbhFBh3xpa2EExaVq6rF7nIVsD8W9TrbmPKA4LgH7z0iy544D +kVCNYsTjYDdUWP+WiSor8kCnnpjnN9sCgcEAw/eNezUD1UDf6OYFC9+5JZJFn4Tg +mZMyN93JKIb199ffwnjtHUSjcyiWeesXucpzwtGbTcwQnDisSW4oneYKLSEBlBaq +scqiUugyGZZOthFSCbdXYXMViK2vHrKlkse7GxVlROKcEhM/pRBrmjaGO8eWR+D4 +FO2wCXzVs3KgV6j779frw0vC54oHOxc9+Lu1rSHp4i+600koyvL/zF6U/5tZXIvN +YW2yoiQJnjCmVA1pwbwV6KAUTPDTMnBK+YjnAoHBAJBGBa4hi5Z27JkbCliIGMFJ +NPs6pLKe9GNJf6in2+sPgUAFhMeiPhbDiwbxgrnpBIqICE+ULGJFmzmc0p/IOceT +ARjR76dAFLxbnbXzj5kURETNhO36yiUjCk4mBRGIcbYddndxaSjaH+zKgpLzyJ6m +1esuc1qfFvEfAAI2cTIsl5hB70ZJYNZaUvDyQK3ZGPHxy6e9rkgKg9OJz0QoatAe +q/002yHvtAJg4F5B2JeVejg7VQ8GHB1MKxppu0TP5wKBwQCCpQj8zgKOKz/wmViy +lSYZDC5qWJW7t3bP6TDFr06lOpUsUJ4TgxeiGw778g/RMaKB4RIz3WBoJcgw9BsT +7rFza1ZiucchMcGMmswRDt8kC4wGejpA92Owc8oUdxkMhSdnY5jYlxK2t3/DYEe8 +JFl9L7mFQKVjSSAGUzkiTGrlG1Kf5UfXh9dFBq98uilQfSPIwUaWynyM23CHTKqI +Pw3/vOY9sojrnncWwrEUIG7is5vWfWPwargzSzd29YdRBe8CgcEAuRVewK/YeNOX +B7ZG6gKKsfsvrGtY7FPETzLZAHjoVXYNea4LVZ2kn4hBXXlvw/4HD+YqcTt4wmif +5JQlDvjNobUiKJZpzy7hklVhF7wZFl4pCF7Yh43q9iQ7gKTaeUG7MiaK+G8Zz8aY +HW9rsiihbdZkccMvnPfO9334XMxl3HtBRzLstjUlbLB7Sdh+7tZ3JQidCOFNs5pE +XyWwnASPu4tKfDahH1UUTp1uJcq/6716CSWg080avYxFcn75qqsb +-----END RSA PRIVATE KEY----- diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c new file mode 100644 index 000000000000..dee7a3d6c5a5 --- /dev/null +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#define _GNU_SOURCE +#include <assert.h> +#include <getopt.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <unistd.h> +#include <openssl/err.h> +#include <openssl/pem.h> +#include "defines.h" +#include "main.h" + +struct q1q2_ctx { + BN_CTX *bn_ctx; + BIGNUM *m; + BIGNUM *s; + BIGNUM *q1; + BIGNUM *qr; + BIGNUM *q2; +}; + +static void free_q1q2_ctx(struct q1q2_ctx *ctx) +{ + BN_CTX_free(ctx->bn_ctx); + BN_free(ctx->m); + BN_free(ctx->s); + BN_free(ctx->q1); + BN_free(ctx->qr); + BN_free(ctx->q2); +} + +static bool alloc_q1q2_ctx(const uint8_t *s, const uint8_t *m, + struct q1q2_ctx *ctx) +{ + ctx->bn_ctx = BN_CTX_new(); + ctx->s = BN_bin2bn(s, SGX_MODULUS_SIZE, NULL); + ctx->m = BN_bin2bn(m, SGX_MODULUS_SIZE, NULL); + ctx->q1 = BN_new(); + ctx->qr = BN_new(); + ctx->q2 = BN_new(); + + if (!ctx->bn_ctx || !ctx->s || !ctx->m || !ctx->q1 || !ctx->qr || + !ctx->q2) { + free_q1q2_ctx(ctx); + return false; + } + + return true; +} + +static bool calc_q1q2(const uint8_t *s, const uint8_t *m, uint8_t *q1, + uint8_t *q2) +{ + struct q1q2_ctx ctx; + + if (!alloc_q1q2_ctx(s, m, &ctx)) { + fprintf(stderr, "Not enough memory for Q1Q2 calculation\n"); + return false; + } + + if (!BN_mul(ctx.q1, ctx.s, ctx.s, ctx.bn_ctx)) + goto out; + + if (!BN_div(ctx.q1, ctx.qr, ctx.q1, ctx.m, ctx.bn_ctx)) + goto out; + + if (BN_num_bytes(ctx.q1) > SGX_MODULUS_SIZE) { + fprintf(stderr, "Too large Q1 %d bytes\n", + BN_num_bytes(ctx.q1)); + goto out; + } + + if (!BN_mul(ctx.q2, ctx.s, ctx.qr, ctx.bn_ctx)) + goto out; + + if (!BN_div(ctx.q2, NULL, ctx.q2, ctx.m, ctx.bn_ctx)) + goto out; + + if (BN_num_bytes(ctx.q2) > SGX_MODULUS_SIZE) { + fprintf(stderr, "Too large Q2 %d bytes\n", + BN_num_bytes(ctx.q2)); + goto out; + } + + BN_bn2bin(ctx.q1, q1); + BN_bn2bin(ctx.q2, q2); + + free_q1q2_ctx(&ctx); + return true; +out: + free_q1q2_ctx(&ctx); + return false; +} + +struct sgx_sigstruct_payload { + struct sgx_sigstruct_header header; + struct sgx_sigstruct_body body; +}; + +static bool check_crypto_errors(void) +{ + int err; + bool had_errors = false; + const char *filename; + int line; + char str[256]; + + for ( ; ; ) { + if (ERR_peek_error() == 0) + break; + + had_errors = true; + err = ERR_get_error_line(&filename, &line); + ERR_error_string_n(err, str, sizeof(str)); + fprintf(stderr, "crypto: %s: %s:%d\n", str, filename, line); + } + + return had_errors; +} + +static inline const BIGNUM *get_modulus(RSA *key) +{ + const BIGNUM *n; + + RSA_get0_key(key, &n, NULL, NULL); + return n; +} + +static RSA *gen_sign_key(void) +{ + unsigned long sign_key_length; + BIO *bio; + RSA *key; + + sign_key_length = (unsigned long)&sign_key_end - + (unsigned long)&sign_key; + + bio = BIO_new_mem_buf(&sign_key, sign_key_length); + if (!bio) + return NULL; + + key = PEM_read_bio_RSAPrivateKey(bio, NULL, NULL, NULL); + BIO_free(bio); + + return key; +} + +static void reverse_bytes(void *data, int length) +{ + int i = 0; + int j = length - 1; + uint8_t temp; + uint8_t *ptr = data; + + while (i < j) { + temp = ptr[i]; + ptr[i] = ptr[j]; + ptr[j] = temp; + i++; + j--; + } +} + +enum mrtags { + MRECREATE = 0x0045544145524345, + MREADD = 0x0000000044444145, + MREEXTEND = 0x00444E4554584545, +}; + +static bool mrenclave_update(EVP_MD_CTX *ctx, const void *data) +{ + if (!EVP_DigestUpdate(ctx, data, 64)) { + fprintf(stderr, "digest update failed\n"); + return false; + } + + return true; +} + +static bool mrenclave_commit(EVP_MD_CTX *ctx, uint8_t *mrenclave) +{ + unsigned int size; + + if (!EVP_DigestFinal_ex(ctx, (unsigned char *)mrenclave, &size)) { + fprintf(stderr, "digest commit failed\n"); + return false; + } + + if (size != 32) { + fprintf(stderr, "invalid digest size = %u\n", size); + return false; + } + + return true; +} + +struct mrecreate { + uint64_t tag; + uint32_t ssaframesize; + uint64_t size; + uint8_t reserved[44]; +} __attribute__((__packed__)); + + +static bool mrenclave_ecreate(EVP_MD_CTX *ctx, uint64_t blob_size) +{ + struct mrecreate mrecreate; + uint64_t encl_size; + + for (encl_size = 0x1000; encl_size < blob_size; ) + encl_size <<= 1; + + memset(&mrecreate, 0, sizeof(mrecreate)); + mrecreate.tag = MRECREATE; + mrecreate.ssaframesize = 1; + mrecreate.size = encl_size; + + if (!EVP_DigestInit_ex(ctx, EVP_sha256(), NULL)) + return false; + + return mrenclave_update(ctx, &mrecreate); +} + +struct mreadd { + uint64_t tag; + uint64_t offset; + uint64_t flags; /* SECINFO flags */ + uint8_t reserved[40]; +} __attribute__((__packed__)); + +static bool mrenclave_eadd(EVP_MD_CTX *ctx, uint64_t offset, uint64_t flags) +{ + struct mreadd mreadd; + + memset(&mreadd, 0, sizeof(mreadd)); + mreadd.tag = MREADD; + mreadd.offset = offset; + mreadd.flags = flags; + + return mrenclave_update(ctx, &mreadd); +} + +struct mreextend { + uint64_t tag; + uint64_t offset; + uint8_t reserved[48]; +} __attribute__((__packed__)); + +static bool mrenclave_eextend(EVP_MD_CTX *ctx, uint64_t offset, + const uint8_t *data) +{ + struct mreextend mreextend; + int i; + + for (i = 0; i < 0x1000; i += 0x100) { + memset(&mreextend, 0, sizeof(mreextend)); + mreextend.tag = MREEXTEND; + mreextend.offset = offset + i; + + if (!mrenclave_update(ctx, &mreextend)) + return false; + + if (!mrenclave_update(ctx, &data[i + 0x00])) + return false; + + if (!mrenclave_update(ctx, &data[i + 0x40])) + return false; + + if (!mrenclave_update(ctx, &data[i + 0x80])) + return false; + + if (!mrenclave_update(ctx, &data[i + 0xC0])) + return false; + } + + return true; +} + +static bool mrenclave_segment(EVP_MD_CTX *ctx, struct encl *encl, + struct encl_segment *seg) +{ + uint64_t end = seg->offset + seg->size; + uint64_t offset; + + for (offset = seg->offset; offset < end; offset += PAGE_SIZE) { + if (!mrenclave_eadd(ctx, offset, seg->flags)) + return false; + + if (!mrenclave_eextend(ctx, offset, encl->src + offset)) + return false; + } + + return true; +} + +bool encl_measure(struct encl *encl) +{ + uint64_t header1[2] = {0x000000E100000006, 0x0000000000010000}; + uint64_t header2[2] = {0x0000006000000101, 0x0000000100000060}; + struct sgx_sigstruct *sigstruct = &encl->sigstruct; + struct sgx_sigstruct_payload payload; + uint8_t digest[SHA256_DIGEST_LENGTH]; + unsigned int siglen; + RSA *key = NULL; + EVP_MD_CTX *ctx; + int i; + + memset(sigstruct, 0, sizeof(*sigstruct)); + + sigstruct->header.header1[0] = header1[0]; + sigstruct->header.header1[1] = header1[1]; + sigstruct->header.header2[0] = header2[0]; + sigstruct->header.header2[1] = header2[1]; + sigstruct->exponent = 3; + sigstruct->body.attributes = SGX_ATTR_MODE64BIT; + sigstruct->body.xfrm = 3; + + /* sanity check */ + if (check_crypto_errors()) + goto err; + + key = gen_sign_key(); + if (!key) { + ERR_print_errors_fp(stdout); + goto err; + } + + BN_bn2bin(get_modulus(key), sigstruct->modulus); + + ctx = EVP_MD_CTX_create(); + if (!ctx) + goto err; + + if (!mrenclave_ecreate(ctx, encl->src_size)) + goto err; + + for (i = 0; i < encl->nr_segments; i++) { + struct encl_segment *seg = &encl->segment_tbl[i]; + + if (!mrenclave_segment(ctx, encl, seg)) + goto err; + } + + if (!mrenclave_commit(ctx, sigstruct->body.mrenclave)) + goto err; + + memcpy(&payload.header, &sigstruct->header, sizeof(sigstruct->header)); + memcpy(&payload.body, &sigstruct->body, sizeof(sigstruct->body)); + + SHA256((unsigned char *)&payload, sizeof(payload), digest); + + if (!RSA_sign(NID_sha256, digest, SHA256_DIGEST_LENGTH, + sigstruct->signature, &siglen, key)) + goto err; + + if (!calc_q1q2(sigstruct->signature, sigstruct->modulus, sigstruct->q1, + sigstruct->q2)) + goto err; + + /* BE -> LE */ + reverse_bytes(sigstruct->signature, SGX_MODULUS_SIZE); + reverse_bytes(sigstruct->modulus, SGX_MODULUS_SIZE); + reverse_bytes(sigstruct->q1, SGX_MODULUS_SIZE); + reverse_bytes(sigstruct->q2, SGX_MODULUS_SIZE); + + EVP_MD_CTX_destroy(ctx); + RSA_free(key); + return true; + +err: + EVP_MD_CTX_destroy(ctx); + RSA_free(key); + return false; +} diff --git a/tools/testing/selftests/sgx/test_encl.c b/tools/testing/selftests/sgx/test_encl.c new file mode 100644 index 000000000000..cf25b5dc1e03 --- /dev/null +++ b/tools/testing/selftests/sgx/test_encl.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2016-20 Intel Corporation. */ + +#include <stddef.h> +#include "defines.h" + +static void *memcpy(void *dest, const void *src, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + ((char *)dest)[i] = ((char *)src)[i]; + + return dest; +} + +void encl_body(void *rdi, void *rsi) +{ + memcpy(rsi, rdi, 8); +} diff --git a/tools/testing/selftests/sgx/test_encl.lds b/tools/testing/selftests/sgx/test_encl.lds new file mode 100644 index 000000000000..0fbbda7e665e --- /dev/null +++ b/tools/testing/selftests/sgx/test_encl.lds @@ -0,0 +1,40 @@ +OUTPUT_FORMAT(elf64-x86-64) + +PHDRS +{ + tcs PT_LOAD; + text PT_LOAD; + data PT_LOAD; +} + +SECTIONS +{ + . = 0; + .tcs : { + *(.tcs*) + } : tcs + + . = ALIGN(4096); + .text : { + *(.text*) + *(.rodata*) + } : text + + . = ALIGN(4096); + .data : { + *(.data*) + } : data + + /DISCARD/ : { + *(.comment*) + *(.note*) + *(.debug*) + *(.eh_frame*) + } +} + +ASSERT(!DEFINED(.altinstructions), "ALTERNATIVES are not supported in enclaves") +ASSERT(!DEFINED(.altinstr_replacement), "ALTERNATIVES are not supported in enclaves") +ASSERT(!DEFINED(.discard.retpoline_safe), "RETPOLINE ALTERNATIVES are not supported in enclaves") +ASSERT(!DEFINED(.discard.nospec), "RETPOLINE ALTERNATIVES are not supported in enclaves") +ASSERT(!DEFINED(.got.plt), "Libcalls are not supported in enclaves") diff --git a/tools/testing/selftests/sgx/test_encl_bootstrap.S b/tools/testing/selftests/sgx/test_encl_bootstrap.S new file mode 100644 index 000000000000..5d5680d4ea39 --- /dev/null +++ b/tools/testing/selftests/sgx/test_encl_bootstrap.S @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright(c) 2016-20 Intel Corporation. + */ + + .macro ENCLU + .byte 0x0f, 0x01, 0xd7 + .endm + + .section ".tcs", "aw" + .balign 4096 + + .fill 1, 8, 0 # STATE (set by CPU) + .fill 1, 8, 0 # FLAGS + .quad encl_ssa # OSSA + .fill 1, 4, 0 # CSSA (set by CPU) + .fill 1, 4, 1 # NSSA + .quad encl_entry # OENTRY + .fill 1, 8, 0 # AEP (set by EENTER and ERESUME) + .fill 1, 8, 0 # OFSBASE + .fill 1, 8, 0 # OGSBASE + .fill 1, 4, 0xFFFFFFFF # FSLIMIT + .fill 1, 4, 0xFFFFFFFF # GSLIMIT + .fill 4024, 1, 0 # Reserved + + # Identical to the previous TCS. + .fill 1, 8, 0 # STATE (set by CPU) + .fill 1, 8, 0 # FLAGS + .quad encl_ssa # OSSA + .fill 1, 4, 0 # CSSA (set by CPU) + .fill 1, 4, 1 # NSSA + .quad encl_entry # OENTRY + .fill 1, 8, 0 # AEP (set by EENTER and ERESUME) + .fill 1, 8, 0 # OFSBASE + .fill 1, 8, 0 # OGSBASE + .fill 1, 4, 0xFFFFFFFF # FSLIMIT + .fill 1, 4, 0xFFFFFFFF # GSLIMIT + .fill 4024, 1, 0 # Reserved + + .text + +encl_entry: + # RBX contains the base address for TCS, which is also the first address + # inside the enclave. By adding the value of le_stack_end to it, we get + # the absolute address for the stack. + lea (encl_stack)(%rbx), %rax + xchg %rsp, %rax + push %rax + + push %rcx # push the address after EENTER + push %rbx # push the enclave base address + + call encl_body + + pop %rbx # pop the enclave base address + + /* Clear volatile GPRs, except RAX (EEXIT function). */ + xor %rcx, %rcx + xor %rdx, %rdx + xor %rdi, %rdi + xor %rsi, %rsi + xor %r8, %r8 + xor %r9, %r9 + xor %r10, %r10 + xor %r11, %r11 + + # Reset status flags. + add %rdx, %rdx # OF = SF = AF = CF = 0; ZF = PF = 1 + + # Prepare EEXIT target by popping the address of the instruction after + # EENTER to RBX. + pop %rbx + + # Restore the caller stack. + pop %rax + mov %rax, %rsp + + # EEXIT + mov $4, %rax + enclu + + .section ".data", "aw" + +encl_ssa: + .space 4096 + + .balign 4096 + .space 8192 +encl_stack: diff --git a/tools/testing/selftests/syscall_user_dispatch/.gitignore b/tools/testing/selftests/syscall_user_dispatch/.gitignore new file mode 100644 index 000000000000..f539615ad5da --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +sud_test +sud_benchmark diff --git a/tools/testing/selftests/syscall_user_dispatch/Makefile b/tools/testing/selftests/syscall_user_dispatch/Makefile new file mode 100644 index 000000000000..03c120270953 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +top_srcdir = ../../../.. +INSTALL_HDR_PATH = $(top_srcdir)/usr +LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ + +CFLAGS += -Wall -I$(LINUX_HDR_PATH) + +TEST_GEN_PROGS := sud_test sud_benchmark +include ../lib.mk diff --git a/tools/testing/selftests/syscall_user_dispatch/config b/tools/testing/selftests/syscall_user_dispatch/config new file mode 100644 index 000000000000..039e303e59d7 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/config @@ -0,0 +1 @@ +CONFIG_GENERIC_ENTRY=y diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c new file mode 100644 index 000000000000..6689f1183dbf --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Collabora Ltd. + * + * Benchmark and test syscall user dispatch + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <signal.h> +#include <errno.h> +#include <time.h> +#include <sys/time.h> +#include <unistd.h> +#include <sys/sysinfo.h> +#include <sys/prctl.h> +#include <sys/syscall.h> + +#ifndef PR_SET_SYSCALL_USER_DISPATCH +# define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +#endif + +#ifdef __NR_syscalls +# define MAGIC_SYSCALL_1 (__NR_syscalls + 1) /* Bad Linux syscall number */ +#else +# define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ +#endif + +/* + * To test returning from a sigsys with selector blocked, the test + * requires some per-architecture support (i.e. knowledge about the + * signal trampoline address). On i386, we know it is on the vdso, and + * a small trampoline is open-coded for x86_64. Other architectures + * that have a trampoline in the vdso will support TEST_BLOCKED_RETURN + * out of the box, but don't enable them until they support syscall user + * dispatch. + */ +#if defined(__x86_64__) || defined(__i386__) +#define TEST_BLOCKED_RETURN +#endif + +#ifdef __x86_64__ +void* (syscall_dispatcher_start)(void); +void* (syscall_dispatcher_end)(void); +#else +unsigned long syscall_dispatcher_start = 0; +unsigned long syscall_dispatcher_end = 0; +#endif + +unsigned long trapped_call_count = 0; +unsigned long native_call_count = 0; + +char selector; +#define SYSCALL_BLOCK (selector = PR_SYS_DISPATCH_ON) +#define SYSCALL_UNBLOCK (selector = PR_SYS_DISPATCH_OFF) + +#define CALIBRATION_STEP 100000 +#define CALIBRATE_TO_SECS 5 +int factor; + +static double one_sysinfo_step(void) +{ + struct timespec t1, t2; + int i; + struct sysinfo info; + + clock_gettime(CLOCK_MONOTONIC, &t1); + for (i = 0; i < CALIBRATION_STEP; i++) + sysinfo(&info); + clock_gettime(CLOCK_MONOTONIC, &t2); + return (t2.tv_sec - t1.tv_sec) + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec); +} + +static void calibrate_set(void) +{ + double elapsed = 0; + + printf("Calibrating test set to last ~%d seconds...\n", CALIBRATE_TO_SECS); + + while (elapsed < 1) { + elapsed += one_sysinfo_step(); + factor += CALIBRATE_TO_SECS; + } + + printf("test iterations = %d\n", CALIBRATION_STEP * factor); +} + +static double perf_syscall(void) +{ + unsigned int i; + double partial = 0; + + for (i = 0; i < factor; ++i) + partial += one_sysinfo_step()/(CALIBRATION_STEP*factor); + return partial; +} + +static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) +{ + char buf[1024]; + int len; + + SYSCALL_UNBLOCK; + + /* printf and friends are not signal-safe. */ + len = snprintf(buf, 1024, "Caught sys_%x\n", info->si_syscall); + write(1, buf, len); + + if (info->si_syscall == MAGIC_SYSCALL_1) + trapped_call_count++; + else + native_call_count++; + +#ifdef TEST_BLOCKED_RETURN + SYSCALL_BLOCK; +#endif + +#ifdef __x86_64__ + __asm__ volatile("movq $0xf, %rax"); + __asm__ volatile("leaveq"); + __asm__ volatile("add $0x8, %rsp"); + __asm__ volatile("syscall_dispatcher_start:"); + __asm__ volatile("syscall"); + __asm__ volatile("nop"); /* Landing pad within dispatcher area */ + __asm__ volatile("syscall_dispatcher_end:"); +#endif + +} + +int main(void) +{ + struct sigaction act; + double time1, time2; + int ret; + sigset_t mask; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + calibrate_set(); + + time1 = perf_syscall(); + printf("Avg syscall time %.0lfns.\n", time1 * 1.0e9); + + ret = sigaction(SIGSYS, &act, NULL); + if (ret) { + perror("Error sigaction:"); + exit(-1); + } + + fprintf(stderr, "Enabling syscall trapping.\n"); + + if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, + syscall_dispatcher_start, + (syscall_dispatcher_end - syscall_dispatcher_start + 1), + &selector)) { + perror("prctl failed\n"); + exit(-1); + } + + SYSCALL_BLOCK; + syscall(MAGIC_SYSCALL_1); + +#ifdef TEST_BLOCKED_RETURN + if (selector == PR_SYS_DISPATCH_OFF) { + fprintf(stderr, "Failed to return with selector blocked.\n"); + exit(-1); + } +#endif + + SYSCALL_UNBLOCK; + + if (!trapped_call_count) { + fprintf(stderr, "syscall trapping does not work.\n"); + exit(-1); + } + + time2 = perf_syscall(); + + if (native_call_count) { + perror("syscall trapping intercepted more syscalls than expected\n"); + exit(-1); + } + + printf("trapped_call_count %lu, native_call_count %lu.\n", + trapped_call_count, native_call_count); + printf("Avg syscall time %.0lfns.\n", time2 * 1.0e9); + printf("Interception overhead: %.1lf%% (+%.0lfns).\n", + 100.0 * (time2 / time1 - 1.0), 1.0e9 * (time2 - time1)); + return 0; + +} diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_test.c b/tools/testing/selftests/syscall_user_dispatch/sud_test.c new file mode 100644 index 000000000000..6498b050ef89 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/sud_test.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Collabora Ltd. + * + * Test code for syscall user dispatch + */ + +#define _GNU_SOURCE +#include <sys/prctl.h> +#include <sys/sysinfo.h> +#include <sys/syscall.h> +#include <signal.h> + +#include <asm/unistd.h> +#include "../kselftest_harness.h" + +#ifndef PR_SET_SYSCALL_USER_DISPATCH +# define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +#endif + +#ifndef SYS_USER_DISPATCH +# define SYS_USER_DISPATCH 2 +#endif + +#ifdef __NR_syscalls +# define MAGIC_SYSCALL_1 (__NR_syscalls + 1) /* Bad Linux syscall number */ +#else +# define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ +#endif + +#define SYSCALL_DISPATCH_ON(x) ((x) = 1) +#define SYSCALL_DISPATCH_OFF(x) ((x) = 0) + +/* Test Summary: + * + * - dispatch_trigger_sigsys: Verify if PR_SET_SYSCALL_USER_DISPATCH is + * able to trigger SIGSYS on a syscall. + * + * - bad_selector: Test that a bad selector value triggers SIGSYS with + * si_errno EINVAL. + * + * - bad_prctl_param: Test that the API correctly rejects invalid + * parameters on prctl + * + * - dispatch_and_return: Test that a syscall is selectively dispatched + * to userspace depending on the value of selector. + * + * - disable_dispatch: Test that the PR_SYS_DISPATCH_OFF correctly + * disables the dispatcher + * + * - direct_dispatch_range: Test that a syscall within the allowed range + * can bypass the dispatcher. + */ + +TEST_SIGNAL(dispatch_trigger_sigsys, SIGSYS) +{ + char sel = 0; + struct sysinfo info; + int ret; + + ret = sysinfo(&info); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + SYSCALL_DISPATCH_ON(sel); + + sysinfo(&info); + + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +TEST(bad_prctl_param) +{ + char sel = 0; + int op; + + /* Invalid op */ + op = -1; + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0, 0, &sel); + ASSERT_EQ(EINVAL, errno); + + /* PR_SYS_DISPATCH_OFF */ + op = PR_SYS_DISPATCH_OFF; + + /* offset != 0 */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, 0); + EXPECT_EQ(EINVAL, errno); + + /* len != 0 */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0xff, 0); + EXPECT_EQ(EINVAL, errno); + + /* sel != NULL */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + + /* Valid parameter */ + errno = 0; + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, 0x0); + EXPECT_EQ(0, errno); + + /* PR_SYS_DISPATCH_ON */ + op = PR_SYS_DISPATCH_ON; + + /* Dispatcher region is bad (offset > 0 && len == 0) */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, -1L, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + + /* Invalid selector */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x1, (void *) -1); + ASSERT_EQ(EFAULT, errno); + + /* + * Dispatcher range overflows unsigned long + */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 1, -1L, &sel); + ASSERT_EQ(EINVAL, errno) { + TH_LOG("Should reject bad syscall range"); + } + + /* + * Allowed range overflows usigned long + */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, -1L, 0x1, &sel); + ASSERT_EQ(EINVAL, errno) { + TH_LOG("Should reject bad syscall range"); + } +} + +/* + * Use global selector for handle_sigsys tests, to avoid passing + * selector to signal handler + */ +char glob_sel; +int nr_syscalls_emulated; +int si_code; +int si_errno; + +static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) +{ + si_code = info->si_code; + si_errno = info->si_errno; + + if (info->si_syscall == MAGIC_SYSCALL_1) + nr_syscalls_emulated++; + + /* In preparation for sigreturn. */ + SYSCALL_DISPATCH_OFF(glob_sel); +} + +TEST(dispatch_and_return) +{ + long ret; + struct sigaction act; + sigset_t mask; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret); + + /* Make sure selector is good prior to prctl. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + /* MAGIC_SYSCALL_1 doesn't exist. */ + SYSCALL_DISPATCH_OFF(glob_sel); + ret = syscall(MAGIC_SYSCALL_1); + EXPECT_EQ(-1, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } + + /* MAGIC_SYSCALL_1 should be emulated. */ + nr_syscalls_emulated = 0; + SYSCALL_DISPATCH_ON(glob_sel); + + ret = syscall(MAGIC_SYSCALL_1); + EXPECT_EQ(MAGIC_SYSCALL_1, ret) { + TH_LOG("Failed to intercept syscall"); + } + EXPECT_EQ(1, nr_syscalls_emulated) { + TH_LOG("Failed to emulate syscall"); + } + ASSERT_EQ(SYS_USER_DISPATCH, si_code) { + TH_LOG("Bad si_code in SIGSYS"); + } + ASSERT_EQ(0, si_errno) { + TH_LOG("Bad si_errno in SIGSYS"); + } +} + +TEST_SIGNAL(bad_selector, SIGSYS) +{ + long ret; + struct sigaction act; + sigset_t mask; + struct sysinfo info; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret); + + /* Make sure selector is good prior to prctl. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + glob_sel = -1; + + sysinfo(&info); + + /* Even though it is ready to catch SIGSYS, the signal is + * supposed to be uncatchable. + */ + + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +TEST(disable_dispatch) +{ + int ret; + struct sysinfo info; + char sel = 0; + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + /* MAGIC_SYSCALL_1 doesn't exist. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_OFF, 0, 0, 0); + EXPECT_EQ(0, ret) { + TH_LOG("Failed to unset syscall user dispatch"); + } + + /* Shouldn't have any effect... */ + SYSCALL_DISPATCH_ON(glob_sel); + + ret = syscall(__NR_sysinfo, &info); + EXPECT_EQ(0, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } +} + +TEST(direct_dispatch_range) +{ + int ret = 0; + struct sysinfo info; + char sel = 0; + + /* + * Instead of calculating libc addresses; allow the entire + * memory map and lock the selector. + */ + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, -1L, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + SYSCALL_DISPATCH_ON(sel); + + ret = sysinfo(&info); + ASSERT_EQ(0, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } +} + +TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index c33a7aac27ff..b71828df5a6d 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -59,6 +59,7 @@ CONFIG_NET_IFE_SKBPRIO=m CONFIG_NET_IFE_SKBTCINDEX=m CONFIG_NET_SCH_FIFO=y CONFIG_NET_SCH_ETS=m +CONFIG_NET_SCH_RED=m # ## Network testing diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c index 7f14f0fdac84..f2519154208a 100644 --- a/tools/testing/selftests/timens/procfs.c +++ b/tools/testing/selftests/timens/procfs.c @@ -93,6 +93,33 @@ static int read_proc_uptime(struct timespec *uptime) return 0; } +static int read_proc_stat_btime(unsigned long long *boottime_sec) +{ + FILE *proc; + char line_buf[2048]; + + proc = fopen("/proc/stat", "r"); + if (proc == NULL) { + pr_perror("Unable to open /proc/stat"); + return -1; + } + + while (fgets(line_buf, 2048, proc)) { + if (sscanf(line_buf, "btime %llu", boottime_sec) != 1) + continue; + fclose(proc); + return 0; + } + if (errno) { + pr_perror("fscanf"); + fclose(proc); + return -errno; + } + pr_err("failed to parse /proc/stat"); + fclose(proc); + return -1; +} + static int check_uptime(void) { struct timespec uptime_new, uptime_old; @@ -123,18 +150,47 @@ static int check_uptime(void) return 0; } +static int check_stat_btime(void) +{ + unsigned long long btime_new, btime_old; + unsigned long long btime_expected; + + if (switch_ns(parent_ns)) + return pr_err("switch_ns(%d)", parent_ns); + + if (read_proc_stat_btime(&btime_old)) + return 1; + + if (switch_ns(child_ns)) + return pr_err("switch_ns(%d)", child_ns); + + if (read_proc_stat_btime(&btime_new)) + return 1; + + btime_expected = btime_old - TEN_DAYS_IN_SEC; + if (btime_new != btime_expected) { + pr_fail("btime in /proc/stat: old %llu, new %llu [%llu]", + btime_old, btime_new, btime_expected); + return 1; + } + + ksft_test_result_pass("Passed for /proc/stat btime\n"); + return 0; +} + int main(int argc, char *argv[]) { int ret = 0; nscheck(); - ksft_set_plan(1); + ksft_set_plan(2); if (init_namespaces()) return 1; ret |= check_uptime(); + ret |= check_stat_btime(); if (ret) ksft_exit_fail(); diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 30873b19d04b..691893afc15d 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -60,9 +60,13 @@ ifeq ($(CAN_BUILD_X86_64),1) TEST_GEN_FILES += $(BINARIES_64) endif else + +ifneq (,$(findstring $(ARCH),powerpc)) TEST_GEN_FILES += protection_keys endif +endif + ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64)) TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 9b0912a01777..c4425597769a 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -206,19 +206,19 @@ static int hugetlb_release_pages(char *rel_area) return ret; } - static void hugetlb_allocate_area(void **alloc_area) { void *area_alias = NULL; char **alloc_area_alias; + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, (map_shared ? MAP_SHARED : MAP_PRIVATE) | MAP_HUGETLB, huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) { - fprintf(stderr, "mmap of hugetlbfs file failed\n"); - *alloc_area = NULL; + perror("mmap of hugetlbfs file failed"); + goto fail; } if (map_shared) { @@ -227,14 +227,11 @@ static void hugetlb_allocate_area(void **alloc_area) huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) { - if (munmap(*alloc_area, nr_pages * page_size) < 0) { - perror("hugetlb munmap"); - exit(1); - } - *alloc_area = NULL; - return; + perror("mmap of hugetlb file alias failed"); + goto fail_munmap; } } + if (*alloc_area == area_src) { huge_fd_off0 = *alloc_area; alloc_area_alias = &area_src_alias; @@ -243,6 +240,16 @@ static void hugetlb_allocate_area(void **alloc_area) } if (area_alias) *alloc_area_alias = area_alias; + + return; + +fail_munmap: + if (munmap(*alloc_area, nr_pages * page_size) < 0) { + perror("hugetlb munmap"); + exit(1); + } +fail: + *alloc_area = NULL; } static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c index 7161cfc2e60b..8c780cce941d 100644 --- a/tools/testing/selftests/x86/fsgsbase.c +++ b/tools/testing/selftests/x86/fsgsbase.c @@ -392,8 +392,8 @@ static void set_gs_and_switch_to(unsigned long local, local = read_base(GS); /* - * Signal delivery seems to mess up weird selectors. Put it - * back. + * Signal delivery is quite likely to change a selector + * of 1, 2, or 3 back to 0 due to IRET being defective. */ asm volatile ("mov %0, %%gs" : : "rm" (force_sel)); } else { @@ -411,6 +411,14 @@ static void set_gs_and_switch_to(unsigned long local, if (base == local && sel_pre_sched == sel_post_sched) { printf("[OK]\tGS/BASE remained 0x%hx/0x%lx\n", sel_pre_sched, local); + } else if (base == local && sel_pre_sched >= 1 && sel_pre_sched <= 3 && + sel_post_sched == 0) { + /* + * IRET is misdesigned and will squash selectors 1, 2, or 3 + * to zero. Don't fail the test just because this happened. + */ + printf("[OK]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx because IRET is defective\n", + sel_pre_sched, local, sel_post_sched, base); } else { nerrs++; printf("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx\n", diff --git a/tools/testing/selftests/x86/raw_syscall_helper_32.S b/tools/testing/selftests/x86/raw_syscall_helper_32.S index 94410fa2b5ed..a10d36afdca0 100644 --- a/tools/testing/selftests/x86/raw_syscall_helper_32.S +++ b/tools/testing/selftests/x86/raw_syscall_helper_32.S @@ -45,3 +45,5 @@ int80_and_ret: .type int80_and_ret, @function .size int80_and_ret, .-int80_and_ret + +.section .note.GNU-stack,"",%progbits diff --git a/tools/testing/selftests/x86/thunks.S b/tools/testing/selftests/x86/thunks.S index 1bb5d62c16a4..a2d47d8344d4 100644 --- a/tools/testing/selftests/x86/thunks.S +++ b/tools/testing/selftests/x86/thunks.S @@ -57,3 +57,5 @@ call32_from_64: ret .size call32_from_64, .-call32_from_64 + +.section .note.GNU-stack,"",%progbits |