From ad07c8ceb6631a83b62d405a61448bba92adac68 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Thu, 10 Jan 2019 13:53:34 +0000 Subject: perf/core: Remove unused perf_flags Now that perf_flags is not used we remove it. Signed-off-by: Andrew Murray Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Mark Rutland Cc: Matt Turner Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Richard Henderson Cc: Russell King Cc: Sascha Hauer Cc: Shawn Guo Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: robin.murphy@arm.com Cc: suzuki.poulose@arm.com Link: https://lkml.kernel.org/r/1547128414-50693-13-git-send-email-andrew.murray@arm.com Signed-off-by: Ingo Molnar --- tools/include/uapi/linux/perf_event.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 9de8780ac8d9..ea19b5d491bf 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -445,8 +445,6 @@ struct perf_event_query_bpf { __u32 ids[0]; }; -#define perf_flags(attr) (*(&(attr)->read_format + 1)) - /* * Ioctls that can be done on a perf event fd: */ -- cgit v1.3-7-g2ca7 From d764ac6464915523e68e220b6aa4c3c2eb8e3f94 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:14 -0800 Subject: tools headers uapi: Sync tools/include/uapi/linux/perf_event.h Sync changes for PERF_RECORD_KSYMBOL. Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-3-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index ea19b5d491bf..1dee5c8f166b 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -372,7 +372,8 @@ struct perf_event_attr { context_switch : 1, /* context switch data */ write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ - __reserved_1 : 35; + ksymbol : 1, /* include ksymbol events */ + __reserved_1 : 34; union { __u32 wakeup_events; /* wakeup every n events */ @@ -963,9 +964,32 @@ enum perf_event_type { */ PERF_RECORD_NAMESPACES = 16, + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + PERF_RECORD_MAX, /* non-ABI */ }; +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 -- cgit v1.3-7-g2ca7 From df063c83aa2c58412ddf533ada9aaf25986120ec Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:16 -0800 Subject: tools headers uapi: Sync tools/include/uapi/linux/perf_event.h Sync for PERF_RECORD_BPF_EVENT. Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-5-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/perf_event.h | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 1dee5c8f166b..7198ddd0c6b1 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -373,7 +373,8 @@ struct perf_event_attr { write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ - __reserved_1 : 34; + bpf_event : 1, /* include bpf events */ + __reserved_1 : 33; union { __u32 wakeup_events; /* wakeup every n events */ @@ -979,6 +980,25 @@ enum perf_event_type { */ PERF_RECORD_KSYMBOL = 17, + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -990,6 +1010,13 @@ enum perf_record_ksymbol_type { #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 -- cgit v1.3-7-g2ca7 From 3aef2cad5d51ee66d2a614dd2f70cb34c74caf77 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 6 Dec 2018 11:18:13 -0800 Subject: tools: Update rbtree implementation There have been a number of changes in the kernel's rbrtee implementation, including loose lockless searching guarantees and rb_root_cached, which later patches will use as an optimization. Signed-off-by: Davidlohr Bueso Tested-by: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Namhyung Kim Link: http://lkml.kernel.org/r/20181206191819.30182-2-dave@stgolabs.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/rbtree.h | 52 ++++++++-- tools/include/linux/rbtree_augmented.h | 60 ++++++++--- tools/lib/rbtree.c | 178 +++++++++++++++++++++++++-------- 3 files changed, 229 insertions(+), 61 deletions(-) (limited to 'tools/include') diff --git a/tools/include/linux/rbtree.h b/tools/include/linux/rbtree.h index 112582253dd0..8e9ed4786269 100644 --- a/tools/include/linux/rbtree.h +++ b/tools/include/linux/rbtree.h @@ -43,13 +43,28 @@ struct rb_root { struct rb_node *rb_node; }; +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. + */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } +#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } #define rb_entry(ptr, type, member) container_of(ptr, type, member) -#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL) +#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ @@ -68,6 +83,12 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +extern void rb_insert_color_cached(struct rb_node *, + struct rb_root_cached *, bool); +extern void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); +/* Same as rb_first(), but O(1) */ +#define rb_first_cached(root) (root)->rb_leftmost + /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); @@ -75,6 +96,8 @@ extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); +extern void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) @@ -90,12 +113,29 @@ static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) - -/* - * Handy for checking that we are not deleting an entry that is - * already in a list, found in block/{blk-throttle,cfq-iosched}.c, - * probably should be moved to lib/rbtree.c... +/** + * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of + * given type allowing the backing memory of @pos to be invalidated + * + * @pos: the 'type *' to use as a loop cursor. + * @n: another 'type *' to use as temporary storage + * @root: 'rb_root *' of the rbtree. + * @field: the name of the rb_node field within 'type'. + * + * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as + * list_for_each_entry_safe() and allows the iteration to continue independent + * of changes to @pos by the body of the loop. + * + * Note, however, that it cannot handle other modifications that re-order the + * rbtree it is iterating over. This includes calling rb_erase() on @pos, as + * rb_erase() may rebalance the tree, causing us to miss some nodes. */ +#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ + for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ + pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ + typeof(*pos), field); 1; }); \ + pos = n) + static inline void rb_erase_init(struct rb_node *n, struct rb_root *root) { rb_erase(n, root); diff --git a/tools/include/linux/rbtree_augmented.h b/tools/include/linux/rbtree_augmented.h index 43be941db695..d008e1404580 100644 --- a/tools/include/linux/rbtree_augmented.h +++ b/tools/include/linux/rbtree_augmented.h @@ -44,7 +44,9 @@ struct rb_augment_callbacks { void (*rotate)(struct rb_node *old, struct rb_node *new); }; -extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, +extern void __rb_insert_augmented(struct rb_node *node, + struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. @@ -60,7 +62,16 @@ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - __rb_insert_augmented(node, root, augment->rotate); + __rb_insert_augmented(node, root, false, NULL, augment->rotate); +} + +static inline void +rb_insert_augmented_cached(struct rb_node *node, + struct rb_root_cached *root, bool newleft, + const struct rb_augment_callbacks *augment) +{ + __rb_insert_augmented(node, &root->rb_root, + newleft, &root->rb_leftmost, augment->rotate); } #define RB_DECLARE_CALLBACKS(rbstatic, rbname, rbstruct, rbfield, \ @@ -93,7 +104,9 @@ rbname ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ old->rbaugmented = rbcompute(old); \ } \ rbstatic const struct rb_augment_callbacks rbname = { \ - rbname ## _propagate, rbname ## _copy, rbname ## _rotate \ + .propagate = rbname ## _propagate, \ + .copy = rbname ## _copy, \ + .rotate = rbname ## _rotate \ }; @@ -126,11 +139,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new, { if (parent) { if (parent->rb_left == old) - parent->rb_left = new; + WRITE_ONCE(parent->rb_left, new); else - parent->rb_right = new; + WRITE_ONCE(parent->rb_right, new); } else - root->rb_node = new; + WRITE_ONCE(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, @@ -138,12 +151,17 @@ extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, + struct rb_node **leftmost, const struct rb_augment_callbacks *augment) { - struct rb_node *child = node->rb_right, *tmp = node->rb_left; + struct rb_node *child = node->rb_right; + struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; + if (leftmost && node == *leftmost) + *leftmost = rb_next(node); + if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) @@ -170,6 +188,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, tmp = parent; } else { struct rb_node *successor = child, *child2; + tmp = child->rb_left; if (!tmp) { /* @@ -183,6 +202,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, */ parent = successor; child2 = successor->rb_right; + augment->copy(node, successor); } else { /* @@ -204,19 +224,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root, successor = tmp; tmp = tmp->rb_left; } while (tmp); - parent->rb_left = child2 = successor->rb_right; - successor->rb_right = child; + child2 = successor->rb_right; + WRITE_ONCE(parent->rb_left, child2); + WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); + augment->copy(node, successor); augment->propagate(parent, successor); } - successor->rb_left = tmp = node->rb_left; + tmp = node->rb_left; + WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); + if (child2) { successor->__rb_parent_color = pc; rb_set_parent_color(child2, parent, RB_BLACK); @@ -237,9 +261,21 @@ static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { - struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); + struct rb_node *rebalance = __rb_erase_augmented(node, root, + NULL, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } -#endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */ +static __always_inline void +rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, + const struct rb_augment_callbacks *augment) +{ + struct rb_node *rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, + augment); + if (rebalance) + __rb_erase_color(rebalance, &root->rb_root, augment->rotate); +} + +#endif /* _TOOLS_LINUX_RBTREE_AUGMENTED_H */ diff --git a/tools/lib/rbtree.c b/tools/lib/rbtree.c index 17c2b596f043..904adb70a4f0 100644 --- a/tools/lib/rbtree.c +++ b/tools/lib/rbtree.c @@ -22,6 +22,7 @@ */ #include +#include /* * red-black trees properties: http://en.wikipedia.org/wiki/Rbtree @@ -43,6 +44,30 @@ * parentheses and have some accompanying text comment. */ +/* + * Notes on lockless lookups: + * + * All stores to the tree structure (rb_left and rb_right) must be done using + * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the + * tree structure as seen in program order. + * + * These two requirements will allow lockless iteration of the tree -- not + * correct iteration mind you, tree rotations are not atomic so a lookup might + * miss entire subtrees. + * + * But they do guarantee that any such traversal will only see valid elements + * and that it will indeed complete -- does not get stuck in a loop. + * + * It also guarantees that if the lookup returns an element it is the 'correct' + * one. But not returning an element does _NOT_ mean it's not present. + * + * NOTE: + * + * Stores to __rb_parent_color are not important for simple lookups so those + * are left undone as of now. Nor did I check for loops involving parent + * pointers. + */ + static inline void rb_set_black(struct rb_node *rb) { rb->__rb_parent_color |= RB_BLACK; @@ -70,22 +95,35 @@ __rb_rotate_set_parents(struct rb_node *old, struct rb_node *new, static __always_inline void __rb_insert(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { struct rb_node *parent = rb_red_parent(node), *gparent, *tmp; + if (newleft) + *leftmost = node; + while (true) { /* - * Loop invariant: node is red - * - * If there is a black parent, we are done. - * Otherwise, take some corrective action as we don't - * want a red root or two consecutive red nodes. + * Loop invariant: node is red. */ - if (!parent) { + if (unlikely(!parent)) { + /* + * The inserted node is root. Either this is the + * first node, or we recursed at Case 1 below and + * are no longer violating 4). + */ rb_set_parent_color(node, NULL, RB_BLACK); break; - } else if (rb_is_black(parent)) + } + + /* + * If there is a black parent, we are done. + * Otherwise, take some corrective action as, + * per 4), we don't want a red root or two + * consecutive red nodes. + */ + if(rb_is_black(parent)) break; gparent = rb_red_parent(parent); @@ -94,7 +132,7 @@ __rb_insert(struct rb_node *node, struct rb_root *root, if (parent != tmp) { /* parent == gparent->rb_left */ if (tmp && rb_is_red(tmp)) { /* - * Case 1 - color flips + * Case 1 - node's uncle is red (color flips). * * G g * / \ / \ @@ -117,7 +155,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, tmp = parent->rb_right; if (node == tmp) { /* - * Case 2 - left rotate at parent + * Case 2 - node's uncle is black and node is + * the parent's right child (left rotate at parent). * * G G * / \ / \ @@ -128,8 +167,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, * This still leaves us in violation of 4), the * continuation into Case 3 will fix that. */ - parent->rb_right = tmp = node->rb_left; - node->rb_left = parent; + tmp = node->rb_left; + WRITE_ONCE(parent->rb_right, tmp); + WRITE_ONCE(node->rb_left, parent); if (tmp) rb_set_parent_color(tmp, parent, RB_BLACK); @@ -140,7 +180,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } /* - * Case 3 - right rotate at gparent + * Case 3 - node's uncle is black and node is + * the parent's left child (right rotate at gparent). * * G P * / \ / \ @@ -148,8 +189,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, * / \ * n U */ - gparent->rb_left = tmp; /* == parent->rb_right */ - parent->rb_right = gparent; + WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */ + WRITE_ONCE(parent->rb_right, gparent); if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); @@ -170,8 +211,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root, tmp = parent->rb_left; if (node == tmp) { /* Case 2 - right rotate at parent */ - parent->rb_left = tmp = node->rb_right; - node->rb_right = parent; + tmp = node->rb_right; + WRITE_ONCE(parent->rb_left, tmp); + WRITE_ONCE(node->rb_right, parent); if (tmp) rb_set_parent_color(tmp, parent, RB_BLACK); @@ -182,8 +224,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root, } /* Case 3 - left rotate at gparent */ - gparent->rb_right = tmp; /* == parent->rb_left */ - parent->rb_left = gparent; + WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */ + WRITE_ONCE(parent->rb_left, gparent); if (tmp) rb_set_parent_color(tmp, gparent, RB_BLACK); __rb_rotate_set_parents(gparent, parent, root, RB_RED); @@ -223,8 +265,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * / \ / \ * Sl Sr N Sl */ - parent->rb_right = tmp1 = sibling->rb_left; - sibling->rb_left = parent; + tmp1 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp1); + WRITE_ONCE(sibling->rb_left, parent); rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); @@ -268,15 +311,31 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * * (p) (p) * / \ / \ - * N S --> N Sl + * N S --> N sl * / \ \ - * sl Sr s + * sl Sr S * \ * Sr + * + * Note: p might be red, and then both + * p and sl are red after rotation(which + * breaks property 4). This is fixed in + * Case 4 (in __rb_rotate_set_parents() + * which set sl the color of p + * and set p RB_BLACK) + * + * (p) (sl) + * / \ / \ + * N sl --> P S + * \ / \ + * S N Sr + * \ + * Sr */ - sibling->rb_left = tmp1 = tmp2->rb_right; - tmp2->rb_right = sibling; - parent->rb_right = tmp2; + tmp1 = tmp2->rb_right; + WRITE_ONCE(sibling->rb_left, tmp1); + WRITE_ONCE(tmp2->rb_right, sibling); + WRITE_ONCE(parent->rb_right, tmp2); if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); @@ -296,8 +355,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, * / \ / \ * (sl) sr N (sl) */ - parent->rb_right = tmp2 = sibling->rb_left; - sibling->rb_left = parent; + tmp2 = sibling->rb_left; + WRITE_ONCE(parent->rb_right, tmp2); + WRITE_ONCE(sibling->rb_left, parent); rb_set_parent_color(tmp1, sibling, RB_BLACK); if (tmp2) rb_set_parent(tmp2, parent); @@ -309,8 +369,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, sibling = parent->rb_left; if (rb_is_red(sibling)) { /* Case 1 - right rotate at parent */ - parent->rb_left = tmp1 = sibling->rb_right; - sibling->rb_right = parent; + tmp1 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp1); + WRITE_ONCE(sibling->rb_right, parent); rb_set_parent_color(tmp1, parent, RB_BLACK); __rb_rotate_set_parents(parent, sibling, root, RB_RED); @@ -334,10 +395,11 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, } break; } - /* Case 3 - right rotate at sibling */ - sibling->rb_right = tmp1 = tmp2->rb_left; - tmp2->rb_left = sibling; - parent->rb_left = tmp2; + /* Case 3 - left rotate at sibling */ + tmp1 = tmp2->rb_left; + WRITE_ONCE(sibling->rb_right, tmp1); + WRITE_ONCE(tmp2->rb_left, sibling); + WRITE_ONCE(parent->rb_left, tmp2); if (tmp1) rb_set_parent_color(tmp1, sibling, RB_BLACK); @@ -345,9 +407,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root, tmp1 = sibling; sibling = tmp2; } - /* Case 4 - left rotate at parent + color flips */ - parent->rb_left = tmp2 = sibling->rb_right; - sibling->rb_right = parent; + /* Case 4 - right rotate at parent + color flips */ + tmp2 = sibling->rb_right; + WRITE_ONCE(parent->rb_left, tmp2); + WRITE_ONCE(sibling->rb_right, parent); rb_set_parent_color(tmp1, sibling, RB_BLACK); if (tmp2) rb_set_parent(tmp2, parent); @@ -378,22 +441,41 @@ static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} static const struct rb_augment_callbacks dummy_callbacks = { - dummy_propagate, dummy_copy, dummy_rotate + .propagate = dummy_propagate, + .copy = dummy_copy, + .rotate = dummy_rotate }; void rb_insert_color(struct rb_node *node, struct rb_root *root) { - __rb_insert(node, root, dummy_rotate); + __rb_insert(node, root, false, NULL, dummy_rotate); } void rb_erase(struct rb_node *node, struct rb_root *root) { struct rb_node *rebalance; - rebalance = __rb_erase_augmented(node, root, &dummy_callbacks); + rebalance = __rb_erase_augmented(node, root, + NULL, &dummy_callbacks); if (rebalance) ____rb_erase_color(rebalance, root, dummy_rotate); } +void rb_insert_color_cached(struct rb_node *node, + struct rb_root_cached *root, bool leftmost) +{ + __rb_insert(node, &root->rb_root, leftmost, + &root->rb_leftmost, dummy_rotate); +} + +void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) +{ + struct rb_node *rebalance; + rebalance = __rb_erase_augmented(node, &root->rb_root, + &root->rb_leftmost, &dummy_callbacks); + if (rebalance) + ____rb_erase_color(rebalance, &root->rb_root, dummy_rotate); +} + /* * Augmented rbtree manipulation functions. * @@ -402,9 +484,10 @@ void rb_erase(struct rb_node *node, struct rb_root *root) */ void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, + bool newleft, struct rb_node **leftmost, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)) { - __rb_insert(node, root, augment_rotate); + __rb_insert(node, root, newleft, leftmost, augment_rotate); } /* @@ -498,15 +581,24 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, { struct rb_node *parent = rb_parent(victim); + /* Copy the pointers/colour from the victim to the replacement */ + *new = *victim; + /* Set the surrounding nodes to point to the replacement */ - __rb_change_child(victim, new, parent, root); if (victim->rb_left) rb_set_parent(victim->rb_left, new); if (victim->rb_right) rb_set_parent(victim->rb_right, new); + __rb_change_child(victim, new, parent, root); +} - /* Copy the pointers/colour from the victim to the replacement */ - *new = *victim; +void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, + struct rb_root_cached *root) +{ + rb_replace_node(victim, new, &root->rb_root); + + if (root->rb_leftmost == victim) + root->rb_leftmost = new; } static struct rb_node *rb_left_deepest_node(const struct rb_node *node) -- cgit v1.3-7-g2ca7 From 30ca20517ac136e63967396899af89f359f16f36 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sat, 29 Dec 2018 19:04:53 +0100 Subject: tools headers: Move the nolibc header from rcutorture to tools/include/nolibc/ As suggested by Ingo, this header file might benefit other tools than just rcutorture. For now it's quite limited, but is easy to extend, so exposing it into tools/include/nolibc/ will make it much easier to adopt by other tools. The mkinitrd.sh script in rcutorture was updated to use this new location. Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Cc: Paul E. McKenney Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/nolibc.h | 2263 ++++++++++++++++++++ tools/testing/selftests/rcutorture/bin/mkinitrd.sh | 4 +- tools/testing/selftests/rcutorture/bin/nolibc.h | 2263 -------------------- 3 files changed, 2265 insertions(+), 2265 deletions(-) create mode 100644 tools/include/nolibc/nolibc.h delete mode 100644 tools/testing/selftests/rcutorture/bin/nolibc.h (limited to 'tools/include') diff --git a/tools/include/nolibc/nolibc.h b/tools/include/nolibc/nolibc.h new file mode 100644 index 000000000000..1708e9f9f8aa --- /dev/null +++ b/tools/include/nolibc/nolibc.h @@ -0,0 +1,2263 @@ +/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ +/* nolibc.h + * Copyright (C) 2017-2018 Willy Tarreau + */ + +/* + * This file is designed to be used as a libc alternative for minimal programs + * with very limited requirements. It consists of a small number of syscall and + * type definitions, and the minimal startup code needed to call main(). + * All syscalls are declared as static functions so that they can be optimized + * away by the compiler when not used. + * + * Syscalls are split into 3 levels: + * - The lower level is the arch-specific syscall() definition, consisting in + * assembly code in compound expressions. These are called my_syscall0() to + * my_syscall6() depending on the number of arguments. The MIPS + * implementation is limited to 5 arguments. All input arguments are cast + * to a long stored in a register. These expressions always return the + * syscall's return value as a signed long value which is often either a + * pointer or the negated errno value. + * + * - The second level is mostly architecture-independent. It is made of + * static functions called sys_() which rely on my_syscallN() + * depending on the syscall definition. These functions are responsible + * for exposing the appropriate types for the syscall arguments (int, + * pointers, etc) and for setting the appropriate return type (often int). + * A few of them are architecture-specific because the syscalls are not all + * mapped exactly the same among architectures. For example, some archs do + * not implement select() and need pselect6() instead, so the sys_select() + * function will have to abstract this. + * + * - The third level is the libc call definition. It exposes the lower raw + * sys_() calls in a way that looks like what a libc usually does, + * takes care of specific input values, and of setting errno upon error. + * There can be minor variations compared to standard libc calls. For + * example the open() call always takes 3 args here. + * + * The errno variable is declared static and unused. This way it can be + * optimized away if not used. However this means that a program made of + * multiple C files may observe different errno values (one per C file). For + * the type of programs this project targets it usually is not a problem. The + * resulting program may even be reduced by defining the NOLIBC_IGNORE_ERRNO + * macro, in which case the errno value will never be assigned. + * + * Some stdint-like integer types are defined. These are valid on all currently + * supported architectures, because signs are enforced, ints are assumed to be + * 32 bits, longs the size of a pointer and long long 64 bits. If more + * architectures have to be supported, this may need to be adapted. + * + * Some macro definitions like the O_* values passed to open(), and some + * structures like the sys_stat struct depend on the architecture. + * + * The definitions start with the architecture-specific parts, which are picked + * based on what the compiler knows about the target architecture, and are + * completed with the generic code. Since it is the compiler which sets the + * target architecture, cross-compiling normally works out of the box without + * having to specify anything. + * + * Finally some very common libc-level functions are provided. It is the case + * for a few functions usually found in string.h, ctype.h, or stdlib.h. Nothing + * is currently provided regarding stdio emulation. + * + * The macro NOLIBC is always defined, so that it is possible for a program to + * check this macro to know if it is being built against and decide to disable + * some features or simply not to include some standard libc files. + * + * Ideally this file should be split in multiple files for easier long term + * maintenance, but provided as a single file as it is now, it's quite + * convenient to use. Maybe some variations involving a set of includes at the + * top could work. + * + * A simple static executable may be built this way : + * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ + * -static -include nolibc.h -lgcc -o hello hello.c + * + * A very useful calling convention table may be found here : + * http://man7.org/linux/man-pages/man2/syscall.2.html + * + * This doc is quite convenient though not necessarily up to date : + * https://w3challs.com/syscalls/ + * + */ + +/* Some archs (at least aarch64) don't expose the regular syscalls anymore by + * default, either because they have an "_at" replacement, or because there are + * more modern alternatives. For now we'd rather still use them. + */ +#define __ARCH_WANT_SYSCALL_NO_AT +#define __ARCH_WANT_SYSCALL_NO_FLAGS +#define __ARCH_WANT_SYSCALL_DEPRECATED + +#include +#include +#include +#include +#include + +#define NOLIBC + +/* this way it will be removed if unused */ +static int errno; + +#ifndef NOLIBC_IGNORE_ERRNO +#define SET_ERRNO(v) do { errno = (v); } while (0) +#else +#define SET_ERRNO(v) do { } while (0) +#endif + +/* errno codes all ensure that they will not conflict with a valid pointer + * because they all correspond to the highest addressable memry page. + */ +#define MAX_ERRNO 4095 + +/* Declare a few quite common macros and types that usually are in stdlib.h, + * stdint.h, ctype.h, unistd.h and a few other common locations. + */ + +#define NULL ((void *)0) + +/* stdint types */ +typedef unsigned char uint8_t; +typedef signed char int8_t; +typedef unsigned short uint16_t; +typedef signed short int16_t; +typedef unsigned int uint32_t; +typedef signed int int32_t; +typedef unsigned long long uint64_t; +typedef signed long long int64_t; +typedef unsigned long size_t; +typedef signed long ssize_t; +typedef unsigned long uintptr_t; +typedef signed long intptr_t; +typedef signed long ptrdiff_t; + +/* for stat() */ +typedef unsigned int dev_t; +typedef unsigned long ino_t; +typedef unsigned int mode_t; +typedef signed int pid_t; +typedef unsigned int uid_t; +typedef unsigned int gid_t; +typedef unsigned long nlink_t; +typedef signed long off_t; +typedef signed long blksize_t; +typedef signed long blkcnt_t; +typedef signed long time_t; + +/* for poll() */ +struct pollfd { + int fd; + short int events; + short int revents; +}; + +/* for select() */ +struct timeval { + long tv_sec; + long tv_usec; +}; + +/* for pselect() */ +struct timespec { + long tv_sec; + long tv_nsec; +}; + +/* for gettimeofday() */ +struct timezone { + int tz_minuteswest; + int tz_dsttime; +}; + +/* for getdents64() */ +struct linux_dirent64 { + uint64_t d_ino; + int64_t d_off; + unsigned short d_reclen; + unsigned char d_type; + char d_name[]; +}; + +/* commonly an fd_set represents 256 FDs */ +#define FD_SETSIZE 256 +typedef struct { uint32_t fd32[FD_SETSIZE/32]; } fd_set; + +/* needed by wait4() */ +struct rusage { + struct timeval ru_utime; + struct timeval ru_stime; + long ru_maxrss; + long ru_ixrss; + long ru_idrss; + long ru_isrss; + long ru_minflt; + long ru_majflt; + long ru_nswap; + long ru_inblock; + long ru_oublock; + long ru_msgsnd; + long ru_msgrcv; + long ru_nsignals; + long ru_nvcsw; + long ru_nivcsw; +}; + +/* stat flags (WARNING, octal here) */ +#define S_IFDIR 0040000 +#define S_IFCHR 0020000 +#define S_IFBLK 0060000 +#define S_IFREG 0100000 +#define S_IFIFO 0010000 +#define S_IFLNK 0120000 +#define S_IFSOCK 0140000 +#define S_IFMT 0170000 + +#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) +#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) +#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) +#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) +#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) +#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) +#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) + +#define DT_UNKNOWN 0 +#define DT_FIFO 1 +#define DT_CHR 2 +#define DT_DIR 4 +#define DT_BLK 6 +#define DT_REG 8 +#define DT_LNK 10 +#define DT_SOCK 12 + +/* all the *at functions */ +#ifndef AT_FDWCD +#define AT_FDCWD -100 +#endif + +/* lseek */ +#define SEEK_SET 0 +#define SEEK_CUR 1 +#define SEEK_END 2 + +/* reboot */ +#define LINUX_REBOOT_MAGIC1 0xfee1dead +#define LINUX_REBOOT_MAGIC2 0x28121969 +#define LINUX_REBOOT_CMD_HALT 0xcdef0123 +#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc +#define LINUX_REBOOT_CMD_RESTART 0x01234567 +#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 + + +/* The format of the struct as returned by the libc to the application, which + * significantly differs from the format returned by the stat() syscall flavours. + */ +struct stat { + dev_t st_dev; /* ID of device containing file */ + ino_t st_ino; /* inode number */ + mode_t st_mode; /* protection */ + nlink_t st_nlink; /* number of hard links */ + uid_t st_uid; /* user ID of owner */ + gid_t st_gid; /* group ID of owner */ + dev_t st_rdev; /* device ID (if special file) */ + off_t st_size; /* total size, in bytes */ + blksize_t st_blksize; /* blocksize for file system I/O */ + blkcnt_t st_blocks; /* number of 512B blocks allocated */ + time_t st_atime; /* time of last access */ + time_t st_mtime; /* time of last modification */ + time_t st_ctime; /* time of last status change */ +}; + +#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) +#define WIFEXITED(status) (((status) & 0x7f) == 0) + + +/* Below comes the architecture-specific code. For each architecture, we have + * the syscall declarations and the _start code definition. This is the only + * global part. On all architectures the kernel puts everything in the stack + * before jumping to _start just above us, without any return address (_start + * is not a function but an entry pint). So at the stack pointer we find argc. + * Then argv[] begins, and ends at the first NULL. Then we have envp which + * starts and ends with a NULL as well. So envp=argv+argc+1. + */ + +#if defined(__x86_64__) +/* Syscalls for x86_64 : + * - registers are 64-bit + * - syscall number is passed in rax + * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively + * - the system call is performed by calling the syscall instruction + * - syscall return comes in rax + * - rcx and r8..r11 may be clobbered, others are preserved. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + */ + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "rcx", "r8", "r9", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + register long _arg5 asm("r8") = (long)(arg5); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "rcx", "r9", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + long _ret; \ + register long _num asm("rax") = (num); \ + register long _arg1 asm("rdi") = (long)(arg1); \ + register long _arg2 asm("rsi") = (long)(arg2); \ + register long _arg3 asm("rdx") = (long)(arg3); \ + register long _arg4 asm("r10") = (long)(arg4); \ + register long _arg5 asm("r8") = (long)(arg5); \ + register long _arg6 asm("r9") = (long)(arg6); \ + \ + asm volatile ( \ + "syscall\n" \ + : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "0"(_num) \ + : "rcx", "r11", "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "pop %rdi\n" // argc (first arg, %rdi) + "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) + "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) + "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned when + "sub $8, %rsp\n" // entering the callee + "call main\n" // main() returns the status code, we'll exit with it. + "movzb %al, %rdi\n" // retrieve exit code from 8 lower bits + "mov $60, %rax\n" // NR_exit == 60 + "syscall\n" // really exit + "hlt\n" // ensure it does not return + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, equivalent to stat64(). The + * syscall returns 116 bytes and stops in the middle of __unused. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned int st_mode; + unsigned int st_uid; + + unsigned int st_gid; + unsigned int __pad0; + unsigned long st_rdev; + long st_size; + long st_blksize; + + long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + long __unused[3]; +}; + +#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) +/* Syscalls for i386 : + * - mostly similar to x86_64 + * - registers are 32-bit + * - syscall number is passed in eax + * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively + * - all registers are preserved (except eax of course) + * - the system call is performed by calling int $0x80 + * - syscall return comes in eax + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, i386 supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + register long _arg4 asm("esi") = (long)(arg4); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + long _ret; \ + register long _num asm("eax") = (num); \ + register long _arg1 asm("ebx") = (long)(arg1); \ + register long _arg2 asm("ecx") = (long)(arg2); \ + register long _arg3 asm("edx") = (long)(arg3); \ + register long _arg4 asm("esi") = (long)(arg4); \ + register long _arg5 asm("edi") = (long)(arg5); \ + \ + asm volatile ( \ + "int $0x80\n" \ + : "=a" (_ret) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "0"(_num) \ + : "memory", "cc" \ + ); \ + _ret; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "pop %eax\n" // argc (first arg, %eax) + "mov %esp, %ebx\n" // argv[] (second arg, %ebx) + "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) + "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned when + "push %ecx\n" // push all registers on the stack so that we + "push %ebx\n" // support both regparm and plain stack modes + "push %eax\n" + "call main\n" // main() returns the status code in %eax + "movzbl %al, %ebx\n" // retrieve exit code from lower 8 bits + "movl $1, %eax\n" // NR_exit == 1 + "int $0x80\n" // exit now + "hlt\n" // ensure it does not + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + + unsigned long st_rdev; + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +#elif defined(__ARM_EABI__) +/* Syscalls for ARM in ARM or Thumb modes : + * - registers are 32-bit + * - stack is 8-byte aligned + * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) + * - syscall number is passed in r7 + * - arguments are in r0, r1, r2, r3, r4, r5 + * - the system call is performed by calling svc #0 + * - syscall return comes in r0. + * - only lr is clobbered. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * - the syscall number is always specified last in order to allow to force + * some registers before (gcc refuses a %-register at the last position). + * + * Also, ARM supports the old_select syscall if newselect is not available + */ +#define __ARCH_WANT_SYS_OLD_SELECT + +#define my_syscall0(num) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0"); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + register long _arg4 asm("r3") = (long)(arg4); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("r7") = (num); \ + register long _arg1 asm("r0") = (long)(arg1); \ + register long _arg2 asm("r1") = (long)(arg2); \ + register long _arg3 asm("r2") = (long)(arg3); \ + register long _arg4 asm("r3") = (long)(arg4); \ + register long _arg5 asm("r4") = (long)(arg5); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc", "lr" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" +#if defined(__THUMBEB__) || defined(__THUMBEL__) + /* We enter here in 32-bit mode but if some previous functions were in + * 16-bit mode, the assembler cannot know, so we need to tell it we're in + * 32-bit now, then switch to 16-bit (is there a better way to do it than + * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that + * it generates correct instructions. Note that we do not support thumb1. + */ + ".code 32\n" + "add r0, pc, #1\n" + "bx r0\n" + ".code 16\n" +#endif + "pop {%r0}\n" // argc was in the stack + "mov %r1, %sp\n" // argv = sp + "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... + "add %r2, %r2, $4\n" // ... + 4 + "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the + "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) + "bl main\n" // main() returns the status code, we'll exit with it. + "and %r0, %r0, $0xff\n" // limit exit code to 8 bits + "movs r7, $1\n" // NR_exit == 1 + "svc $0x00\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the stat() syscall, 32-bit only, the syscall returns + * exactly 56 bytes (stops before the unused array). In big endian, the format + * differs as devices are returned as short only. + */ +struct sys_stat_struct { +#if defined(__ARMEB__) + unsigned short st_dev; + unsigned short __pad1; +#else + unsigned long st_dev; +#endif + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; +#if defined(__ARMEB__) + unsigned short st_rdev; + unsigned short __pad2; +#else + unsigned long st_rdev; +#endif + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime; + unsigned long st_atime_nsec; + unsigned long st_mtime; + unsigned long st_mtime_nsec; + unsigned long st_ctime; + unsigned long st_ctime_nsec; + unsigned long __unused[2]; +}; + +#elif defined(__aarch64__) +/* Syscalls for AARCH64 : + * - registers are 64-bit + * - stack is 16-byte aligned + * - syscall number is passed in x8 + * - arguments are in x0, x1, x2, x3, x4, x5 + * - the system call is performed by calling svc 0 + * - syscall return comes in x0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + * + * On aarch64, select() is not implemented so we have to use pselect6(). + */ +#define __ARCH_WANT_SYS_PSELECT6 + +#define my_syscall0(num) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0"); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r"(_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + register long _arg5 asm("x4") = (long)(arg5); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ +({ \ + register long _num asm("x8") = (num); \ + register long _arg1 asm("x0") = (long)(arg1); \ + register long _arg2 asm("x1") = (long)(arg2); \ + register long _arg3 asm("x2") = (long)(arg3); \ + register long _arg4 asm("x3") = (long)(arg4); \ + register long _arg5 asm("x4") = (long)(arg5); \ + register long _arg6 asm("x5") = (long)(arg6); \ + \ + asm volatile ( \ + "svc #0\n" \ + : "=r" (_arg1) \ + : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ + "r"(_arg6), "r"(_num) \ + : "memory", "cc" \ + ); \ + _arg1; \ +}) + +/* startup code */ +asm(".section .text\n" + ".global _start\n" + "_start:\n" + "ldr x0, [sp]\n" // argc (x0) was in the stack + "add x1, sp, 8\n" // argv (x1) = sp + "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... + "add x2, x2, 8\n" // + 8 (skip null) + "add x2, x2, x1\n" // + argv + "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee + "bl main\n" // main() returns the status code, we'll exit with it. + "and x0, x0, 0xff\n" // limit exit code to 8 bits + "mov x8, 93\n" // NR_exit == 93 + "svc #0\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_CREAT 0x40 +#define O_EXCL 0x80 +#define O_NOCTTY 0x100 +#define O_TRUNC 0x200 +#define O_APPEND 0x400 +#define O_NONBLOCK 0x800 +#define O_DIRECTORY 0x4000 + +/* The struct returned by the newfstatat() syscall. Differs slightly from the + * x86_64's stat one by field ordering, so be careful. + */ +struct sys_stat_struct { + unsigned long st_dev; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + + unsigned long st_rdev; + unsigned long __pad1; + long st_size; + int st_blksize; + int __pad2; + + long st_blocks; + long st_atime; + unsigned long st_atime_nsec; + long st_mtime; + + unsigned long st_mtime_nsec; + long st_ctime; + unsigned long st_ctime_nsec; + unsigned int __unused[2]; +}; + +#elif defined(__mips__) && defined(_ABIO32) +/* Syscalls for MIPS ABI O32 : + * - WARNING! there's always a delayed slot! + * - WARNING again, the syntax is different, registers take a '$' and numbers + * do not. + * - registers are 32-bit + * - stack is 8-byte aligned + * - syscall number is passed in v0 (starts at 0xfa0). + * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to + * leave some room in the stack for the callee to save a0..a3 if needed. + * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are + * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as + * scall32-o32.S in the kernel sources. + * - the system call is performed by calling "syscall" + * - syscall return comes in v0, and register a3 needs to be checked to know + * if an error occured, in which case errno is in v0. + * - the arguments are cast to long and assigned into the target registers + * which are then simply passed as registers to the asm code, so that we + * don't have to experience issues with register constraints. + */ + +#define my_syscall0(num) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "r"(_num) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall1(num, arg1) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall2(num, arg1, arg2) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall3(num, arg1, arg2, arg3) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3"); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r"(_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall4(num, arg1, arg2, arg3, arg4) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "syscall\n" \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ +({ \ + register long _num asm("v0") = (num); \ + register long _arg1 asm("a0") = (long)(arg1); \ + register long _arg2 asm("a1") = (long)(arg2); \ + register long _arg3 asm("a2") = (long)(arg3); \ + register long _arg4 asm("a3") = (long)(arg4); \ + register long _arg5 = (long)(arg5); \ + \ + asm volatile ( \ + "addiu $sp, $sp, -32\n" \ + "sw %7, 16($sp)\n" \ + "syscall\n " \ + "addiu $sp, $sp, 32\n" \ + : "=r" (_num), "=r"(_arg4) \ + : "0"(_num), \ + "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ + : "memory", "cc", "at", "v1", "hi", "lo", \ + "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ + ); \ + _arg4 ? -_num : _num; \ +}) + +/* startup code, note that it's called __start on MIPS */ +asm(".section .text\n" + ".set nomips16\n" + ".global __start\n" + ".set noreorder\n" + ".option pic0\n" + ".ent __start\n" + "__start:\n" + "lw $a0,($sp)\n" // argc was in the stack + "addiu $a1, $sp, 4\n" // argv = sp + 4 + "sll $a2, $a0, 2\n" // a2 = argc * 4 + "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... + "addiu $a2, $a2, 4\n" // ... + 4 + "li $t0, -8\n" + "and $sp, $sp, $t0\n" // sp must be 8-byte aligned + "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! + "jal main\n" // main() returns the status code, we'll exit with it. + "nop\n" // delayed slot + "and $a0, $v0, 0xff\n" // limit exit code to 8 bits + "li $v0, 4001\n" // NR_exit == 4001 + "syscall\n" + ".end __start\n" + ""); + +/* fcntl / open */ +#define O_RDONLY 0 +#define O_WRONLY 1 +#define O_RDWR 2 +#define O_APPEND 0x0008 +#define O_NONBLOCK 0x0080 +#define O_CREAT 0x0100 +#define O_TRUNC 0x0200 +#define O_EXCL 0x0400 +#define O_NOCTTY 0x0800 +#define O_DIRECTORY 0x10000 + +/* The struct returned by the stat() syscall. 88 bytes are returned by the + * syscall. + */ +struct sys_stat_struct { + unsigned int st_dev; + long st_pad1[3]; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + unsigned int st_rdev; + long st_pad2[2]; + long st_size; + long st_pad3; + long st_atime; + long st_atime_nsec; + long st_mtime; + long st_mtime_nsec; + long st_ctime; + long st_ctime_nsec; + long st_blksize; + long st_blocks; + long st_pad4[14]; +}; + +#endif + + +/* Below are the C functions used to declare the raw syscalls. They try to be + * architecture-agnostic, and return either a success or -errno. Declaring them + * static will lead to them being inlined in most cases, but it's still possible + * to reference them by a pointer if needed. + */ +static __attribute__((unused)) +void *sys_brk(void *addr) +{ + return (void *)my_syscall1(__NR_brk, addr); +} + +static __attribute__((noreturn,unused)) +void sys_exit(int status) +{ + my_syscall1(__NR_exit, status & 255); + while(1); // shut the "noreturn" warnings. +} + +static __attribute__((unused)) +int sys_chdir(const char *path) +{ + return my_syscall1(__NR_chdir, path); +} + +static __attribute__((unused)) +int sys_chmod(const char *path, mode_t mode) +{ +#ifdef __NR_fchmodat + return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); +#else + return my_syscall2(__NR_chmod, path, mode); +#endif +} + +static __attribute__((unused)) +int sys_chown(const char *path, uid_t owner, gid_t group) +{ +#ifdef __NR_fchownat + return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); +#else + return my_syscall3(__NR_chown, path, owner, group); +#endif +} + +static __attribute__((unused)) +int sys_chroot(const char *path) +{ + return my_syscall1(__NR_chroot, path); +} + +static __attribute__((unused)) +int sys_close(int fd) +{ + return my_syscall1(__NR_close, fd); +} + +static __attribute__((unused)) +int sys_dup(int fd) +{ + return my_syscall1(__NR_dup, fd); +} + +static __attribute__((unused)) +int sys_dup2(int old, int new) +{ + return my_syscall2(__NR_dup2, old, new); +} + +static __attribute__((unused)) +int sys_execve(const char *filename, char *const argv[], char *const envp[]) +{ + return my_syscall3(__NR_execve, filename, argv, envp); +} + +static __attribute__((unused)) +pid_t sys_fork(void) +{ + return my_syscall0(__NR_fork); +} + +static __attribute__((unused)) +int sys_fsync(int fd) +{ + return my_syscall1(__NR_fsync, fd); +} + +static __attribute__((unused)) +int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + return my_syscall3(__NR_getdents64, fd, dirp, count); +} + +static __attribute__((unused)) +pid_t sys_getpgrp(void) +{ + return my_syscall0(__NR_getpgrp); +} + +static __attribute__((unused)) +pid_t sys_getpid(void) +{ + return my_syscall0(__NR_getpid); +} + +static __attribute__((unused)) +int sys_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + return my_syscall2(__NR_gettimeofday, tv, tz); +} + +static __attribute__((unused)) +int sys_ioctl(int fd, unsigned long req, void *value) +{ + return my_syscall3(__NR_ioctl, fd, req, value); +} + +static __attribute__((unused)) +int sys_kill(pid_t pid, int signal) +{ + return my_syscall2(__NR_kill, pid, signal); +} + +static __attribute__((unused)) +int sys_link(const char *old, const char *new) +{ +#ifdef __NR_linkat + return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); +#else + return my_syscall2(__NR_link, old, new); +#endif +} + +static __attribute__((unused)) +off_t sys_lseek(int fd, off_t offset, int whence) +{ + return my_syscall3(__NR_lseek, fd, offset, whence); +} + +static __attribute__((unused)) +int sys_mkdir(const char *path, mode_t mode) +{ +#ifdef __NR_mkdirat + return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); +#else + return my_syscall2(__NR_mkdir, path, mode); +#endif +} + +static __attribute__((unused)) +long sys_mknod(const char *path, mode_t mode, dev_t dev) +{ +#ifdef __NR_mknodat + return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); +#else + return my_syscall3(__NR_mknod, path, mode, dev); +#endif +} + +static __attribute__((unused)) +int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return my_syscall5(__NR_mount, src, tgt, fst, flags, data); +} + +static __attribute__((unused)) +int sys_open(const char *path, int flags, mode_t mode) +{ +#ifdef __NR_openat + return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); +#else + return my_syscall3(__NR_open, path, flags, mode); +#endif +} + +static __attribute__((unused)) +int sys_pivot_root(const char *new, const char *old) +{ + return my_syscall2(__NR_pivot_root, new, old); +} + +static __attribute__((unused)) +int sys_poll(struct pollfd *fds, int nfds, int timeout) +{ + return my_syscall3(__NR_poll, fds, nfds, timeout); +} + +static __attribute__((unused)) +ssize_t sys_read(int fd, void *buf, size_t count) +{ + return my_syscall3(__NR_read, fd, buf, count); +} + +static __attribute__((unused)) +ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) +{ + return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); +} + +static __attribute__((unused)) +int sys_sched_yield(void) +{ + return my_syscall0(__NR_sched_yield); +} + +static __attribute__((unused)) +int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ +#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) + struct sel_arg_struct { + unsigned long n; + fd_set *r, *w, *e; + struct timeval *t; + } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; + return my_syscall1(__NR_select, &arg); +#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) + struct timespec t; + + if (timeout) { + t.tv_sec = timeout->tv_sec; + t.tv_nsec = timeout->tv_usec * 1000; + } + return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); +#else +#ifndef __NR__newselect +#define __NR__newselect __NR_select +#endif + return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); +#endif +} + +static __attribute__((unused)) +int sys_setpgid(pid_t pid, pid_t pgid) +{ + return my_syscall2(__NR_setpgid, pid, pgid); +} + +static __attribute__((unused)) +pid_t sys_setsid(void) +{ + return my_syscall0(__NR_setsid); +} + +static __attribute__((unused)) +int sys_stat(const char *path, struct stat *buf) +{ + struct sys_stat_struct stat; + long ret; + +#ifdef __NR_newfstatat + /* only solution for arm64 */ + ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); +#else + ret = my_syscall2(__NR_stat, path, &stat); +#endif + buf->st_dev = stat.st_dev; + buf->st_ino = stat.st_ino; + buf->st_mode = stat.st_mode; + buf->st_nlink = stat.st_nlink; + buf->st_uid = stat.st_uid; + buf->st_gid = stat.st_gid; + buf->st_rdev = stat.st_rdev; + buf->st_size = stat.st_size; + buf->st_blksize = stat.st_blksize; + buf->st_blocks = stat.st_blocks; + buf->st_atime = stat.st_atime; + buf->st_mtime = stat.st_mtime; + buf->st_ctime = stat.st_ctime; + return ret; +} + + +static __attribute__((unused)) +int sys_symlink(const char *old, const char *new) +{ +#ifdef __NR_symlinkat + return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); +#else + return my_syscall2(__NR_symlink, old, new); +#endif +} + +static __attribute__((unused)) +mode_t sys_umask(mode_t mode) +{ + return my_syscall1(__NR_umask, mode); +} + +static __attribute__((unused)) +int sys_umount2(const char *path, int flags) +{ + return my_syscall2(__NR_umount2, path, flags); +} + +static __attribute__((unused)) +int sys_unlink(const char *path) +{ +#ifdef __NR_unlinkat + return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); +#else + return my_syscall1(__NR_unlink, path); +#endif +} + +static __attribute__((unused)) +pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + return my_syscall4(__NR_wait4, pid, status, options, rusage); +} + +static __attribute__((unused)) +pid_t sys_waitpid(pid_t pid, int *status, int options) +{ + return sys_wait4(pid, status, options, 0); +} + +static __attribute__((unused)) +pid_t sys_wait(int *status) +{ + return sys_waitpid(-1, status, 0); +} + +static __attribute__((unused)) +ssize_t sys_write(int fd, const void *buf, size_t count) +{ + return my_syscall3(__NR_write, fd, buf, count); +} + + +/* Below are the libc-compatible syscalls which return x or -1 and set errno. + * They rely on the functions above. Similarly they're marked static so that it + * is possible to assign pointers to them if needed. + */ + +static __attribute__((unused)) +int brk(void *addr) +{ + void *ret = sys_brk(addr); + + if (!ret) { + SET_ERRNO(ENOMEM); + return -1; + } + return 0; +} + +static __attribute__((noreturn,unused)) +void exit(int status) +{ + sys_exit(status); +} + +static __attribute__((unused)) +int chdir(const char *path) +{ + int ret = sys_chdir(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chmod(const char *path, mode_t mode) +{ + int ret = sys_chmod(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chown(const char *path, uid_t owner, gid_t group) +{ + int ret = sys_chown(path, owner, group); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int chroot(const char *path) +{ + int ret = sys_chroot(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int close(int fd) +{ + int ret = sys_close(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int dup2(int old, int new) +{ + int ret = sys_dup2(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int execve(const char *filename, char *const argv[], char *const envp[]) +{ + int ret = sys_execve(filename, argv, envp); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t fork(void) +{ + pid_t ret = sys_fork(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int fsync(int fd) +{ + int ret = sys_fsync(fd); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int getdents64(int fd, struct linux_dirent64 *dirp, int count) +{ + int ret = sys_getdents64(fd, dirp, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t getpgrp(void) +{ + pid_t ret = sys_getpgrp(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t getpid(void) +{ + pid_t ret = sys_getpid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int gettimeofday(struct timeval *tv, struct timezone *tz) +{ + int ret = sys_gettimeofday(tv, tz); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int ioctl(int fd, unsigned long req, void *value) +{ + int ret = sys_ioctl(fd, req, value); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int kill(pid_t pid, int signal) +{ + int ret = sys_kill(pid, signal); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int link(const char *old, const char *new) +{ + int ret = sys_link(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +off_t lseek(int fd, off_t offset, int whence) +{ + off_t ret = sys_lseek(fd, offset, whence); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mkdir(const char *path, mode_t mode) +{ + int ret = sys_mkdir(path, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mknod(const char *path, mode_t mode, dev_t dev) +{ + int ret = sys_mknod(path, mode, dev); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int mount(const char *src, const char *tgt, + const char *fst, unsigned long flags, + const void *data) +{ + int ret = sys_mount(src, tgt, fst, flags, data); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int open(const char *path, int flags, mode_t mode) +{ + int ret = sys_open(path, flags, mode); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int pivot_root(const char *new, const char *old) +{ + int ret = sys_pivot_root(new, old); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int poll(struct pollfd *fds, int nfds, int timeout) +{ + int ret = sys_poll(fds, nfds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +ssize_t read(int fd, void *buf, size_t count) +{ + ssize_t ret = sys_read(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int reboot(int cmd) +{ + int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +void *sbrk(intptr_t inc) +{ + void *ret; + + /* first call to find current end */ + if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) + return ret + inc; + + SET_ERRNO(ENOMEM); + return (void *)-1; +} + +static __attribute__((unused)) +int sched_yield(void) +{ + int ret = sys_sched_yield(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) +{ + int ret = sys_select(nfds, rfds, wfds, efds, timeout); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int setpgid(pid_t pid, pid_t pgid) +{ + int ret = sys_setpgid(pid, pgid); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t setsid(void) +{ + pid_t ret = sys_setsid(); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +unsigned int sleep(unsigned int seconds) +{ + struct timeval my_timeval = { seconds, 0 }; + + if (sys_select(0, 0, 0, 0, &my_timeval) < 0) + return my_timeval.tv_sec + !!my_timeval.tv_usec; + else + return 0; +} + +static __attribute__((unused)) +int stat(const char *path, struct stat *buf) +{ + int ret = sys_stat(path, buf); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int symlink(const char *old, const char *new) +{ + int ret = sys_symlink(old, new); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int tcsetpgrp(int fd, pid_t pid) +{ + return ioctl(fd, TIOCSPGRP, &pid); +} + +static __attribute__((unused)) +mode_t umask(mode_t mode) +{ + return sys_umask(mode); +} + +static __attribute__((unused)) +int umount2(const char *path, int flags) +{ + int ret = sys_umount2(path, flags); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +int unlink(const char *path) +{ + int ret = sys_unlink(path); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) +{ + pid_t ret = sys_wait4(pid, status, options, rusage); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t waitpid(pid_t pid, int *status, int options) +{ + pid_t ret = sys_waitpid(pid, status, options); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +pid_t wait(int *status) +{ + pid_t ret = sys_wait(status); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +static __attribute__((unused)) +ssize_t write(int fd, const void *buf, size_t count) +{ + ssize_t ret = sys_write(fd, buf, count); + + if (ret < 0) { + SET_ERRNO(-ret); + ret = -1; + } + return ret; +} + +/* some size-optimized reimplementations of a few common str* and mem* + * functions. They're marked static, except memcpy() and raise() which are used + * by libgcc on ARM, so they are marked weak instead in order not to cause an + * error when building a program made of multiple files (not recommended). + */ + +static __attribute__((unused)) +void *memmove(void *dst, const void *src, size_t len) +{ + ssize_t pos = (dst <= src) ? -1 : (long)len; + void *ret = dst; + + while (len--) { + pos += (dst <= src) ? 1 : -1; + ((char *)dst)[pos] = ((char *)src)[pos]; + } + return ret; +} + +static __attribute__((unused)) +void *memset(void *dst, int b, size_t len) +{ + char *p = dst; + + while (len--) + *(p++) = b; + return dst; +} + +static __attribute__((unused)) +int memcmp(const void *s1, const void *s2, size_t n) +{ + size_t ofs = 0; + char c1 = 0; + + while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { + ofs++; + } + return c1; +} + +static __attribute__((unused)) +char *strcpy(char *dst, const char *src) +{ + char *ret = dst; + + while ((*dst++ = *src++)); + return ret; +} + +static __attribute__((unused)) +char *strchr(const char *s, int c) +{ + while (*s) { + if (*s == (char)c) + return (char *)s; + s++; + } + return NULL; +} + +static __attribute__((unused)) +char *strrchr(const char *s, int c) +{ + const char *ret = NULL; + + while (*s) { + if (*s == (char)c) + ret = s; + s++; + } + return (char *)ret; +} + +static __attribute__((unused)) +size_t nolibc_strlen(const char *str) +{ + size_t len; + + for (len = 0; str[len]; len++); + return len; +} + +#define strlen(str) ({ \ + __builtin_constant_p((str)) ? \ + __builtin_strlen((str)) : \ + nolibc_strlen((str)); \ +}) + +static __attribute__((unused)) +int isdigit(int c) +{ + return (unsigned int)(c - '0') <= 9; +} + +static __attribute__((unused)) +long atol(const char *s) +{ + unsigned long ret = 0; + unsigned long d; + int neg = 0; + + if (*s == '-') { + neg = 1; + s++; + } + + while (1) { + d = (*s++) - '0'; + if (d > 9) + break; + ret *= 10; + ret += d; + } + + return neg ? -ret : ret; +} + +static __attribute__((unused)) +int atoi(const char *s) +{ + return atol(s); +} + +static __attribute__((unused)) +const char *ltoa(long in) +{ + /* large enough for -9223372036854775808 */ + static char buffer[21]; + char *pos = buffer + sizeof(buffer) - 1; + int neg = in < 0; + unsigned long n = neg ? -in : in; + + *pos-- = '\0'; + do { + *pos-- = '0' + n % 10; + n /= 10; + if (pos < buffer) + return pos + 1; + } while (n); + + if (neg) + *pos-- = '-'; + return pos + 1; +} + +__attribute__((weak,unused)) +void *memcpy(void *dst, const void *src, size_t len) +{ + return memmove(dst, src, len); +} + +/* needed by libgcc for divide by zero */ +__attribute__((weak,unused)) +int raise(int signal) +{ + return kill(getpid(), signal); +} + +/* Here come a few helper functions */ + +static __attribute__((unused)) +void FD_ZERO(fd_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +static __attribute__((unused)) +void FD_SET(int fd, fd_set *set) +{ + if (fd < 0 || fd >= FD_SETSIZE) + return; + set->fd32[fd / 32] |= 1 << (fd & 31); +} + +/* WARNING, it only deals with the 4096 first majors and 256 first minors */ +static __attribute__((unused)) +dev_t makedev(unsigned int major, unsigned int minor) +{ + return ((major & 0xfff) << 8) | (minor & 0xff); +} diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh index e79eb35c41e2..83552bb007b4 100755 --- a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh +++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh @@ -131,8 +131,8 @@ if echo -e "#if __x86_64__||__i386__||__i486__||__i586__||__i686__" \ | grep -q '^yes'; then # architecture supported by nolibc ${CROSS_COMPILE}gcc -fno-asynchronous-unwind-tables -fno-ident \ - -nostdlib -include ../bin/nolibc.h -lgcc -s -static -Os \ - -o init init.c + -nostdlib -include ../../../../include/nolibc/nolibc.h \ + -lgcc -s -static -Os -o init init.c else ${CROSS_COMPILE}gcc -s -static -Os -o init init.c fi diff --git a/tools/testing/selftests/rcutorture/bin/nolibc.h b/tools/testing/selftests/rcutorture/bin/nolibc.h deleted file mode 100644 index 1708e9f9f8aa..000000000000 --- a/tools/testing/selftests/rcutorture/bin/nolibc.h +++ /dev/null @@ -1,2263 +0,0 @@ -/* SPDX-License-Identifier: LGPL-2.1 OR MIT */ -/* nolibc.h - * Copyright (C) 2017-2018 Willy Tarreau - */ - -/* - * This file is designed to be used as a libc alternative for minimal programs - * with very limited requirements. It consists of a small number of syscall and - * type definitions, and the minimal startup code needed to call main(). - * All syscalls are declared as static functions so that they can be optimized - * away by the compiler when not used. - * - * Syscalls are split into 3 levels: - * - The lower level is the arch-specific syscall() definition, consisting in - * assembly code in compound expressions. These are called my_syscall0() to - * my_syscall6() depending on the number of arguments. The MIPS - * implementation is limited to 5 arguments. All input arguments are cast - * to a long stored in a register. These expressions always return the - * syscall's return value as a signed long value which is often either a - * pointer or the negated errno value. - * - * - The second level is mostly architecture-independent. It is made of - * static functions called sys_() which rely on my_syscallN() - * depending on the syscall definition. These functions are responsible - * for exposing the appropriate types for the syscall arguments (int, - * pointers, etc) and for setting the appropriate return type (often int). - * A few of them are architecture-specific because the syscalls are not all - * mapped exactly the same among architectures. For example, some archs do - * not implement select() and need pselect6() instead, so the sys_select() - * function will have to abstract this. - * - * - The third level is the libc call definition. It exposes the lower raw - * sys_() calls in a way that looks like what a libc usually does, - * takes care of specific input values, and of setting errno upon error. - * There can be minor variations compared to standard libc calls. For - * example the open() call always takes 3 args here. - * - * The errno variable is declared static and unused. This way it can be - * optimized away if not used. However this means that a program made of - * multiple C files may observe different errno values (one per C file). For - * the type of programs this project targets it usually is not a problem. The - * resulting program may even be reduced by defining the NOLIBC_IGNORE_ERRNO - * macro, in which case the errno value will never be assigned. - * - * Some stdint-like integer types are defined. These are valid on all currently - * supported architectures, because signs are enforced, ints are assumed to be - * 32 bits, longs the size of a pointer and long long 64 bits. If more - * architectures have to be supported, this may need to be adapted. - * - * Some macro definitions like the O_* values passed to open(), and some - * structures like the sys_stat struct depend on the architecture. - * - * The definitions start with the architecture-specific parts, which are picked - * based on what the compiler knows about the target architecture, and are - * completed with the generic code. Since it is the compiler which sets the - * target architecture, cross-compiling normally works out of the box without - * having to specify anything. - * - * Finally some very common libc-level functions are provided. It is the case - * for a few functions usually found in string.h, ctype.h, or stdlib.h. Nothing - * is currently provided regarding stdio emulation. - * - * The macro NOLIBC is always defined, so that it is possible for a program to - * check this macro to know if it is being built against and decide to disable - * some features or simply not to include some standard libc files. - * - * Ideally this file should be split in multiple files for easier long term - * maintenance, but provided as a single file as it is now, it's quite - * convenient to use. Maybe some variations involving a set of includes at the - * top could work. - * - * A simple static executable may be built this way : - * $ gcc -fno-asynchronous-unwind-tables -fno-ident -s -Os -nostdlib \ - * -static -include nolibc.h -lgcc -o hello hello.c - * - * A very useful calling convention table may be found here : - * http://man7.org/linux/man-pages/man2/syscall.2.html - * - * This doc is quite convenient though not necessarily up to date : - * https://w3challs.com/syscalls/ - * - */ - -/* Some archs (at least aarch64) don't expose the regular syscalls anymore by - * default, either because they have an "_at" replacement, or because there are - * more modern alternatives. For now we'd rather still use them. - */ -#define __ARCH_WANT_SYSCALL_NO_AT -#define __ARCH_WANT_SYSCALL_NO_FLAGS -#define __ARCH_WANT_SYSCALL_DEPRECATED - -#include -#include -#include -#include -#include - -#define NOLIBC - -/* this way it will be removed if unused */ -static int errno; - -#ifndef NOLIBC_IGNORE_ERRNO -#define SET_ERRNO(v) do { errno = (v); } while (0) -#else -#define SET_ERRNO(v) do { } while (0) -#endif - -/* errno codes all ensure that they will not conflict with a valid pointer - * because they all correspond to the highest addressable memry page. - */ -#define MAX_ERRNO 4095 - -/* Declare a few quite common macros and types that usually are in stdlib.h, - * stdint.h, ctype.h, unistd.h and a few other common locations. - */ - -#define NULL ((void *)0) - -/* stdint types */ -typedef unsigned char uint8_t; -typedef signed char int8_t; -typedef unsigned short uint16_t; -typedef signed short int16_t; -typedef unsigned int uint32_t; -typedef signed int int32_t; -typedef unsigned long long uint64_t; -typedef signed long long int64_t; -typedef unsigned long size_t; -typedef signed long ssize_t; -typedef unsigned long uintptr_t; -typedef signed long intptr_t; -typedef signed long ptrdiff_t; - -/* for stat() */ -typedef unsigned int dev_t; -typedef unsigned long ino_t; -typedef unsigned int mode_t; -typedef signed int pid_t; -typedef unsigned int uid_t; -typedef unsigned int gid_t; -typedef unsigned long nlink_t; -typedef signed long off_t; -typedef signed long blksize_t; -typedef signed long blkcnt_t; -typedef signed long time_t; - -/* for poll() */ -struct pollfd { - int fd; - short int events; - short int revents; -}; - -/* for select() */ -struct timeval { - long tv_sec; - long tv_usec; -}; - -/* for pselect() */ -struct timespec { - long tv_sec; - long tv_nsec; -}; - -/* for gettimeofday() */ -struct timezone { - int tz_minuteswest; - int tz_dsttime; -}; - -/* for getdents64() */ -struct linux_dirent64 { - uint64_t d_ino; - int64_t d_off; - unsigned short d_reclen; - unsigned char d_type; - char d_name[]; -}; - -/* commonly an fd_set represents 256 FDs */ -#define FD_SETSIZE 256 -typedef struct { uint32_t fd32[FD_SETSIZE/32]; } fd_set; - -/* needed by wait4() */ -struct rusage { - struct timeval ru_utime; - struct timeval ru_stime; - long ru_maxrss; - long ru_ixrss; - long ru_idrss; - long ru_isrss; - long ru_minflt; - long ru_majflt; - long ru_nswap; - long ru_inblock; - long ru_oublock; - long ru_msgsnd; - long ru_msgrcv; - long ru_nsignals; - long ru_nvcsw; - long ru_nivcsw; -}; - -/* stat flags (WARNING, octal here) */ -#define S_IFDIR 0040000 -#define S_IFCHR 0020000 -#define S_IFBLK 0060000 -#define S_IFREG 0100000 -#define S_IFIFO 0010000 -#define S_IFLNK 0120000 -#define S_IFSOCK 0140000 -#define S_IFMT 0170000 - -#define S_ISDIR(mode) (((mode) & S_IFDIR) == S_IFDIR) -#define S_ISCHR(mode) (((mode) & S_IFCHR) == S_IFCHR) -#define S_ISBLK(mode) (((mode) & S_IFBLK) == S_IFBLK) -#define S_ISREG(mode) (((mode) & S_IFREG) == S_IFREG) -#define S_ISFIFO(mode) (((mode) & S_IFIFO) == S_IFIFO) -#define S_ISLNK(mode) (((mode) & S_IFLNK) == S_IFLNK) -#define S_ISSOCK(mode) (((mode) & S_IFSOCK) == S_IFSOCK) - -#define DT_UNKNOWN 0 -#define DT_FIFO 1 -#define DT_CHR 2 -#define DT_DIR 4 -#define DT_BLK 6 -#define DT_REG 8 -#define DT_LNK 10 -#define DT_SOCK 12 - -/* all the *at functions */ -#ifndef AT_FDWCD -#define AT_FDCWD -100 -#endif - -/* lseek */ -#define SEEK_SET 0 -#define SEEK_CUR 1 -#define SEEK_END 2 - -/* reboot */ -#define LINUX_REBOOT_MAGIC1 0xfee1dead -#define LINUX_REBOOT_MAGIC2 0x28121969 -#define LINUX_REBOOT_CMD_HALT 0xcdef0123 -#define LINUX_REBOOT_CMD_POWER_OFF 0x4321fedc -#define LINUX_REBOOT_CMD_RESTART 0x01234567 -#define LINUX_REBOOT_CMD_SW_SUSPEND 0xd000fce2 - - -/* The format of the struct as returned by the libc to the application, which - * significantly differs from the format returned by the stat() syscall flavours. - */ -struct stat { - dev_t st_dev; /* ID of device containing file */ - ino_t st_ino; /* inode number */ - mode_t st_mode; /* protection */ - nlink_t st_nlink; /* number of hard links */ - uid_t st_uid; /* user ID of owner */ - gid_t st_gid; /* group ID of owner */ - dev_t st_rdev; /* device ID (if special file) */ - off_t st_size; /* total size, in bytes */ - blksize_t st_blksize; /* blocksize for file system I/O */ - blkcnt_t st_blocks; /* number of 512B blocks allocated */ - time_t st_atime; /* time of last access */ - time_t st_mtime; /* time of last modification */ - time_t st_ctime; /* time of last status change */ -}; - -#define WEXITSTATUS(status) (((status) & 0xff00) >> 8) -#define WIFEXITED(status) (((status) & 0x7f) == 0) - - -/* Below comes the architecture-specific code. For each architecture, we have - * the syscall declarations and the _start code definition. This is the only - * global part. On all architectures the kernel puts everything in the stack - * before jumping to _start just above us, without any return address (_start - * is not a function but an entry pint). So at the stack pointer we find argc. - * Then argv[] begins, and ends at the first NULL. Then we have envp which - * starts and ends with a NULL as well. So envp=argv+argc+1. - */ - -#if defined(__x86_64__) -/* Syscalls for x86_64 : - * - registers are 64-bit - * - syscall number is passed in rax - * - arguments are in rdi, rsi, rdx, r10, r8, r9 respectively - * - the system call is performed by calling the syscall instruction - * - syscall return comes in rax - * - rcx and r8..r11 may be clobbered, others are preserved. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - */ - -#define my_syscall0(num) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret) \ - : "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret) \ - : "r"(_arg1), \ - "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), \ - "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "0"(_num) \ - : "rcx", "r8", "r9", "r10", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret), "=r"(_arg4) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "0"(_num) \ - : "rcx", "r8", "r9", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "0"(_num) \ - : "rcx", "r9", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - long _ret; \ - register long _num asm("rax") = (num); \ - register long _arg1 asm("rdi") = (long)(arg1); \ - register long _arg2 asm("rsi") = (long)(arg2); \ - register long _arg3 asm("rdx") = (long)(arg3); \ - register long _arg4 asm("r10") = (long)(arg4); \ - register long _arg5 asm("r8") = (long)(arg5); \ - register long _arg6 asm("r9") = (long)(arg6); \ - \ - asm volatile ( \ - "syscall\n" \ - : "=a" (_ret), "=r"(_arg4), "=r"(_arg5) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_arg6), "0"(_num) \ - : "rcx", "r11", "memory", "cc" \ - ); \ - _ret; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "pop %rdi\n" // argc (first arg, %rdi) - "mov %rsp, %rsi\n" // argv[] (second arg, %rsi) - "lea 8(%rsi,%rdi,8),%rdx\n" // then a NULL then envp (third arg, %rdx) - "and $-16, %rsp\n" // x86 ABI : esp must be 16-byte aligned when - "sub $8, %rsp\n" // entering the callee - "call main\n" // main() returns the status code, we'll exit with it. - "movzb %al, %rdi\n" // retrieve exit code from 8 lower bits - "mov $60, %rax\n" // NR_exit == 60 - "syscall\n" // really exit - "hlt\n" // ensure it does not return - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall, equivalent to stat64(). The - * syscall returns 116 bytes and stops in the middle of __unused. - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned long st_nlink; - unsigned int st_mode; - unsigned int st_uid; - - unsigned int st_gid; - unsigned int __pad0; - unsigned long st_rdev; - long st_size; - long st_blksize; - - long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - long __unused[3]; -}; - -#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) -/* Syscalls for i386 : - * - mostly similar to x86_64 - * - registers are 32-bit - * - syscall number is passed in eax - * - arguments are in ebx, ecx, edx, esi, edi, ebp respectively - * - all registers are preserved (except eax of course) - * - the system call is performed by calling int $0x80 - * - syscall return comes in eax - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - * Also, i386 supports the old_select syscall if newselect is not available - */ -#define __ARCH_WANT_SYS_OLD_SELECT - -#define my_syscall0(num) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - long _ret; \ - register long _num asm("eax") = (num); \ - register long _arg1 asm("ebx") = (long)(arg1); \ - register long _arg2 asm("ecx") = (long)(arg2); \ - register long _arg3 asm("edx") = (long)(arg3); \ - register long _arg4 asm("esi") = (long)(arg4); \ - register long _arg5 asm("edi") = (long)(arg5); \ - \ - asm volatile ( \ - "int $0x80\n" \ - : "=a" (_ret) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "0"(_num) \ - : "memory", "cc" \ - ); \ - _ret; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "pop %eax\n" // argc (first arg, %eax) - "mov %esp, %ebx\n" // argv[] (second arg, %ebx) - "lea 4(%ebx,%eax,4),%ecx\n" // then a NULL then envp (third arg, %ecx) - "and $-16, %esp\n" // x86 ABI : esp must be 16-byte aligned when - "push %ecx\n" // push all registers on the stack so that we - "push %ebx\n" // support both regparm and plain stack modes - "push %eax\n" - "call main\n" // main() returns the status code in %eax - "movzbl %al, %ebx\n" // retrieve exit code from lower 8 bits - "movl $1, %eax\n" // NR_exit == 1 - "int $0x80\n" // exit now - "hlt\n" // ensure it does not - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall, 32-bit only, the syscall returns - * exactly 56 bytes (stops before the unused array). - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; - - unsigned long st_rdev; - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused[2]; -}; - -#elif defined(__ARM_EABI__) -/* Syscalls for ARM in ARM or Thumb modes : - * - registers are 32-bit - * - stack is 8-byte aligned - * ( http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.faqs/ka4127.html) - * - syscall number is passed in r7 - * - arguments are in r0, r1, r2, r3, r4, r5 - * - the system call is performed by calling svc #0 - * - syscall return comes in r0. - * - only lr is clobbered. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - the syscall number is always specified last in order to allow to force - * some registers before (gcc refuses a %-register at the last position). - * - * Also, ARM supports the old_select syscall if newselect is not available - */ -#define __ARCH_WANT_SYS_OLD_SELECT - -#define my_syscall0(num) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0"); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("r7") = (num); \ - register long _arg1 asm("r0") = (long)(arg1); \ - register long _arg2 asm("r1") = (long)(arg2); \ - register long _arg3 asm("r2") = (long)(arg3); \ - register long _arg4 asm("r3") = (long)(arg4); \ - register long _arg5 asm("r4") = (long)(arg5); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc", "lr" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" -#if defined(__THUMBEB__) || defined(__THUMBEL__) - /* We enter here in 32-bit mode but if some previous functions were in - * 16-bit mode, the assembler cannot know, so we need to tell it we're in - * 32-bit now, then switch to 16-bit (is there a better way to do it than - * adding 1 by hand ?) and tell the asm we're now in 16-bit mode so that - * it generates correct instructions. Note that we do not support thumb1. - */ - ".code 32\n" - "add r0, pc, #1\n" - "bx r0\n" - ".code 16\n" -#endif - "pop {%r0}\n" // argc was in the stack - "mov %r1, %sp\n" // argv = sp - "add %r2, %r1, %r0, lsl #2\n" // envp = argv + 4*argc ... - "add %r2, %r2, $4\n" // ... + 4 - "and %r3, %r1, $-8\n" // AAPCS : sp must be 8-byte aligned in the - "mov %sp, %r3\n" // callee, an bl doesn't push (lr=pc) - "bl main\n" // main() returns the status code, we'll exit with it. - "and %r0, %r0, $0xff\n" // limit exit code to 8 bits - "movs r7, $1\n" // NR_exit == 1 - "svc $0x00\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x4000 - -/* The struct returned by the stat() syscall, 32-bit only, the syscall returns - * exactly 56 bytes (stops before the unused array). In big endian, the format - * differs as devices are returned as short only. - */ -struct sys_stat_struct { -#if defined(__ARMEB__) - unsigned short st_dev; - unsigned short __pad1; -#else - unsigned long st_dev; -#endif - unsigned long st_ino; - unsigned short st_mode; - unsigned short st_nlink; - unsigned short st_uid; - unsigned short st_gid; -#if defined(__ARMEB__) - unsigned short st_rdev; - unsigned short __pad2; -#else - unsigned long st_rdev; -#endif - unsigned long st_size; - unsigned long st_blksize; - unsigned long st_blocks; - unsigned long st_atime; - unsigned long st_atime_nsec; - unsigned long st_mtime; - unsigned long st_mtime_nsec; - unsigned long st_ctime; - unsigned long st_ctime_nsec; - unsigned long __unused[2]; -}; - -#elif defined(__aarch64__) -/* Syscalls for AARCH64 : - * - registers are 64-bit - * - stack is 16-byte aligned - * - syscall number is passed in x8 - * - arguments are in x0, x1, x2, x3, x4, x5 - * - the system call is performed by calling svc 0 - * - syscall return comes in x0. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - * - * On aarch64, select() is not implemented so we have to use pselect6(). - */ -#define __ARCH_WANT_SYS_PSELECT6 - -#define my_syscall0(num) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0"); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r"(_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -#define my_syscall6(num, arg1, arg2, arg3, arg4, arg5, arg6) \ -({ \ - register long _num asm("x8") = (num); \ - register long _arg1 asm("x0") = (long)(arg1); \ - register long _arg2 asm("x1") = (long)(arg2); \ - register long _arg3 asm("x2") = (long)(arg3); \ - register long _arg4 asm("x3") = (long)(arg4); \ - register long _arg5 asm("x4") = (long)(arg5); \ - register long _arg6 asm("x5") = (long)(arg6); \ - \ - asm volatile ( \ - "svc #0\n" \ - : "=r" (_arg1) \ - : "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5), \ - "r"(_arg6), "r"(_num) \ - : "memory", "cc" \ - ); \ - _arg1; \ -}) - -/* startup code */ -asm(".section .text\n" - ".global _start\n" - "_start:\n" - "ldr x0, [sp]\n" // argc (x0) was in the stack - "add x1, sp, 8\n" // argv (x1) = sp - "lsl x2, x0, 3\n" // envp (x2) = 8*argc ... - "add x2, x2, 8\n" // + 8 (skip null) - "add x2, x2, x1\n" // + argv - "and sp, x1, -16\n" // sp must be 16-byte aligned in the callee - "bl main\n" // main() returns the status code, we'll exit with it. - "and x0, x0, 0xff\n" // limit exit code to 8 bits - "mov x8, 93\n" // NR_exit == 93 - "svc #0\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_CREAT 0x40 -#define O_EXCL 0x80 -#define O_NOCTTY 0x100 -#define O_TRUNC 0x200 -#define O_APPEND 0x400 -#define O_NONBLOCK 0x800 -#define O_DIRECTORY 0x4000 - -/* The struct returned by the newfstatat() syscall. Differs slightly from the - * x86_64's stat one by field ordering, so be careful. - */ -struct sys_stat_struct { - unsigned long st_dev; - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned int st_uid; - unsigned int st_gid; - - unsigned long st_rdev; - unsigned long __pad1; - long st_size; - int st_blksize; - int __pad2; - - long st_blocks; - long st_atime; - unsigned long st_atime_nsec; - long st_mtime; - - unsigned long st_mtime_nsec; - long st_ctime; - unsigned long st_ctime_nsec; - unsigned int __unused[2]; -}; - -#elif defined(__mips__) && defined(_ABIO32) -/* Syscalls for MIPS ABI O32 : - * - WARNING! there's always a delayed slot! - * - WARNING again, the syntax is different, registers take a '$' and numbers - * do not. - * - registers are 32-bit - * - stack is 8-byte aligned - * - syscall number is passed in v0 (starts at 0xfa0). - * - arguments are in a0, a1, a2, a3, then the stack. The caller needs to - * leave some room in the stack for the callee to save a0..a3 if needed. - * - Many registers are clobbered, in fact only a0..a2 and s0..s8 are - * preserved. See: https://www.linux-mips.org/wiki/Syscall as well as - * scall32-o32.S in the kernel sources. - * - the system call is performed by calling "syscall" - * - syscall return comes in v0, and register a3 needs to be checked to know - * if an error occured, in which case errno is in v0. - * - the arguments are cast to long and assigned into the target registers - * which are then simply passed as registers to the asm code, so that we - * don't have to experience issues with register constraints. - */ - -#define my_syscall0(num) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "r"(_num) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall1(num, arg1) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall2(num, arg1, arg2) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall3(num, arg1, arg2, arg3) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3"); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r"(_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall4(num, arg1, arg2, arg3, arg4) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "syscall\n" \ - "addiu $sp, $sp, 32\n" \ - : "=r" (_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -#define my_syscall5(num, arg1, arg2, arg3, arg4, arg5) \ -({ \ - register long _num asm("v0") = (num); \ - register long _arg1 asm("a0") = (long)(arg1); \ - register long _arg2 asm("a1") = (long)(arg2); \ - register long _arg3 asm("a2") = (long)(arg3); \ - register long _arg4 asm("a3") = (long)(arg4); \ - register long _arg5 = (long)(arg5); \ - \ - asm volatile ( \ - "addiu $sp, $sp, -32\n" \ - "sw %7, 16($sp)\n" \ - "syscall\n " \ - "addiu $sp, $sp, 32\n" \ - : "=r" (_num), "=r"(_arg4) \ - : "0"(_num), \ - "r"(_arg1), "r"(_arg2), "r"(_arg3), "r"(_arg4), "r"(_arg5) \ - : "memory", "cc", "at", "v1", "hi", "lo", \ - "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9" \ - ); \ - _arg4 ? -_num : _num; \ -}) - -/* startup code, note that it's called __start on MIPS */ -asm(".section .text\n" - ".set nomips16\n" - ".global __start\n" - ".set noreorder\n" - ".option pic0\n" - ".ent __start\n" - "__start:\n" - "lw $a0,($sp)\n" // argc was in the stack - "addiu $a1, $sp, 4\n" // argv = sp + 4 - "sll $a2, $a0, 2\n" // a2 = argc * 4 - "add $a2, $a2, $a1\n" // envp = argv + 4*argc ... - "addiu $a2, $a2, 4\n" // ... + 4 - "li $t0, -8\n" - "and $sp, $sp, $t0\n" // sp must be 8-byte aligned - "addiu $sp,$sp,-16\n" // the callee expects to save a0..a3 there! - "jal main\n" // main() returns the status code, we'll exit with it. - "nop\n" // delayed slot - "and $a0, $v0, 0xff\n" // limit exit code to 8 bits - "li $v0, 4001\n" // NR_exit == 4001 - "syscall\n" - ".end __start\n" - ""); - -/* fcntl / open */ -#define O_RDONLY 0 -#define O_WRONLY 1 -#define O_RDWR 2 -#define O_APPEND 0x0008 -#define O_NONBLOCK 0x0080 -#define O_CREAT 0x0100 -#define O_TRUNC 0x0200 -#define O_EXCL 0x0400 -#define O_NOCTTY 0x0800 -#define O_DIRECTORY 0x10000 - -/* The struct returned by the stat() syscall. 88 bytes are returned by the - * syscall. - */ -struct sys_stat_struct { - unsigned int st_dev; - long st_pad1[3]; - unsigned long st_ino; - unsigned int st_mode; - unsigned int st_nlink; - unsigned int st_uid; - unsigned int st_gid; - unsigned int st_rdev; - long st_pad2[2]; - long st_size; - long st_pad3; - long st_atime; - long st_atime_nsec; - long st_mtime; - long st_mtime_nsec; - long st_ctime; - long st_ctime_nsec; - long st_blksize; - long st_blocks; - long st_pad4[14]; -}; - -#endif - - -/* Below are the C functions used to declare the raw syscalls. They try to be - * architecture-agnostic, and return either a success or -errno. Declaring them - * static will lead to them being inlined in most cases, but it's still possible - * to reference them by a pointer if needed. - */ -static __attribute__((unused)) -void *sys_brk(void *addr) -{ - return (void *)my_syscall1(__NR_brk, addr); -} - -static __attribute__((noreturn,unused)) -void sys_exit(int status) -{ - my_syscall1(__NR_exit, status & 255); - while(1); // shut the "noreturn" warnings. -} - -static __attribute__((unused)) -int sys_chdir(const char *path) -{ - return my_syscall1(__NR_chdir, path); -} - -static __attribute__((unused)) -int sys_chmod(const char *path, mode_t mode) -{ -#ifdef __NR_fchmodat - return my_syscall4(__NR_fchmodat, AT_FDCWD, path, mode, 0); -#else - return my_syscall2(__NR_chmod, path, mode); -#endif -} - -static __attribute__((unused)) -int sys_chown(const char *path, uid_t owner, gid_t group) -{ -#ifdef __NR_fchownat - return my_syscall5(__NR_fchownat, AT_FDCWD, path, owner, group, 0); -#else - return my_syscall3(__NR_chown, path, owner, group); -#endif -} - -static __attribute__((unused)) -int sys_chroot(const char *path) -{ - return my_syscall1(__NR_chroot, path); -} - -static __attribute__((unused)) -int sys_close(int fd) -{ - return my_syscall1(__NR_close, fd); -} - -static __attribute__((unused)) -int sys_dup(int fd) -{ - return my_syscall1(__NR_dup, fd); -} - -static __attribute__((unused)) -int sys_dup2(int old, int new) -{ - return my_syscall2(__NR_dup2, old, new); -} - -static __attribute__((unused)) -int sys_execve(const char *filename, char *const argv[], char *const envp[]) -{ - return my_syscall3(__NR_execve, filename, argv, envp); -} - -static __attribute__((unused)) -pid_t sys_fork(void) -{ - return my_syscall0(__NR_fork); -} - -static __attribute__((unused)) -int sys_fsync(int fd) -{ - return my_syscall1(__NR_fsync, fd); -} - -static __attribute__((unused)) -int sys_getdents64(int fd, struct linux_dirent64 *dirp, int count) -{ - return my_syscall3(__NR_getdents64, fd, dirp, count); -} - -static __attribute__((unused)) -pid_t sys_getpgrp(void) -{ - return my_syscall0(__NR_getpgrp); -} - -static __attribute__((unused)) -pid_t sys_getpid(void) -{ - return my_syscall0(__NR_getpid); -} - -static __attribute__((unused)) -int sys_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - return my_syscall2(__NR_gettimeofday, tv, tz); -} - -static __attribute__((unused)) -int sys_ioctl(int fd, unsigned long req, void *value) -{ - return my_syscall3(__NR_ioctl, fd, req, value); -} - -static __attribute__((unused)) -int sys_kill(pid_t pid, int signal) -{ - return my_syscall2(__NR_kill, pid, signal); -} - -static __attribute__((unused)) -int sys_link(const char *old, const char *new) -{ -#ifdef __NR_linkat - return my_syscall5(__NR_linkat, AT_FDCWD, old, AT_FDCWD, new, 0); -#else - return my_syscall2(__NR_link, old, new); -#endif -} - -static __attribute__((unused)) -off_t sys_lseek(int fd, off_t offset, int whence) -{ - return my_syscall3(__NR_lseek, fd, offset, whence); -} - -static __attribute__((unused)) -int sys_mkdir(const char *path, mode_t mode) -{ -#ifdef __NR_mkdirat - return my_syscall3(__NR_mkdirat, AT_FDCWD, path, mode); -#else - return my_syscall2(__NR_mkdir, path, mode); -#endif -} - -static __attribute__((unused)) -long sys_mknod(const char *path, mode_t mode, dev_t dev) -{ -#ifdef __NR_mknodat - return my_syscall4(__NR_mknodat, AT_FDCWD, path, mode, dev); -#else - return my_syscall3(__NR_mknod, path, mode, dev); -#endif -} - -static __attribute__((unused)) -int sys_mount(const char *src, const char *tgt, const char *fst, - unsigned long flags, const void *data) -{ - return my_syscall5(__NR_mount, src, tgt, fst, flags, data); -} - -static __attribute__((unused)) -int sys_open(const char *path, int flags, mode_t mode) -{ -#ifdef __NR_openat - return my_syscall4(__NR_openat, AT_FDCWD, path, flags, mode); -#else - return my_syscall3(__NR_open, path, flags, mode); -#endif -} - -static __attribute__((unused)) -int sys_pivot_root(const char *new, const char *old) -{ - return my_syscall2(__NR_pivot_root, new, old); -} - -static __attribute__((unused)) -int sys_poll(struct pollfd *fds, int nfds, int timeout) -{ - return my_syscall3(__NR_poll, fds, nfds, timeout); -} - -static __attribute__((unused)) -ssize_t sys_read(int fd, void *buf, size_t count) -{ - return my_syscall3(__NR_read, fd, buf, count); -} - -static __attribute__((unused)) -ssize_t sys_reboot(int magic1, int magic2, int cmd, void *arg) -{ - return my_syscall4(__NR_reboot, magic1, magic2, cmd, arg); -} - -static __attribute__((unused)) -int sys_sched_yield(void) -{ - return my_syscall0(__NR_sched_yield); -} - -static __attribute__((unused)) -int sys_select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ -#if defined(__ARCH_WANT_SYS_OLD_SELECT) && !defined(__NR__newselect) - struct sel_arg_struct { - unsigned long n; - fd_set *r, *w, *e; - struct timeval *t; - } arg = { .n = nfds, .r = rfds, .w = wfds, .e = efds, .t = timeout }; - return my_syscall1(__NR_select, &arg); -#elif defined(__ARCH_WANT_SYS_PSELECT6) && defined(__NR_pselect6) - struct timespec t; - - if (timeout) { - t.tv_sec = timeout->tv_sec; - t.tv_nsec = timeout->tv_usec * 1000; - } - return my_syscall6(__NR_pselect6, nfds, rfds, wfds, efds, timeout ? &t : NULL, NULL); -#else -#ifndef __NR__newselect -#define __NR__newselect __NR_select -#endif - return my_syscall5(__NR__newselect, nfds, rfds, wfds, efds, timeout); -#endif -} - -static __attribute__((unused)) -int sys_setpgid(pid_t pid, pid_t pgid) -{ - return my_syscall2(__NR_setpgid, pid, pgid); -} - -static __attribute__((unused)) -pid_t sys_setsid(void) -{ - return my_syscall0(__NR_setsid); -} - -static __attribute__((unused)) -int sys_stat(const char *path, struct stat *buf) -{ - struct sys_stat_struct stat; - long ret; - -#ifdef __NR_newfstatat - /* only solution for arm64 */ - ret = my_syscall4(__NR_newfstatat, AT_FDCWD, path, &stat, 0); -#else - ret = my_syscall2(__NR_stat, path, &stat); -#endif - buf->st_dev = stat.st_dev; - buf->st_ino = stat.st_ino; - buf->st_mode = stat.st_mode; - buf->st_nlink = stat.st_nlink; - buf->st_uid = stat.st_uid; - buf->st_gid = stat.st_gid; - buf->st_rdev = stat.st_rdev; - buf->st_size = stat.st_size; - buf->st_blksize = stat.st_blksize; - buf->st_blocks = stat.st_blocks; - buf->st_atime = stat.st_atime; - buf->st_mtime = stat.st_mtime; - buf->st_ctime = stat.st_ctime; - return ret; -} - - -static __attribute__((unused)) -int sys_symlink(const char *old, const char *new) -{ -#ifdef __NR_symlinkat - return my_syscall3(__NR_symlinkat, old, AT_FDCWD, new); -#else - return my_syscall2(__NR_symlink, old, new); -#endif -} - -static __attribute__((unused)) -mode_t sys_umask(mode_t mode) -{ - return my_syscall1(__NR_umask, mode); -} - -static __attribute__((unused)) -int sys_umount2(const char *path, int flags) -{ - return my_syscall2(__NR_umount2, path, flags); -} - -static __attribute__((unused)) -int sys_unlink(const char *path) -{ -#ifdef __NR_unlinkat - return my_syscall3(__NR_unlinkat, AT_FDCWD, path, 0); -#else - return my_syscall1(__NR_unlink, path); -#endif -} - -static __attribute__((unused)) -pid_t sys_wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - return my_syscall4(__NR_wait4, pid, status, options, rusage); -} - -static __attribute__((unused)) -pid_t sys_waitpid(pid_t pid, int *status, int options) -{ - return sys_wait4(pid, status, options, 0); -} - -static __attribute__((unused)) -pid_t sys_wait(int *status) -{ - return sys_waitpid(-1, status, 0); -} - -static __attribute__((unused)) -ssize_t sys_write(int fd, const void *buf, size_t count) -{ - return my_syscall3(__NR_write, fd, buf, count); -} - - -/* Below are the libc-compatible syscalls which return x or -1 and set errno. - * They rely on the functions above. Similarly they're marked static so that it - * is possible to assign pointers to them if needed. - */ - -static __attribute__((unused)) -int brk(void *addr) -{ - void *ret = sys_brk(addr); - - if (!ret) { - SET_ERRNO(ENOMEM); - return -1; - } - return 0; -} - -static __attribute__((noreturn,unused)) -void exit(int status) -{ - sys_exit(status); -} - -static __attribute__((unused)) -int chdir(const char *path) -{ - int ret = sys_chdir(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chmod(const char *path, mode_t mode) -{ - int ret = sys_chmod(path, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chown(const char *path, uid_t owner, gid_t group) -{ - int ret = sys_chown(path, owner, group); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int chroot(const char *path) -{ - int ret = sys_chroot(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int close(int fd) -{ - int ret = sys_close(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int dup2(int old, int new) -{ - int ret = sys_dup2(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int execve(const char *filename, char *const argv[], char *const envp[]) -{ - int ret = sys_execve(filename, argv, envp); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t fork(void) -{ - pid_t ret = sys_fork(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int fsync(int fd) -{ - int ret = sys_fsync(fd); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int getdents64(int fd, struct linux_dirent64 *dirp, int count) -{ - int ret = sys_getdents64(fd, dirp, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpgrp(void) -{ - pid_t ret = sys_getpgrp(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t getpid(void) -{ - pid_t ret = sys_getpid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int gettimeofday(struct timeval *tv, struct timezone *tz) -{ - int ret = sys_gettimeofday(tv, tz); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int ioctl(int fd, unsigned long req, void *value) -{ - int ret = sys_ioctl(fd, req, value); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int kill(pid_t pid, int signal) -{ - int ret = sys_kill(pid, signal); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int link(const char *old, const char *new) -{ - int ret = sys_link(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -off_t lseek(int fd, off_t offset, int whence) -{ - off_t ret = sys_lseek(fd, offset, whence); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mkdir(const char *path, mode_t mode) -{ - int ret = sys_mkdir(path, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mknod(const char *path, mode_t mode, dev_t dev) -{ - int ret = sys_mknod(path, mode, dev); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int mount(const char *src, const char *tgt, - const char *fst, unsigned long flags, - const void *data) -{ - int ret = sys_mount(src, tgt, fst, flags, data); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int open(const char *path, int flags, mode_t mode) -{ - int ret = sys_open(path, flags, mode); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int pivot_root(const char *new, const char *old) -{ - int ret = sys_pivot_root(new, old); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int poll(struct pollfd *fds, int nfds, int timeout) -{ - int ret = sys_poll(fds, nfds, timeout); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -ssize_t read(int fd, void *buf, size_t count) -{ - ssize_t ret = sys_read(fd, buf, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int reboot(int cmd) -{ - int ret = sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, cmd, 0); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -void *sbrk(intptr_t inc) -{ - void *ret; - - /* first call to find current end */ - if ((ret = sys_brk(0)) && (sys_brk(ret + inc) == ret + inc)) - return ret + inc; - - SET_ERRNO(ENOMEM); - return (void *)-1; -} - -static __attribute__((unused)) -int sched_yield(void) -{ - int ret = sys_sched_yield(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int select(int nfds, fd_set *rfds, fd_set *wfds, fd_set *efds, struct timeval *timeout) -{ - int ret = sys_select(nfds, rfds, wfds, efds, timeout); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int setpgid(pid_t pid, pid_t pgid) -{ - int ret = sys_setpgid(pid, pgid); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t setsid(void) -{ - pid_t ret = sys_setsid(); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -unsigned int sleep(unsigned int seconds) -{ - struct timeval my_timeval = { seconds, 0 }; - - if (sys_select(0, 0, 0, 0, &my_timeval) < 0) - return my_timeval.tv_sec + !!my_timeval.tv_usec; - else - return 0; -} - -static __attribute__((unused)) -int stat(const char *path, struct stat *buf) -{ - int ret = sys_stat(path, buf); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int symlink(const char *old, const char *new) -{ - int ret = sys_symlink(old, new); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int tcsetpgrp(int fd, pid_t pid) -{ - return ioctl(fd, TIOCSPGRP, &pid); -} - -static __attribute__((unused)) -mode_t umask(mode_t mode) -{ - return sys_umask(mode); -} - -static __attribute__((unused)) -int umount2(const char *path, int flags) -{ - int ret = sys_umount2(path, flags); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -int unlink(const char *path) -{ - int ret = sys_unlink(path); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t wait4(pid_t pid, int *status, int options, struct rusage *rusage) -{ - pid_t ret = sys_wait4(pid, status, options, rusage); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t waitpid(pid_t pid, int *status, int options) -{ - pid_t ret = sys_waitpid(pid, status, options); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -pid_t wait(int *status) -{ - pid_t ret = sys_wait(status); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -static __attribute__((unused)) -ssize_t write(int fd, const void *buf, size_t count) -{ - ssize_t ret = sys_write(fd, buf, count); - - if (ret < 0) { - SET_ERRNO(-ret); - ret = -1; - } - return ret; -} - -/* some size-optimized reimplementations of a few common str* and mem* - * functions. They're marked static, except memcpy() and raise() which are used - * by libgcc on ARM, so they are marked weak instead in order not to cause an - * error when building a program made of multiple files (not recommended). - */ - -static __attribute__((unused)) -void *memmove(void *dst, const void *src, size_t len) -{ - ssize_t pos = (dst <= src) ? -1 : (long)len; - void *ret = dst; - - while (len--) { - pos += (dst <= src) ? 1 : -1; - ((char *)dst)[pos] = ((char *)src)[pos]; - } - return ret; -} - -static __attribute__((unused)) -void *memset(void *dst, int b, size_t len) -{ - char *p = dst; - - while (len--) - *(p++) = b; - return dst; -} - -static __attribute__((unused)) -int memcmp(const void *s1, const void *s2, size_t n) -{ - size_t ofs = 0; - char c1 = 0; - - while (ofs < n && !(c1 = ((char *)s1)[ofs] - ((char *)s2)[ofs])) { - ofs++; - } - return c1; -} - -static __attribute__((unused)) -char *strcpy(char *dst, const char *src) -{ - char *ret = dst; - - while ((*dst++ = *src++)); - return ret; -} - -static __attribute__((unused)) -char *strchr(const char *s, int c) -{ - while (*s) { - if (*s == (char)c) - return (char *)s; - s++; - } - return NULL; -} - -static __attribute__((unused)) -char *strrchr(const char *s, int c) -{ - const char *ret = NULL; - - while (*s) { - if (*s == (char)c) - ret = s; - s++; - } - return (char *)ret; -} - -static __attribute__((unused)) -size_t nolibc_strlen(const char *str) -{ - size_t len; - - for (len = 0; str[len]; len++); - return len; -} - -#define strlen(str) ({ \ - __builtin_constant_p((str)) ? \ - __builtin_strlen((str)) : \ - nolibc_strlen((str)); \ -}) - -static __attribute__((unused)) -int isdigit(int c) -{ - return (unsigned int)(c - '0') <= 9; -} - -static __attribute__((unused)) -long atol(const char *s) -{ - unsigned long ret = 0; - unsigned long d; - int neg = 0; - - if (*s == '-') { - neg = 1; - s++; - } - - while (1) { - d = (*s++) - '0'; - if (d > 9) - break; - ret *= 10; - ret += d; - } - - return neg ? -ret : ret; -} - -static __attribute__((unused)) -int atoi(const char *s) -{ - return atol(s); -} - -static __attribute__((unused)) -const char *ltoa(long in) -{ - /* large enough for -9223372036854775808 */ - static char buffer[21]; - char *pos = buffer + sizeof(buffer) - 1; - int neg = in < 0; - unsigned long n = neg ? -in : in; - - *pos-- = '\0'; - do { - *pos-- = '0' + n % 10; - n /= 10; - if (pos < buffer) - return pos + 1; - } while (n); - - if (neg) - *pos-- = '-'; - return pos + 1; -} - -__attribute__((weak,unused)) -void *memcpy(void *dst, const void *src, size_t len) -{ - return memmove(dst, src, len); -} - -/* needed by libgcc for divide by zero */ -__attribute__((weak,unused)) -int raise(int signal) -{ - return kill(getpid(), signal); -} - -/* Here come a few helper functions */ - -static __attribute__((unused)) -void FD_ZERO(fd_set *set) -{ - memset(set, 0, sizeof(*set)); -} - -static __attribute__((unused)) -void FD_SET(int fd, fd_set *set) -{ - if (fd < 0 || fd >= FD_SETSIZE) - return; - set->fd32[fd / 32] |= 1 << (fd & 31); -} - -/* WARNING, it only deals with the 4096 first majors and 256 first minors */ -static __attribute__((unused)) -dev_t makedev(unsigned int major, unsigned int minor) -{ - return ((major & 0xfff) << 8) | (minor & 0xff); -} -- cgit v1.3-7-g2ca7 From 71368af9027f18fe5d1c6f372cfdff7e4bde8b48 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Jan 2019 17:01:36 -0500 Subject: x86/speculation: Add PR_SPEC_DISABLE_NOEXEC With the default SPEC_STORE_BYPASS_SECCOMP/SPEC_STORE_BYPASS_PRCTL mode, the TIF_SSBD bit will be inherited when a new task is fork'ed or cloned. It will also remain when a new program is execve'ed. Only certain class of applications (like Java) that can run on behalf of multiple users on a single thread will require disabling speculative store bypass for security purposes. Those applications will call prctl(2) at startup time to disable SSB. They won't rely on the fact the SSB might have been disabled. Other applications that don't need SSBD will just move on without checking if SSBD has been turned on or not. The fact that the TIF_SSBD is inherited across execve(2) boundary will cause performance of applications that don't need SSBD but their predecessors have SSBD on to be unwittingly impacted especially if they write to memory a lot. To remedy this problem, a new PR_SPEC_DISABLE_NOEXEC argument for the PR_SET_SPECULATION_CTRL option of prctl(2) is added to allow applications to specify that the SSBD feature bit on the task structure should be cleared whenever a new program is being execve'ed. Suggested-by: Thomas Gleixner Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: David Woodhouse Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Tim Chen Cc: KarimAllah Ahmed Cc: Peter Zijlstra Cc: Konrad Rzeszutek Wilk Link: https://lkml.kernel.org/r/1547676096-3281-1-git-send-email-longman@redhat.com --- Documentation/userspace-api/spec_ctrl.rst | 27 +++++++++++++++------------ arch/x86/kernel/cpu/bugs.c | 12 ++++++++++++ arch/x86/kernel/process.c | 12 ++++++++++++ include/linux/sched.h | 5 +++++ include/uapi/linux/prctl.h | 1 + tools/include/uapi/linux/prctl.h | 1 + 6 files changed, 46 insertions(+), 12 deletions(-) (limited to 'tools/include') diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index c4dbe6f7cdae..1129c7550a48 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -28,18 +28,20 @@ PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature which is selected with arg2 of prctl(2). The return value uses bits 0-3 with the following meaning: -==== ===================== =================================================== -Bit Define Description -==== ===================== =================================================== -0 PR_SPEC_PRCTL Mitigation can be controlled per task by - PR_SET_SPECULATION_CTRL. -1 PR_SPEC_ENABLE The speculation feature is enabled, mitigation is - disabled. -2 PR_SPEC_DISABLE The speculation feature is disabled, mitigation is - enabled. -3 PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A - subsequent prctl(..., PR_SPEC_ENABLE) will fail. -==== ===================== =================================================== +==== ====================== ================================================== +Bit Define Description +==== ====================== ================================================== +0 PR_SPEC_PRCTL Mitigation can be controlled per task by + PR_SET_SPECULATION_CTRL. +1 PR_SPEC_ENABLE The speculation feature is enabled, mitigation is + disabled. +2 PR_SPEC_DISABLE The speculation feature is disabled, mitigation is + enabled. +3 PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A + subsequent prctl(..., PR_SPEC_ENABLE) will fail. +4 PR_SPEC_DISABLE_NOEXEC Same as PR_SPEC_DISABLE, but the state will be + cleared on :manpage:`execve(2)`. +==== ====================== ================================================== If all bits are 0 the CPU is not affected by the speculation misfeature. @@ -92,6 +94,7 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE_NOEXEC, 0, 0); - PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes (Mitigate Spectre V2 style attacks against user processes) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1de0f4170178..2faeaf46347a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -798,15 +798,25 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) if (task_spec_ssb_force_disable(task)) return -EPERM; task_clear_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE: task_set_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_FORCE_DISABLE: task_set_spec_ssb_disable(task); task_set_spec_ssb_force_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_set_spec_ssb_disable(task); + task_set_spec_ssb_noexec(task); task_update_spec_tif(task); break; default: @@ -885,6 +895,8 @@ static int ssb_prctl_get(struct task_struct *task) case SPEC_STORE_BYPASS_PRCTL: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 90ae0ca51083..58ac7be52c7a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -255,6 +255,18 @@ void arch_setup_new_exec(void) /* If cpuid was previously disabled for this task, re-enable it. */ if (test_thread_flag(TIF_NOCPUID)) enable_cpuid(); + + /* + * Don't inherit TIF_SSBD across exec boundary when + * PR_SPEC_DISABLE_NOEXEC is used. + */ + if (test_thread_flag(TIF_SSBD) && + task_spec_ssb_noexec(current)) { + clear_thread_flag(TIF_SSBD); + task_clear_spec_ssb_disable(current); + task_clear_spec_ssb_noexec(current); + speculation_ctrl_update(task_thread_info(current)->flags); + } } static inline void switch_to_bitmap(struct thread_struct *prev, diff --git a/include/linux/sched.h b/include/linux/sched.h index d2f90fa92468..fc836dc71bba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1459,6 +1459,7 @@ static inline bool is_percpu_thread(void) #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ +#define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ @@ -1487,6 +1488,10 @@ TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) + TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index b4875a93363a..094bb03b9cc2 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -219,6 +219,7 @@ struct prctl_mm_map { # define PR_SPEC_ENABLE (1UL << 1) # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) /* Reset arm64 pointer authentication keys */ #define PR_PAC_RESET_KEYS 54 diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index b4875a93363a..094bb03b9cc2 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -219,6 +219,7 @@ struct prctl_mm_map { # define PR_SPEC_ENABLE (1UL << 1) # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) /* Reset arm64 pointer authentication keys */ #define PR_PAC_RESET_KEYS 54 -- cgit v1.3-7-g2ca7 From 721074b03411327e7bf41555d4cc7c18f49313f7 Mon Sep 17 00:00:00 2001 From: Patrick Lerda Date: Thu, 17 Jan 2019 03:50:13 -0500 Subject: media: rc: rcmm decoder and encoder media: add support for RCMM infrared remote controls. Signed-off-by: Patrick Lerda Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/lirc.h.rst.exceptions | 3 + MAINTAINERS | 5 + drivers/media/rc/Kconfig | 13 ++ drivers/media/rc/Makefile | 1 + drivers/media/rc/ir-rcmm-decoder.c | 254 ++++++++++++++++++++++++++++++ drivers/media/rc/rc-core-priv.h | 5 + drivers/media/rc/rc-main.c | 9 ++ include/media/rc-map.h | 14 +- include/uapi/linux/lirc.h | 6 + tools/include/uapi/linux/lirc.h | 12 ++ tools/testing/selftests/ir/ir_loopback.c | 9 ++ 11 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 drivers/media/rc/ir-rcmm-decoder.c (limited to 'tools/include') diff --git a/Documentation/media/lirc.h.rst.exceptions b/Documentation/media/lirc.h.rst.exceptions index 379b9e7df5d0..7a8b8ff4f076 100644 --- a/Documentation/media/lirc.h.rst.exceptions +++ b/Documentation/media/lirc.h.rst.exceptions @@ -60,6 +60,9 @@ ignore symbol RC_PROTO_SHARP ignore symbol RC_PROTO_XMP ignore symbol RC_PROTO_CEC ignore symbol RC_PROTO_IMON +ignore symbol RC_PROTO_RCMM12 +ignore symbol RC_PROTO_RCMM24 +ignore symbol RC_PROTO_RCMM32 # Undocumented macros diff --git a/MAINTAINERS b/MAINTAINERS index 914d333ee73d..9662ad6c58e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16536,6 +16536,11 @@ M: David Härdeman S: Maintained F: drivers/media/rc/winbond-cir.c +RCMM REMOTE CONTROLS DECODER +M: Patrick Lerda +S: Maintained +F: drivers/media/rc/ir-rcmm-decoder.c + WINSYSTEMS EBC-C384 WATCHDOG DRIVER M: William Breathitt Gray L: linux-watchdog@vger.kernel.org diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 8a216068a35a..8e95f6eac89d 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -133,6 +133,19 @@ config IR_IMON_DECODER remote control and you would like to use it with a raw IR receiver, or if you wish to use an encoder to transmit this IR. +config IR_RCMM_DECODER + tristate "Enable IR raw decoder for the RC-MM protocol" + depends on RC_CORE + help + Enable this option when you have IR with RC-MM protocol, and + you need the software decoder. The driver supports 12, + 24 and 32 bits RC-MM variants. You can enable or disable the + different modes using the following RC protocol keywords: + 'rc-mm-12', 'rc-mm-24' and 'rc-mm-32'. + + To compile this driver as a module, choose M here: the module + will be called ir-rcmm-decoder. + endif #RC_DECODERS menuconfig RC_DEVICES diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index 92c163816849..48d23433b3c0 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_IR_SHARP_DECODER) += ir-sharp-decoder.o obj-$(CONFIG_IR_MCE_KBD_DECODER) += ir-mce_kbd-decoder.o obj-$(CONFIG_IR_XMP_DECODER) += ir-xmp-decoder.o obj-$(CONFIG_IR_IMON_DECODER) += ir-imon-decoder.o +obj-$(CONFIG_IR_RCMM_DECODER) += ir-rcmm-decoder.o # stand-alone IR receivers/transmitters obj-$(CONFIG_RC_ATI_REMOTE) += ati_remote.o diff --git a/drivers/media/rc/ir-rcmm-decoder.c b/drivers/media/rc/ir-rcmm-decoder.c new file mode 100644 index 000000000000..f1096ac1e5c5 --- /dev/null +++ b/drivers/media/rc/ir-rcmm-decoder.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0+ +// ir-rcmm-decoder.c - A decoder for the RCMM IR protocol +// +// Copyright (C) 2018 by Patrick Lerda + +#include "rc-core-priv.h" +#include +#include + +#define RCMM_UNIT 166667 /* nanosecs */ +#define RCMM_PREFIX_PULSE 416666 /* 166666.666666666*2.5 */ +#define RCMM_PULSE_0 277777 /* 166666.666666666*(1+2/3) */ +#define RCMM_PULSE_1 444444 /* 166666.666666666*(2+2/3) */ +#define RCMM_PULSE_2 611111 /* 166666.666666666*(3+2/3) */ +#define RCMM_PULSE_3 777778 /* 166666.666666666*(4+2/3) */ + +enum rcmm_state { + STATE_INACTIVE, + STATE_LOW, + STATE_BUMP, + STATE_VALUE, + STATE_FINISHED, +}; + +static bool rcmm_mode(const struct rcmm_dec *data) +{ + return !((0x000c0000 & data->bits) == 0x000c0000); +} + +static int rcmm_miscmode(struct rc_dev *dev, struct rcmm_dec *data) +{ + switch (data->count) { + case 24: + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM24) { + rc_keydown(dev, RC_PROTO_RCMM24, data->bits, 0); + data->state = STATE_INACTIVE; + return 0; + } + return -1; + + case 12: + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM12) { + rc_keydown(dev, RC_PROTO_RCMM12, data->bits, 0); + data->state = STATE_INACTIVE; + return 0; + } + return -1; + } + + return -1; +} + +/** + * ir_rcmm_decode() - Decode one RCMM pulse or space + * @dev: the struct rc_dev descriptor of the device + * @ev: the struct ir_raw_event descriptor of the pulse/space + * + * This function returns -EINVAL if the pulse violates the state machine + */ +static int ir_rcmm_decode(struct rc_dev *dev, struct ir_raw_event ev) +{ + struct rcmm_dec *data = &dev->raw->rcmm; + u32 scancode; + u8 toggle; + int value; + + if (!(dev->enabled_protocols & (RC_PROTO_BIT_RCMM32 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM12))) + return 0; + + if (!is_timing_event(ev)) { + if (ev.reset) + data->state = STATE_INACTIVE; + return 0; + } + + switch (data->state) { + case STATE_INACTIVE: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_PREFIX_PULSE, RCMM_UNIT / 2)) + break; + + data->state = STATE_LOW; + data->count = 0; + data->bits = 0; + return 0; + + case STATE_LOW: + if (ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2)) + break; + + data->state = STATE_BUMP; + return 0; + + case STATE_BUMP: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2)) + break; + + data->state = STATE_VALUE; + return 0; + + case STATE_VALUE: + if (ev.pulse) + break; + + if (eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2)) + value = 0; + else if (eq_margin(ev.duration, RCMM_PULSE_1, RCMM_UNIT / 2)) + value = 1; + else if (eq_margin(ev.duration, RCMM_PULSE_2, RCMM_UNIT / 2)) + value = 2; + else if (eq_margin(ev.duration, RCMM_PULSE_3, RCMM_UNIT / 2)) + value = 3; + else + value = -1; + + if (value == -1) { + if (!rcmm_miscmode(dev, data)) + return 0; + break; + } + + data->bits <<= 2; + data->bits |= value; + + data->count += 2; + + if (data->count < 32) + data->state = STATE_BUMP; + else + data->state = STATE_FINISHED; + + return 0; + + case STATE_FINISHED: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2)) + break; + + if (rcmm_mode(data)) { + toggle = !!(0x8000 & data->bits); + scancode = data->bits & ~0x8000; + } else { + toggle = 0; + scancode = data->bits; + } + + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM32) { + rc_keydown(dev, RC_PROTO_RCMM32, scancode, toggle); + data->state = STATE_INACTIVE; + return 0; + } + + break; + } + + data->state = STATE_INACTIVE; + return -EINVAL; +} + +static const int rcmmspace[] = { + RCMM_PULSE_0, + RCMM_PULSE_1, + RCMM_PULSE_2, + RCMM_PULSE_3, +}; + +static int ir_rcmm_rawencoder(struct ir_raw_event **ev, unsigned int max, + unsigned int n, u32 data) +{ + int i; + int ret; + + ret = ir_raw_gen_pulse_space(ev, &max, RCMM_PREFIX_PULSE, RCMM_PULSE_0); + if (ret) + return ret; + + for (i = n - 2; i >= 0; i -= 2) { + const unsigned int space = rcmmspace[(data >> i) & 3]; + + ret = ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, space); + if (ret) + return ret; + } + + return ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, RCMM_PULSE_3 * 2); +} + +static int ir_rcmm_encode(enum rc_proto protocol, u32 scancode, + struct ir_raw_event *events, unsigned int max) +{ + struct ir_raw_event *e = events; + int ret; + + switch (protocol) { + case RC_PROTO_RCMM32: + ret = ir_rcmm_rawencoder(&e, max, 32, scancode); + break; + case RC_PROTO_RCMM24: + ret = ir_rcmm_rawencoder(&e, max, 24, scancode); + break; + case RC_PROTO_RCMM12: + ret = ir_rcmm_rawencoder(&e, max, 12, scancode); + break; + default: + ret = -EINVAL; + } + + if (ret < 0) + return ret; + + return e - events; +} + +static struct ir_raw_handler rcmm_handler = { + .protocols = RC_PROTO_BIT_RCMM32 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM12, + .decode = ir_rcmm_decode, + .encode = ir_rcmm_encode, + .carrier = 36000, + .min_timeout = RCMM_PULSE_3 + RCMM_UNIT, +}; + +static int __init ir_rcmm_decode_init(void) +{ + ir_raw_handler_register(&rcmm_handler); + + pr_info("IR RCMM protocol handler initialized\n"); + return 0; +} + +static void __exit ir_rcmm_decode_exit(void) +{ + ir_raw_handler_unregister(&rcmm_handler); +} + +module_init(ir_rcmm_decode_init); +module_exit(ir_rcmm_decode_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick Lerda"); +MODULE_DESCRIPTION("RCMM IR protocol decoder"); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index c2cbe7f6266c..9f21b3e8b377 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -131,6 +131,11 @@ struct ir_raw_event_ctrl { unsigned int bits; bool stick_keyboard; } imon; + struct rcmm_dec { + int state; + unsigned int count; + u32 bits; + } rcmm; }; /* Mutex for locking raw IR processing and handler change */ diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 66a174979b3c..dc38e9c0a2ff 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -70,6 +70,12 @@ static const struct { [RC_PROTO_CEC] = { .name = "cec", .repeat_period = 0 }, [RC_PROTO_IMON] = { .name = "imon", .scancode_bits = 0x7fffffff, .repeat_period = 114 }, + [RC_PROTO_RCMM12] = { .name = "rc-mm-12", + .scancode_bits = 0x00000fff, .repeat_period = 114 }, + [RC_PROTO_RCMM24] = { .name = "rc-mm-24", + .scancode_bits = 0x00ffffff, .repeat_period = 114 }, + [RC_PROTO_RCMM32] = { .name = "rc-mm-32", + .scancode_bits = 0xffffffff, .repeat_period = 114 }, }; /* Used to keep track of known keymaps */ @@ -1006,6 +1012,9 @@ static const struct { { RC_PROTO_BIT_XMP, "xmp", "ir-xmp-decoder" }, { RC_PROTO_BIT_CEC, "cec", NULL }, { RC_PROTO_BIT_IMON, "imon", "ir-imon-decoder" }, + { RC_PROTO_BIT_RCMM12 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM32, "rc-mm", "ir-rcmm-decoder" }, }; /** diff --git a/include/media/rc-map.h b/include/media/rc-map.h index d621acadfbf3..e5e86d595645 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -37,6 +37,9 @@ #define RC_PROTO_BIT_XMP BIT_ULL(RC_PROTO_XMP) #define RC_PROTO_BIT_CEC BIT_ULL(RC_PROTO_CEC) #define RC_PROTO_BIT_IMON BIT_ULL(RC_PROTO_IMON) +#define RC_PROTO_BIT_RCMM12 BIT_ULL(RC_PROTO_RCMM12) +#define RC_PROTO_BIT_RCMM24 BIT_ULL(RC_PROTO_RCMM24) +#define RC_PROTO_BIT_RCMM32 BIT_ULL(RC_PROTO_RCMM32) #define RC_PROTO_BIT_ALL \ (RC_PROTO_BIT_UNKNOWN | RC_PROTO_BIT_OTHER | \ @@ -51,7 +54,8 @@ RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC | \ - RC_PROTO_BIT_IMON) + RC_PROTO_BIT_IMON | RC_PROTO_BIT_RCMM12 | \ + RC_PROTO_BIT_RCMM24 | RC_PROTO_BIT_RCMM32) /* All rc protocols for which we have decoders */ #define RC_PROTO_BIT_ALL_IR_DECODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ @@ -64,7 +68,9 @@ RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ - RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON) + RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON | \ + RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \ + RC_PROTO_BIT_RCMM32) #define RC_PROTO_BIT_ALL_IR_ENCODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ @@ -77,7 +83,9 @@ RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | \ RC_PROTO_BIT_RC6_6A_32 | RC_PROTO_BIT_RC6_MCE | \ - RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON) + RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON | \ + RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \ + RC_PROTO_BIT_RCMM32) #define RC_SCANCODE_UNKNOWN(x) (x) #define RC_SCANCODE_OTHER(x) (x) diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 6b319581882f..45fcbf99d72e 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -192,6 +192,9 @@ struct lirc_scancode { * @RC_PROTO_XMP: XMP protocol * @RC_PROTO_CEC: CEC protocol * @RC_PROTO_IMON: iMon Pad protocol + * @RC_PROTO_RCMM12: RC-MM protocol 12 bits + * @RC_PROTO_RCMM24: RC-MM protocol 24 bits + * @RC_PROTO_RCMM32: RC-MM protocol 32 bits */ enum rc_proto { RC_PROTO_UNKNOWN = 0, @@ -218,6 +221,9 @@ enum rc_proto { RC_PROTO_XMP = 21, RC_PROTO_CEC = 22, RC_PROTO_IMON = 23, + RC_PROTO_RCMM12 = 24, + RC_PROTO_RCMM24 = 25, + RC_PROTO_RCMM32 = 26, }; #endif diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h index f189931042a7..45fcbf99d72e 100644 --- a/tools/include/uapi/linux/lirc.h +++ b/tools/include/uapi/linux/lirc.h @@ -133,6 +133,12 @@ #define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) +/* + * Return the recording timeout, which is either set by + * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols. + */ +#define LIRC_GET_REC_TIMEOUT _IOR('i', 0x00000024, __u32) + /* * struct lirc_scancode - decoded scancode with protocol for use with * LIRC_MODE_SCANCODE @@ -186,6 +192,9 @@ struct lirc_scancode { * @RC_PROTO_XMP: XMP protocol * @RC_PROTO_CEC: CEC protocol * @RC_PROTO_IMON: iMon Pad protocol + * @RC_PROTO_RCMM12: RC-MM protocol 12 bits + * @RC_PROTO_RCMM24: RC-MM protocol 24 bits + * @RC_PROTO_RCMM32: RC-MM protocol 32 bits */ enum rc_proto { RC_PROTO_UNKNOWN = 0, @@ -212,6 +221,9 @@ enum rc_proto { RC_PROTO_XMP = 21, RC_PROTO_CEC = 22, RC_PROTO_IMON = 23, + RC_PROTO_RCMM12 = 24, + RC_PROTO_RCMM24 = 25, + RC_PROTO_RCMM32 = 26, }; #endif diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c index 858c19caf224..570a7358942c 100644 --- a/tools/testing/selftests/ir/ir_loopback.c +++ b/tools/testing/selftests/ir/ir_loopback.c @@ -51,6 +51,10 @@ static const struct { { RC_PROTO_RC6_6A_32, "rc-6-6a-32", 0xffffffff, "rc-6" }, { RC_PROTO_RC6_MCE, "rc-6-mce", 0x00007fff, "rc-6" }, { RC_PROTO_SHARP, "sharp", 0x1fff, "sharp" }, + { RC_PROTO_IMON, "imon", 0x7fffffff, "imon" }, + { RC_PROTO_RCMM12, "rcmm-12", 0x00000fff, "rcmm" }, + { RC_PROTO_RCMM24, "rcmm-24", 0x00ffffff, "rcmm" }, + { RC_PROTO_RCMM32, "rcmm-32", 0xffffffff, "rcmm" }, }; int lirc_open(const char *rc) @@ -139,6 +143,11 @@ int main(int argc, char **argv) (((scancode >> 8) ^ ~scancode) & 0xff) == 0) continue; + if (rc_proto == RC_PROTO_RCMM32 && + (scancode & 0x000c0000) != 0x000c0000 && + scancode & 0x00008000) + continue; + struct lirc_scancode lsc = { .rc_proto = rc_proto, .scancode = scancode -- cgit v1.3-7-g2ca7 From 7c9eefe82ca1efec5890678c33e66d5d520c06f4 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 5 Mar 2019 15:43:01 -0800 Subject: tools/: replace open encodings for NUMA_NO_NODE This replaces all open encodings in tools with NUMA_NO_NODE. Also linux/numa.h is now needed for the perf build. [sfr@canb.auug.org.au: fix for replace open encodings for NUMA_NO_NODE] Link: http://lkml.kernel.org/r/20190108131141.730e9c4f@canb.auug.org.au Link: http://lkml.kernel.org/r/1545127933-10711-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Stephen Rothwell Signed-off-by: Anshuman Khandual Signed-off-by: Stephen Rothwell Cc: David Hildenbrand Cc: Doug Ledford [drivers/infiniband] Cc: Hans Verkuil Cc: Jeff Kirsher [ixgbe] Cc: Jens Axboe [mtip32xx] Cc: Joseph Qi Cc: Michael Ellerman [powerpc] Cc: Vinod Koul [dmaengine.c] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/numa.h | 16 ++++++++++++++++ tools/perf/bench/numa.c | 7 ++++--- 2 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 tools/include/linux/numa.h (limited to 'tools/include') diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h new file mode 100644 index 000000000000..110b0e5d0fb0 --- /dev/null +++ b/tools/include/linux/numa.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_NUMA_H +#define _LINUX_NUMA_H + + +#ifdef CONFIG_NODES_SHIFT +#define NODES_SHIFT CONFIG_NODES_SHIFT +#else +#define NODES_SHIFT 0 +#endif + +#define MAX_NUMNODES (1 << NODES_SHIFT) + +#define NUMA_NO_NODE (-1) + +#endif /* _LINUX_NUMA_H */ diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 44195514b19e..98ad783efc69 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -298,7 +299,7 @@ static cpu_set_t bind_to_node(int target_node) CPU_ZERO(&mask); - if (target_node == -1) { + if (target_node == NUMA_NO_NODE) { for (cpu = 0; cpu < g->p.nr_cpus; cpu++) CPU_SET(cpu, &mask); } else { @@ -339,7 +340,7 @@ static void bind_to_memnode(int node) unsigned long nodemask; int ret; - if (node == -1) + if (node == NUMA_NO_NODE) return; BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); @@ -1363,7 +1364,7 @@ static void init_thread_data(void) int cpu; /* Allow all nodes by default: */ - td->bind_node = -1; + td->bind_node = NUMA_NO_NODE; /* Allow all CPUs by default: */ CPU_ZERO(&td->bind_cpumask); -- cgit v1.3-7-g2ca7 From 586187d7de71b4da7956ba588ae42253b9ff6482 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Mar 2019 23:31:26 -0700 Subject: Drop flex_arrays All existing users have been converted to generic radix trees Link: http://lkml.kernel.org/r/20181217131929.11727-8-kent.overstreet@gmail.com Signed-off-by: Kent Overstreet Acked-by: Dave Hansen Cc: Alexey Dobriyan Cc: Al Viro Cc: Eric Paris Cc: Marcelo Ricardo Leitner Cc: Matthew Wilcox Cc: Neil Horman Cc: Paul Moore Cc: Pravin B Shelar Cc: Shaohua Li Cc: Stephen Smalley Cc: Vlad Yasevich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/flexible-arrays.rst | 130 ---------- Documentation/flexible-arrays.txt | 123 --------- include/linux/flex_array.h | 149 ----------- include/linux/poison.h | 3 - lib/Makefile | 2 +- lib/flex_array.c | 398 ----------------------------- tools/include/linux/poison.h | 3 - 7 files changed, 1 insertion(+), 807 deletions(-) delete mode 100644 Documentation/core-api/flexible-arrays.rst delete mode 100644 Documentation/flexible-arrays.txt delete mode 100644 include/linux/flex_array.h delete mode 100644 lib/flex_array.c (limited to 'tools/include') diff --git a/Documentation/core-api/flexible-arrays.rst b/Documentation/core-api/flexible-arrays.rst deleted file mode 100644 index b6b85a1b518e..000000000000 --- a/Documentation/core-api/flexible-arrays.rst +++ /dev/null @@ -1,130 +0,0 @@ - -=================================== -Using flexible arrays in the kernel -=================================== - -Large contiguous memory allocations can be unreliable in the Linux kernel. -Kernel programmers will sometimes respond to this problem by allocating -pages with :c:func:`vmalloc()`. This solution not ideal, though. On 32-bit -systems, memory from vmalloc() must be mapped into a relatively small address -space; it's easy to run out. On SMP systems, the page table changes required -by vmalloc() allocations can require expensive cross-processor interrupts on -all CPUs. And, on all systems, use of space in the vmalloc() range increases -pressure on the translation lookaside buffer (TLB), reducing the performance -of the system. - -In many cases, the need for memory from vmalloc() can be eliminated by piecing -together an array from smaller parts; the flexible array library exists to make -this task easier. - -A flexible array holds an arbitrary (within limits) number of fixed-sized -objects, accessed via an integer index. Sparse arrays are handled -reasonably well. Only single-page allocations are made, so memory -allocation failures should be relatively rare. The down sides are that the -arrays cannot be indexed directly, individual object size cannot exceed the -system page size, and putting data into a flexible array requires a copy -operation. It's also worth noting that flexible arrays do no internal -locking at all; if concurrent access to an array is possible, then the -caller must arrange for appropriate mutual exclusion. - -The creation of a flexible array is done with :c:func:`flex_array_alloc()`:: - - #include - - struct flex_array *flex_array_alloc(int element_size, - unsigned int total, - gfp_t flags); - -The individual object size is provided by ``element_size``, while total is the -maximum number of objects which can be stored in the array. The flags -argument is passed directly to the internal memory allocation calls. With -the current code, using flags to ask for high memory is likely to lead to -notably unpleasant side effects. - -It is also possible to define flexible arrays at compile time with:: - - DEFINE_FLEX_ARRAY(name, element_size, total); - -This macro will result in a definition of an array with the given name; the -element size and total will be checked for validity at compile time. - -Storing data into a flexible array is accomplished with a call to -:c:func:`flex_array_put()`:: - - int flex_array_put(struct flex_array *array, unsigned int element_nr, - void *src, gfp_t flags); - -This call will copy the data from src into the array, in the position -indicated by ``element_nr`` (which must be less than the maximum specified when -the array was created). If any memory allocations must be performed, flags -will be used. The return value is zero on success, a negative error code -otherwise. - -There might possibly be a need to store data into a flexible array while -running in some sort of atomic context; in this situation, sleeping in the -memory allocator would be a bad thing. That can be avoided by using -``GFP_ATOMIC`` for the flags value, but, often, there is a better way. The -trick is to ensure that any needed memory allocations are done before -entering atomic context, using :c:func:`flex_array_prealloc()`:: - - int flex_array_prealloc(struct flex_array *array, unsigned int start, - unsigned int nr_elements, gfp_t flags); - -This function will ensure that memory for the elements indexed in the range -defined by ``start`` and ``nr_elements`` has been allocated. Thereafter, a -``flex_array_put()`` call on an element in that range is guaranteed not to -block. - -Getting data back out of the array is done with :c:func:`flex_array_get()`:: - - void *flex_array_get(struct flex_array *fa, unsigned int element_nr); - -The return value is a pointer to the data element, or NULL if that -particular element has never been allocated. - -Note that it is possible to get back a valid pointer for an element which -has never been stored in the array. Memory for array elements is allocated -one page at a time; a single allocation could provide memory for several -adjacent elements. Flexible array elements are normally initialized to the -value ``FLEX_ARRAY_FREE`` (defined as 0x6c in ), so errors -involving that number probably result from use of unstored array entries. -Note that, if array elements are allocated with ``__GFP_ZERO``, they will be -initialized to zero and this poisoning will not happen. - -Individual elements in the array can be cleared with -:c:func:`flex_array_clear()`:: - - int flex_array_clear(struct flex_array *array, unsigned int element_nr); - -This function will set the given element to ``FLEX_ARRAY_FREE`` and return -zero. If storage for the indicated element is not allocated for the array, -``flex_array_clear()`` will return ``-EINVAL`` instead. Note that clearing an -element does not release the storage associated with it; to reduce the -allocated size of an array, call :c:func:`flex_array_shrink()`:: - - int flex_array_shrink(struct flex_array *array); - -The return value will be the number of pages of memory actually freed. -This function works by scanning the array for pages containing nothing but -``FLEX_ARRAY_FREE`` bytes, so (1) it can be expensive, and (2) it will not work -if the array's pages are allocated with ``__GFP_ZERO``. - -It is possible to remove all elements of an array with a call to -:c:func:`flex_array_free_parts()`:: - - void flex_array_free_parts(struct flex_array *array); - -This call frees all elements, but leaves the array itself in place. -Freeing the entire array is done with :c:func:`flex_array_free()`:: - - void flex_array_free(struct flex_array *array); - -As of this writing, there are no users of flexible arrays in the mainline -kernel. The functions described here are also not exported to modules; -that will probably be fixed when somebody comes up with a need for it. - - -Flexible array functions ------------------------- - -.. kernel-doc:: include/linux/flex_array.h diff --git a/Documentation/flexible-arrays.txt b/Documentation/flexible-arrays.txt deleted file mode 100644 index a0f2989dd804..000000000000 --- a/Documentation/flexible-arrays.txt +++ /dev/null @@ -1,123 +0,0 @@ -=================================== -Using flexible arrays in the kernel -=================================== - -:Updated: Last updated for 2.6.32 -:Author: Jonathan Corbet - -Large contiguous memory allocations can be unreliable in the Linux kernel. -Kernel programmers will sometimes respond to this problem by allocating -pages with vmalloc(). This solution not ideal, though. On 32-bit systems, -memory from vmalloc() must be mapped into a relatively small address space; -it's easy to run out. On SMP systems, the page table changes required by -vmalloc() allocations can require expensive cross-processor interrupts on -all CPUs. And, on all systems, use of space in the vmalloc() range -increases pressure on the translation lookaside buffer (TLB), reducing the -performance of the system. - -In many cases, the need for memory from vmalloc() can be eliminated by -piecing together an array from smaller parts; the flexible array library -exists to make this task easier. - -A flexible array holds an arbitrary (within limits) number of fixed-sized -objects, accessed via an integer index. Sparse arrays are handled -reasonably well. Only single-page allocations are made, so memory -allocation failures should be relatively rare. The down sides are that the -arrays cannot be indexed directly, individual object size cannot exceed the -system page size, and putting data into a flexible array requires a copy -operation. It's also worth noting that flexible arrays do no internal -locking at all; if concurrent access to an array is possible, then the -caller must arrange for appropriate mutual exclusion. - -The creation of a flexible array is done with:: - - #include - - struct flex_array *flex_array_alloc(int element_size, - unsigned int total, - gfp_t flags); - -The individual object size is provided by element_size, while total is the -maximum number of objects which can be stored in the array. The flags -argument is passed directly to the internal memory allocation calls. With -the current code, using flags to ask for high memory is likely to lead to -notably unpleasant side effects. - -It is also possible to define flexible arrays at compile time with:: - - DEFINE_FLEX_ARRAY(name, element_size, total); - -This macro will result in a definition of an array with the given name; the -element size and total will be checked for validity at compile time. - -Storing data into a flexible array is accomplished with a call to:: - - int flex_array_put(struct flex_array *array, unsigned int element_nr, - void *src, gfp_t flags); - -This call will copy the data from src into the array, in the position -indicated by element_nr (which must be less than the maximum specified when -the array was created). If any memory allocations must be performed, flags -will be used. The return value is zero on success, a negative error code -otherwise. - -There might possibly be a need to store data into a flexible array while -running in some sort of atomic context; in this situation, sleeping in the -memory allocator would be a bad thing. That can be avoided by using -GFP_ATOMIC for the flags value, but, often, there is a better way. The -trick is to ensure that any needed memory allocations are done before -entering atomic context, using:: - - int flex_array_prealloc(struct flex_array *array, unsigned int start, - unsigned int nr_elements, gfp_t flags); - -This function will ensure that memory for the elements indexed in the range -defined by start and nr_elements has been allocated. Thereafter, a -flex_array_put() call on an element in that range is guaranteed not to -block. - -Getting data back out of the array is done with:: - - void *flex_array_get(struct flex_array *fa, unsigned int element_nr); - -The return value is a pointer to the data element, or NULL if that -particular element has never been allocated. - -Note that it is possible to get back a valid pointer for an element which -has never been stored in the array. Memory for array elements is allocated -one page at a time; a single allocation could provide memory for several -adjacent elements. Flexible array elements are normally initialized to the -value FLEX_ARRAY_FREE (defined as 0x6c in ), so errors -involving that number probably result from use of unstored array entries. -Note that, if array elements are allocated with __GFP_ZERO, they will be -initialized to zero and this poisoning will not happen. - -Individual elements in the array can be cleared with:: - - int flex_array_clear(struct flex_array *array, unsigned int element_nr); - -This function will set the given element to FLEX_ARRAY_FREE and return -zero. If storage for the indicated element is not allocated for the array, -flex_array_clear() will return -EINVAL instead. Note that clearing an -element does not release the storage associated with it; to reduce the -allocated size of an array, call:: - - int flex_array_shrink(struct flex_array *array); - -The return value will be the number of pages of memory actually freed. -This function works by scanning the array for pages containing nothing but -FLEX_ARRAY_FREE bytes, so (1) it can be expensive, and (2) it will not work -if the array's pages are allocated with __GFP_ZERO. - -It is possible to remove all elements of an array with a call to:: - - void flex_array_free_parts(struct flex_array *array); - -This call frees all elements, but leaves the array itself in place. -Freeing the entire array is done with:: - - void flex_array_free(struct flex_array *array); - -As of this writing, there are no users of flexible arrays in the mainline -kernel. The functions described here are also not exported to modules; -that will probably be fixed when somebody comes up with a need for it. diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h deleted file mode 100644 index b94fa61b51fb..000000000000 --- a/include/linux/flex_array.h +++ /dev/null @@ -1,149 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _FLEX_ARRAY_H -#define _FLEX_ARRAY_H - -#include -#include -#include - -#define FLEX_ARRAY_PART_SIZE PAGE_SIZE -#define FLEX_ARRAY_BASE_SIZE PAGE_SIZE - -struct flex_array_part; - -/* - * This is meant to replace cases where an array-like - * structure has gotten too big to fit into kmalloc() - * and the developer is getting tempted to use - * vmalloc(). - */ - -struct flex_array { - union { - struct { - int element_size; - int total_nr_elements; - int elems_per_part; - struct reciprocal_value reciprocal_elems; - struct flex_array_part *parts[]; - }; - /* - * This little trick makes sure that - * sizeof(flex_array) == PAGE_SIZE - */ - char padding[FLEX_ARRAY_BASE_SIZE]; - }; -}; - -/* Number of bytes left in base struct flex_array, excluding metadata */ -#define FLEX_ARRAY_BASE_BYTES_LEFT \ - (FLEX_ARRAY_BASE_SIZE - offsetof(struct flex_array, parts)) - -/* Number of pointers in base to struct flex_array_part pages */ -#define FLEX_ARRAY_NR_BASE_PTRS \ - (FLEX_ARRAY_BASE_BYTES_LEFT / sizeof(struct flex_array_part *)) - -/* Number of elements of size that fit in struct flex_array_part */ -#define FLEX_ARRAY_ELEMENTS_PER_PART(size) \ - (FLEX_ARRAY_PART_SIZE / size) - -/* - * Defines a statically allocated flex array and ensures its parameters are - * valid. - */ -#define DEFINE_FLEX_ARRAY(__arrayname, __element_size, __total) \ - struct flex_array __arrayname = { { { \ - .element_size = (__element_size), \ - .total_nr_elements = (__total), \ - } } }; \ - static inline void __arrayname##_invalid_parameter(void) \ - { \ - BUILD_BUG_ON((__total) > FLEX_ARRAY_NR_BASE_PTRS * \ - FLEX_ARRAY_ELEMENTS_PER_PART(__element_size)); \ - } - -/** - * flex_array_alloc() - Creates a flexible array. - * @element_size: individual object size. - * @total: maximum number of objects which can be stored. - * @flags: GFP flags - * - * Return: Returns an object of structure flex_array. - */ -struct flex_array *flex_array_alloc(int element_size, unsigned int total, - gfp_t flags); - -/** - * flex_array_prealloc() - Ensures that memory for the elements indexed in the - * range defined by start and nr_elements has been allocated. - * @fa: array to allocate memory to. - * @start: start address - * @nr_elements: number of elements to be allocated. - * @flags: GFP flags - * - */ -int flex_array_prealloc(struct flex_array *fa, unsigned int start, - unsigned int nr_elements, gfp_t flags); - -/** - * flex_array_free() - Removes all elements of a flexible array. - * @fa: array to be freed. - */ -void flex_array_free(struct flex_array *fa); - -/** - * flex_array_free_parts() - Removes all elements of a flexible array, but - * leaves the array itself in place. - * @fa: array to be emptied. - */ -void flex_array_free_parts(struct flex_array *fa); - -/** - * flex_array_put() - Stores data into a flexible array. - * @fa: array where element is to be stored. - * @element_nr: position to copy, must be less than the maximum specified when - * the array was created. - * @src: data source to be copied into the array. - * @flags: GFP flags - * - * Return: Returns zero on success, a negative error code otherwise. - */ -int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, - gfp_t flags); - -/** - * flex_array_clear() - Clears an individual element in the array, sets the - * given element to FLEX_ARRAY_FREE. - * @element_nr: element position to clear. - * @fa: array to which element to be cleared belongs. - * - * Return: Returns zero on success, -EINVAL otherwise. - */ -int flex_array_clear(struct flex_array *fa, unsigned int element_nr); - -/** - * flex_array_get() - Retrieves data into a flexible array. - * - * @element_nr: Element position to retrieve data from. - * @fa: array from which data is to be retrieved. - * - * Return: Returns a pointer to the data element, or NULL if that - * particular element has never been allocated. - */ -void *flex_array_get(struct flex_array *fa, unsigned int element_nr); - -/** - * flex_array_shrink() - Reduces the allocated size of an array. - * @fa: array to shrink. - * - * Return: Returns number of pages of memory actually freed. - * - */ -int flex_array_shrink(struct flex_array *fa); - -#define flex_array_put_ptr(fa, nr, src, gfp) \ - flex_array_put(fa, nr, (void *)&(src), gfp) - -void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr); - -#endif /* _FLEX_ARRAY_H */ diff --git a/include/linux/poison.h b/include/linux/poison.h index 5046bad0c1c5..d6d980a681c7 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -83,9 +83,6 @@ #define MUTEX_DEBUG_FREE 0x22 #define MUTEX_POISON_WW_CTX ((void *) 0x500 + POISON_POINTER_DELTA) -/********** lib/flex_array.c **********/ -#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */ - /********** security/ **********/ #define KEY_DESTROY 0xbd diff --git a/lib/Makefile b/lib/Makefile index b798b41d01ae..4e066120a0d6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -35,7 +35,7 @@ obj-y += lockref.o obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ - gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ + gcd.o lcm.o list_sort.o uuid.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o rhashtable.o reciprocal_div.o \ once.o refcount.o usercopy.o errseq.o bucket_locks.o \ diff --git a/lib/flex_array.c b/lib/flex_array.c deleted file mode 100644 index 2eed22fa507c..000000000000 --- a/lib/flex_array.c +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Flexible array managed in PAGE_SIZE parts - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright IBM Corporation, 2009 - * - * Author: Dave Hansen - */ - -#include -#include -#include -#include -#include - -struct flex_array_part { - char elements[FLEX_ARRAY_PART_SIZE]; -}; - -/* - * If a user requests an allocation which is small - * enough, we may simply use the space in the - * flex_array->parts[] array to store the user - * data. - */ -static inline int elements_fit_in_base(struct flex_array *fa) -{ - int data_size = fa->element_size * fa->total_nr_elements; - if (data_size <= FLEX_ARRAY_BASE_BYTES_LEFT) - return 1; - return 0; -} - -/** - * flex_array_alloc - allocate a new flexible array - * @element_size: the size of individual elements in the array - * @total: total number of elements that this should hold - * @flags: page allocation flags to use for base array - * - * Note: all locking must be provided by the caller. - * - * @total is used to size internal structures. If the user ever - * accesses any array indexes >=@total, it will produce errors. - * - * The maximum number of elements is defined as: the number of - * elements that can be stored in a page times the number of - * page pointers that we can fit in the base structure or (using - * integer math): - * - * (PAGE_SIZE/element_size) * (PAGE_SIZE-8)/sizeof(void *) - * - * Here's a table showing example capacities. Note that the maximum - * index that the get/put() functions is just nr_objects-1. This - * basically means that you get 4MB of storage on 32-bit and 2MB on - * 64-bit. - * - * - * Element size | Objects | Objects | - * PAGE_SIZE=4k | 32-bit | 64-bit | - * ---------------------------------| - * 1 bytes | 4177920 | 2088960 | - * 2 bytes | 2088960 | 1044480 | - * 3 bytes | 1392300 | 696150 | - * 4 bytes | 1044480 | 522240 | - * 32 bytes | 130560 | 65408 | - * 33 bytes | 126480 | 63240 | - * 2048 bytes | 2040 | 1020 | - * 2049 bytes | 1020 | 510 | - * void * | 1044480 | 261120 | - * - * Since 64-bit pointers are twice the size, we lose half the - * capacity in the base structure. Also note that no effort is made - * to efficiently pack objects across page boundaries. - */ -struct flex_array *flex_array_alloc(int element_size, unsigned int total, - gfp_t flags) -{ - struct flex_array *ret; - int elems_per_part = 0; - int max_size = 0; - struct reciprocal_value reciprocal_elems = { 0 }; - - if (element_size) { - elems_per_part = FLEX_ARRAY_ELEMENTS_PER_PART(element_size); - reciprocal_elems = reciprocal_value(elems_per_part); - max_size = FLEX_ARRAY_NR_BASE_PTRS * elems_per_part; - } - - /* max_size will end up 0 if element_size > PAGE_SIZE */ - if (total > max_size) - return NULL; - ret = kzalloc(sizeof(struct flex_array), flags); - if (!ret) - return NULL; - ret->element_size = element_size; - ret->total_nr_elements = total; - ret->elems_per_part = elems_per_part; - ret->reciprocal_elems = reciprocal_elems; - if (elements_fit_in_base(ret) && !(flags & __GFP_ZERO)) - memset(&ret->parts[0], FLEX_ARRAY_FREE, - FLEX_ARRAY_BASE_BYTES_LEFT); - return ret; -} -EXPORT_SYMBOL(flex_array_alloc); - -static int fa_element_to_part_nr(struct flex_array *fa, - unsigned int element_nr) -{ - /* - * if element_size == 0 we don't get here, so we never touch - * the zeroed fa->reciprocal_elems, which would yield invalid - * results - */ - return reciprocal_divide(element_nr, fa->reciprocal_elems); -} - -/** - * flex_array_free_parts - just free the second-level pages - * @fa: the flex array from which to free parts - * - * This is to be used in cases where the base 'struct flex_array' - * has been statically allocated and should not be free. - */ -void flex_array_free_parts(struct flex_array *fa) -{ - int part_nr; - - if (elements_fit_in_base(fa)) - return; - for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) - kfree(fa->parts[part_nr]); -} -EXPORT_SYMBOL(flex_array_free_parts); - -void flex_array_free(struct flex_array *fa) -{ - flex_array_free_parts(fa); - kfree(fa); -} -EXPORT_SYMBOL(flex_array_free); - -static unsigned int index_inside_part(struct flex_array *fa, - unsigned int element_nr, - unsigned int part_nr) -{ - unsigned int part_offset; - - part_offset = element_nr - part_nr * fa->elems_per_part; - return part_offset * fa->element_size; -} - -static struct flex_array_part * -__fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) -{ - struct flex_array_part *part = fa->parts[part_nr]; - if (!part) { - part = kmalloc(sizeof(struct flex_array_part), flags); - if (!part) - return NULL; - if (!(flags & __GFP_ZERO)) - memset(part, FLEX_ARRAY_FREE, - sizeof(struct flex_array_part)); - fa->parts[part_nr] = part; - } - return part; -} - -/** - * flex_array_put - copy data into the array at @element_nr - * @fa: the flex array to copy data into - * @element_nr: index of the position in which to insert - * the new element. - * @src: address of data to copy into the array - * @flags: page allocation flags to use for array expansion - * - * - * Note that this *copies* the contents of @src into - * the array. If you are trying to store an array of - * pointers, make sure to pass in &ptr instead of ptr. - * You may instead wish to use the flex_array_put_ptr() - * helper function. - * - * Locking must be provided by the caller. - */ -int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, - gfp_t flags) -{ - int part_nr = 0; - struct flex_array_part *part; - void *dst; - - if (element_nr >= fa->total_nr_elements) - return -ENOSPC; - if (!fa->element_size) - return 0; - if (elements_fit_in_base(fa)) - part = (struct flex_array_part *)&fa->parts[0]; - else { - part_nr = fa_element_to_part_nr(fa, element_nr); - part = __fa_get_part(fa, part_nr, flags); - if (!part) - return -ENOMEM; - } - dst = &part->elements[index_inside_part(fa, element_nr, part_nr)]; - memcpy(dst, src, fa->element_size); - return 0; -} -EXPORT_SYMBOL(flex_array_put); - -/** - * flex_array_clear - clear element in array at @element_nr - * @fa: the flex array of the element. - * @element_nr: index of the position to clear. - * - * Locking must be provided by the caller. - */ -int flex_array_clear(struct flex_array *fa, unsigned int element_nr) -{ - int part_nr = 0; - struct flex_array_part *part; - void *dst; - - if (element_nr >= fa->total_nr_elements) - return -ENOSPC; - if (!fa->element_size) - return 0; - if (elements_fit_in_base(fa)) - part = (struct flex_array_part *)&fa->parts[0]; - else { - part_nr = fa_element_to_part_nr(fa, element_nr); - part = fa->parts[part_nr]; - if (!part) - return -EINVAL; - } - dst = &part->elements[index_inside_part(fa, element_nr, part_nr)]; - memset(dst, FLEX_ARRAY_FREE, fa->element_size); - return 0; -} -EXPORT_SYMBOL(flex_array_clear); - -/** - * flex_array_prealloc - guarantee that array space exists - * @fa: the flex array for which to preallocate parts - * @start: index of first array element for which space is allocated - * @nr_elements: number of elements for which space is allocated - * @flags: page allocation flags - * - * This will guarantee that no future calls to flex_array_put() - * will allocate memory. It can be used if you are expecting to - * be holding a lock or in some atomic context while writing - * data into the array. - * - * Locking must be provided by the caller. - */ -int flex_array_prealloc(struct flex_array *fa, unsigned int start, - unsigned int nr_elements, gfp_t flags) -{ - int start_part; - int end_part; - int part_nr; - unsigned int end; - struct flex_array_part *part; - - if (!start && !nr_elements) - return 0; - if (start >= fa->total_nr_elements) - return -ENOSPC; - if (!nr_elements) - return 0; - - end = start + nr_elements - 1; - - if (end >= fa->total_nr_elements) - return -ENOSPC; - if (!fa->element_size) - return 0; - if (elements_fit_in_base(fa)) - return 0; - start_part = fa_element_to_part_nr(fa, start); - end_part = fa_element_to_part_nr(fa, end); - for (part_nr = start_part; part_nr <= end_part; part_nr++) { - part = __fa_get_part(fa, part_nr, flags); - if (!part) - return -ENOMEM; - } - return 0; -} -EXPORT_SYMBOL(flex_array_prealloc); - -/** - * flex_array_get - pull data back out of the array - * @fa: the flex array from which to extract data - * @element_nr: index of the element to fetch from the array - * - * Returns a pointer to the data at index @element_nr. Note - * that this is a copy of the data that was passed in. If you - * are using this to store pointers, you'll get back &ptr. You - * may instead wish to use the flex_array_get_ptr helper. - * - * Locking must be provided by the caller. - */ -void *flex_array_get(struct flex_array *fa, unsigned int element_nr) -{ - int part_nr = 0; - struct flex_array_part *part; - - if (!fa->element_size) - return NULL; - if (element_nr >= fa->total_nr_elements) - return NULL; - if (elements_fit_in_base(fa)) - part = (struct flex_array_part *)&fa->parts[0]; - else { - part_nr = fa_element_to_part_nr(fa, element_nr); - part = fa->parts[part_nr]; - if (!part) - return NULL; - } - return &part->elements[index_inside_part(fa, element_nr, part_nr)]; -} -EXPORT_SYMBOL(flex_array_get); - -/** - * flex_array_get_ptr - pull a ptr back out of the array - * @fa: the flex array from which to extract data - * @element_nr: index of the element to fetch from the array - * - * Returns the pointer placed in the flex array at element_nr using - * flex_array_put_ptr(). This function should not be called if the - * element in question was not set using the _put_ptr() helper. - */ -void *flex_array_get_ptr(struct flex_array *fa, unsigned int element_nr) -{ - void **tmp; - - tmp = flex_array_get(fa, element_nr); - if (!tmp) - return NULL; - - return *tmp; -} -EXPORT_SYMBOL(flex_array_get_ptr); - -static int part_is_free(struct flex_array_part *part) -{ - int i; - - for (i = 0; i < sizeof(struct flex_array_part); i++) - if (part->elements[i] != FLEX_ARRAY_FREE) - return 0; - return 1; -} - -/** - * flex_array_shrink - free unused second-level pages - * @fa: the flex array to shrink - * - * Frees all second-level pages that consist solely of unused - * elements. Returns the number of pages freed. - * - * Locking must be provided by the caller. - */ -int flex_array_shrink(struct flex_array *fa) -{ - struct flex_array_part *part; - int part_nr; - int ret = 0; - - if (!fa->total_nr_elements || !fa->element_size) - return 0; - if (elements_fit_in_base(fa)) - return ret; - for (part_nr = 0; part_nr < FLEX_ARRAY_NR_BASE_PTRS; part_nr++) { - part = fa->parts[part_nr]; - if (!part) - continue; - if (part_is_free(part)) { - fa->parts[part_nr] = NULL; - kfree(part); - ret++; - } - } - return ret; -} -EXPORT_SYMBOL(flex_array_shrink); diff --git a/tools/include/linux/poison.h b/tools/include/linux/poison.h index 9fdcd3eaac3b..d29725769107 100644 --- a/tools/include/linux/poison.h +++ b/tools/include/linux/poison.h @@ -87,9 +87,6 @@ #define MUTEX_DEBUG_INIT 0x11 #define MUTEX_DEBUG_FREE 0x22 -/********** lib/flex_array.c **********/ -#define FLEX_ARRAY_FREE 0x6c /* for use-after-free poisoning */ - /********** security/ **********/ #define KEY_DESTROY 0xbd -- cgit v1.3-7-g2ca7 From ef776a272b093fb52612e6d8f4b3e77f9bb49554 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 12 Mar 2019 10:23:06 -0700 Subject: bpf: Sync bpf.h to tools/ This patch sync the uapi bpf.h to tools/. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3c38ac9a92a7..983b25cb608d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2366,6 +2366,14 @@ union bpf_attr { * current value is ect (ECN capable). Works with IPv6 and IPv4. * Return * 1 if set, 0 if not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in TCP_LISTEN state. + * bpf_sk_release() is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2465,7 +2473,8 @@ union bpf_attr { FN(spin_unlock), \ FN(sk_fullsock), \ FN(tcp_sock), \ - FN(skb_ecn_set_ce), + FN(skb_ecn_set_ce), \ + FN(get_listener_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.3-7-g2ca7 From ea6eced00e4b28821804eee24e80d9528f48972a Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Thu, 14 Mar 2019 12:38:41 +0000 Subject: tools: bpf: synchronise BPF UAPI header with tools Synchronise the bpf.h header under tools, to report the latest fixes and additions to the documentation for the BPF helpers. Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Alexei Starovoitov --- tools/include/uapi/linux/bpf.h | 183 +++++++++++++++++++++++++++-------------- 1 file changed, 120 insertions(+), 63 deletions(-) (limited to 'tools/include') diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 983b25cb608d..929c8e537a14 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -502,16 +502,6 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) - * Description - * Push an element *value* in *map*. *flags* is one of: - * - * **BPF_EXIST** - * If the queue/stack is full, the oldest element is removed to - * make room for this. - * Return - * 0 on success, or a negative error in case of failure. - * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -1435,14 +1425,14 @@ union bpf_attr { * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_addr** contex. + * *skb*, but gets socket from **struct bpf_sock_addr** context. * Return * A 8-byte long non-decreasing number. * * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) * Description * Equivalent to bpf_get_socket_cookie() helper that accepts - * *skb*, but gets socket from **struct bpf_sock_ops** contex. + * *skb*, but gets socket from **struct bpf_sock_ops** context. * Return * A 8-byte long non-decreasing number. * @@ -2098,52 +2088,52 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * int bpf_rc_repeat(void *ctx) * Description * This helper is used in programs implementing IR decoding, to - * report a successfully decoded key press with *scancode*, - * *toggle* value in the given *protocol*. The scancode will be - * translated to a keycode using the rc keymap, and reported as - * an input key down event. After a period a key up event is - * generated. This period can be extended by calling either - * **bpf_rc_keydown**\ () again with the same values, or calling - * **bpf_rc_repeat**\ (). + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. * - * Some protocols include a toggle bit, in case the button was - * released and pressed again between consecutive scancodes. + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. * * The *ctx* should point to the lirc sample as passed into * the program. * - * The *protocol* is the decoded protocol number (see - * **enum rc_proto** for some predefined values). - * * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * - * int bpf_rc_repeat(void *ctx) + * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) * Description * This helper is used in programs implementing IR decoding, to - * report a successfully decoded repeat key message. This delays - * the generation of a key up event for previously generated - * key down event. + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). * - * Some IR protocols like NEC have a special IR message for - * repeating last button, for when a button is held down. + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. * * The *ctx* should point to the lirc sample as passed into * the program. * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * * This helper is only available is the kernel was compiled with * the **CONFIG_BPF_LIRC_MODE2** configuration option set to * "**y**". * Return * 0 * - * uint64_t bpf_skb_cgroup_id(struct sk_buff *skb) + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) * Description * Return the cgroup v2 id of the socket associated with the *skb*. * This is roughly similar to the **bpf_get_cgroup_classid**\ () @@ -2159,30 +2149,12 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) - * Description - * Return id of cgroup v2 that is ancestor of cgroup associated - * with the *skb* at the *ancestor_level*. The root cgroup is at - * *ancestor_level* zero and each step down the hierarchy - * increments the level. If *ancestor_level* == level of cgroup - * associated with *skb*, then return value will be same as that - * of **bpf_skb_cgroup_id**\ (). - * - * The helper is useful to implement policies based on cgroups - * that are upper in hierarchy than immediate cgroup associated - * with *skb*. - * - * The format of returned id and helper limitations are same as in - * **bpf_skb_cgroup_id**\ (). - * Return - * The id is returned or 0 in case the id could not be retrieved. - * * u64 bpf_get_current_cgroup_id(void) * Return * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. * - * void* get_local_storage(void *map, u64 flags) + * void *bpf_get_local_storage(void *map, u64 flags) * Description * Get the pointer to the local storage area. * The type and the size of the local storage is defined @@ -2209,6 +2181,24 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) * Description * Look for TCP socket matching *tuple*, optionally in a child @@ -2289,6 +2279,16 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_map_pop_elem(struct bpf_map *map, void *value) * Description * Pop an element from *map*. @@ -2343,36 +2343,93 @@ union bpf_attr { * Return * 0 * + * int bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * int bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_sock** pointer such - * that all the fields in bpf_sock can be accessed. + * that all the fields in this **bpf_sock** can be accessed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. * * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) * Description * This helper gets a **struct bpf_tcp_sock** pointer from a * **struct bpf_sock** pointer. - * * Return - * A **struct bpf_tcp_sock** pointer on success, or NULL in + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. * * int bpf_skb_ecn_set_ce(struct sk_buf *skb) - * Description - * Sets ECN of IP header to ce (congestion encountered) if - * current value is ect (ECN capable). Works with IPv6 and IPv4. - * Return - * 1 if set, 0 if not set. + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. * * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) * Description - * Return a **struct bpf_sock** pointer in TCP_LISTEN state. - * bpf_sk_release() is unnecessary and not allowed. + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. * Return - * A **struct bpf_sock** pointer on success, or NULL in + * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ -- cgit v1.3-7-g2ca7