From 50069a5851323ba5def0e414a21e234345016870 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 31 May 2012 16:26:34 -0700 Subject: selftests: add mq_open_tests Add a directory to house POSIX message queue subsystem specific tests. Add first test which checks the operation of mq_open() under various corner conditions. Signed-off-by: Doug Ledford Cc: KOSAKI Motohiro Cc: Doug Ledford Cc: Joe Korty Cc: Amerigo Wang Cc: Serge E. Hallyn Cc: Jiri Slaby Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/mqueue/Makefile | 8 + tools/testing/selftests/mqueue/mq_open_tests.c | 492 +++++++++++++++++++++++++ 3 files changed, 501 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/mqueue/Makefile create mode 100644 tools/testing/selftests/mqueue/mq_open_tests.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 28bc57ee757c..14972017a43e 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = breakpoints vm +TARGETS = breakpoints mqueue vm all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/mqueue/Makefile b/tools/testing/selftests/mqueue/Makefile new file mode 100644 index 000000000000..bd74142a173f --- /dev/null +++ b/tools/testing/selftests/mqueue/Makefile @@ -0,0 +1,8 @@ +all: + gcc -O2 -lrt mq_open_tests.c -o mq_open_tests + +run_tests: + ./mq_open_tests /test1 + +clean: + rm -f mq_open_tests diff --git a/tools/testing/selftests/mqueue/mq_open_tests.c b/tools/testing/selftests/mqueue/mq_open_tests.c new file mode 100644 index 000000000000..711cc2923047 --- /dev/null +++ b/tools/testing/selftests/mqueue/mq_open_tests.c @@ -0,0 +1,492 @@ +/* + * This application is Copyright 2012 Red Hat, Inc. + * Doug Ledford + * + * mq_open_tests is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3. + * + * mq_open_tests is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * For the full text of the license, see . + * + * mq_open_tests.c + * Tests the various situations that should either succeed or fail to + * open a posix message queue and then reports whether or not they + * did as they were supposed to. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static char *usage = +"Usage:\n" +" %s path\n" +"\n" +" path Path name of the message queue to create\n" +"\n" +" Note: this program must be run as root in order to enable all tests\n" +"\n"; + +char *DEF_MSGS = "/proc/sys/fs/mqueue/msg_default"; +char *DEF_MSGSIZE = "/proc/sys/fs/mqueue/msgsize_default"; +char *MAX_MSGS = "/proc/sys/fs/mqueue/msg_max"; +char *MAX_MSGSIZE = "/proc/sys/fs/mqueue/msgsize_max"; + +int default_settings; +struct rlimit saved_limits, cur_limits; +int saved_def_msgs, saved_def_msgsize, saved_max_msgs, saved_max_msgsize; +int cur_def_msgs, cur_def_msgsize, cur_max_msgs, cur_max_msgsize; +FILE *def_msgs, *def_msgsize, *max_msgs, *max_msgsize; +char *queue_path; +mqd_t queue = -1; + +static inline void __set(FILE *stream, int value, char *err_msg); +void shutdown(int exit_val, char *err_cause, int line_no); +static inline int get(FILE *stream); +static inline void set(FILE *stream, int value); +static inline void getr(int type, struct rlimit *rlim); +static inline void setr(int type, struct rlimit *rlim); +void validate_current_settings(); +static inline void test_queue(struct mq_attr *attr, struct mq_attr *result); +static inline int test_queue_fail(struct mq_attr *attr, struct mq_attr *result); + +static inline void __set(FILE *stream, int value, char *err_msg) +{ + rewind(stream); + if (fprintf(stream, "%d", value) < 0) + perror(err_msg); +} + + +void shutdown(int exit_val, char *err_cause, int line_no) +{ + static int in_shutdown = 0; + + /* In case we get called recursively by a set() call below */ + if (in_shutdown++) + return; + + seteuid(0); + + if (queue != -1) + if (mq_close(queue)) + perror("mq_close() during shutdown"); + if (queue_path) + /* + * Be silent if this fails, if we cleaned up already it's + * expected to fail + */ + mq_unlink(queue_path); + if (default_settings) { + if (saved_def_msgs) + __set(def_msgs, saved_def_msgs, + "failed to restore saved_def_msgs"); + if (saved_def_msgsize) + __set(def_msgsize, saved_def_msgsize, + "failed to restore saved_def_msgsize"); + } + if (saved_max_msgs) + __set(max_msgs, saved_max_msgs, + "failed to restore saved_max_msgs"); + if (saved_max_msgsize) + __set(max_msgsize, saved_max_msgsize, + "failed to restore saved_max_msgsize"); + if (exit_val) + error(exit_val, errno, "%s at %d", err_cause, line_no); + exit(0); +} + +static inline int get(FILE *stream) +{ + int value; + rewind(stream); + if (fscanf(stream, "%d", &value) != 1) + shutdown(4, "Error reading /proc entry", __LINE__ - 1); + return value; +} + +static inline void set(FILE *stream, int value) +{ + int new_value; + + rewind(stream); + if (fprintf(stream, "%d", value) < 0) + return shutdown(5, "Failed writing to /proc file", + __LINE__ - 1); + new_value = get(stream); + if (new_value != value) + return shutdown(5, "We didn't get what we wrote to /proc back", + __LINE__ - 1); +} + +static inline void getr(int type, struct rlimit *rlim) +{ + if (getrlimit(type, rlim)) + shutdown(6, "getrlimit()", __LINE__ - 1); +} + +static inline void setr(int type, struct rlimit *rlim) +{ + if (setrlimit(type, rlim)) + shutdown(7, "setrlimit()", __LINE__ - 1); +} + +void validate_current_settings() +{ + int rlim_needed; + + if (cur_limits.rlim_cur < 4096) { + printf("Current rlimit value for POSIX message queue bytes is " + "unreasonably low,\nincreasing.\n\n"); + cur_limits.rlim_cur = 8192; + cur_limits.rlim_max = 16384; + setr(RLIMIT_MSGQUEUE, &cur_limits); + } + + if (default_settings) { + rlim_needed = (cur_def_msgs + 1) * (cur_def_msgsize + 1 + + 2 * sizeof(void *)); + if (rlim_needed > cur_limits.rlim_cur) { + printf("Temporarily lowering default queue parameters " + "to something that will work\n" + "with the current rlimit values.\n\n"); + set(def_msgs, 10); + cur_def_msgs = 10; + set(def_msgsize, 128); + cur_def_msgsize = 128; + } + } else { + rlim_needed = (cur_max_msgs + 1) * (cur_max_msgsize + 1 + + 2 * sizeof(void *)); + if (rlim_needed > cur_limits.rlim_cur) { + printf("Temporarily lowering maximum queue parameters " + "to something that will work\n" + "with the current rlimit values in case this is " + "a kernel that ties the default\n" + "queue parameters to the maximum queue " + "parameters.\n\n"); + set(max_msgs, 10); + cur_max_msgs = 10; + set(max_msgsize, 128); + cur_max_msgsize = 128; + } + } +} + +/* + * test_queue - Test opening a queue, shutdown if we fail. This should + * only be called in situations that should never fail. We clean up + * after ourselves and return the queue attributes in *result. + */ +static inline void test_queue(struct mq_attr *attr, struct mq_attr *result) +{ + int flags = O_RDWR | O_EXCL | O_CREAT; + int perms = DEFFILEMODE; + + if ((queue = mq_open(queue_path, flags, perms, attr)) == -1) + shutdown(1, "mq_open()", __LINE__); + if (mq_getattr(queue, result)) + shutdown(1, "mq_getattr()", __LINE__); + if (mq_close(queue)) + shutdown(1, "mq_close()", __LINE__); + queue = -1; + if (mq_unlink(queue_path)) + shutdown(1, "mq_unlink()", __LINE__); +} + +/* + * Same as test_queue above, but failure is not fatal. + * Returns: + * 0 - Failed to create a queue + * 1 - Created a queue, attributes in *result + */ +static inline int test_queue_fail(struct mq_attr *attr, struct mq_attr *result) +{ + int flags = O_RDWR | O_EXCL | O_CREAT; + int perms = DEFFILEMODE; + + if ((queue = mq_open(queue_path, flags, perms, attr)) == -1) + return 0; + if (mq_getattr(queue, result)) + shutdown(1, "mq_getattr()", __LINE__); + if (mq_close(queue)) + shutdown(1, "mq_close()", __LINE__); + queue = -1; + if (mq_unlink(queue_path)) + shutdown(1, "mq_unlink()", __LINE__); + return 1; +} + +int main(int argc, char *argv[]) +{ + struct mq_attr attr, result; + + if (argc != 2) { + fprintf(stderr, "Must pass a valid queue name\n\n"); + fprintf(stderr, usage, argv[0]); + exit(1); + } + + /* + * Although we can create a msg queue with a non-absolute path name, + * unlink will fail. So, if the name doesn't start with a /, add one + * when we save it. + */ + if (*argv[1] == '/') + queue_path = strdup(argv[1]); + else { + queue_path = malloc(strlen(argv[1]) + 2); + if (!queue_path) { + perror("malloc()"); + exit(1); + } + queue_path[0] = '/'; + queue_path[1] = 0; + strcat(queue_path, argv[1]); + } + + if (getuid() != 0) { + fprintf(stderr, "Not running as root, but almost all tests " + "require root in order to modify\nsystem settings. " + "Exiting.\n"); + exit(1); + } + + /* Find out what files there are for us to make tweaks in */ + def_msgs = fopen(DEF_MSGS, "r+"); + def_msgsize = fopen(DEF_MSGSIZE, "r+"); + max_msgs = fopen(MAX_MSGS, "r+"); + max_msgsize = fopen(MAX_MSGSIZE, "r+"); + + if (!max_msgs) + shutdown(2, "Failed to open msg_max", __LINE__); + if (!max_msgsize) + shutdown(2, "Failed to open msgsize_max", __LINE__); + if (def_msgs || def_msgsize) + default_settings = 1; + + /* Load up the current system values for everything we can */ + getr(RLIMIT_MSGQUEUE, &saved_limits); + cur_limits = saved_limits; + if (default_settings) { + saved_def_msgs = cur_def_msgs = get(def_msgs); + saved_def_msgsize = cur_def_msgsize = get(def_msgsize); + } + saved_max_msgs = cur_max_msgs = get(max_msgs); + saved_max_msgsize = cur_max_msgsize = get(max_msgsize); + + /* Tell the user our initial state */ + printf("\nInitial system state:\n"); + printf("\tUsing queue path:\t\t%s\n", queue_path); + printf("\tRLIMIT_MSGQUEUE(soft):\t\t%d\n", saved_limits.rlim_cur); + printf("\tRLIMIT_MSGQUEUE(hard):\t\t%d\n", saved_limits.rlim_max); + printf("\tMaximum Message Size:\t\t%d\n", saved_max_msgsize); + printf("\tMaximum Queue Size:\t\t%d\n", saved_max_msgs); + if (default_settings) { + printf("\tDefault Message Size:\t\t%d\n", saved_def_msgsize); + printf("\tDefault Queue Size:\t\t%d\n", saved_def_msgs); + } else { + printf("\tDefault Message Size:\t\tNot Supported\n"); + printf("\tDefault Queue Size:\t\tNot Supported\n"); + } + printf("\n"); + + validate_current_settings(); + + printf("Adjusted system state for testing:\n"); + printf("\tRLIMIT_MSGQUEUE(soft):\t\t%d\n", cur_limits.rlim_cur); + printf("\tRLIMIT_MSGQUEUE(hard):\t\t%d\n", cur_limits.rlim_max); + printf("\tMaximum Message Size:\t\t%d\n", cur_max_msgsize); + printf("\tMaximum Queue Size:\t\t%d\n", cur_max_msgs); + if (default_settings) { + printf("\tDefault Message Size:\t\t%d\n", cur_def_msgsize); + printf("\tDefault Queue Size:\t\t%d\n", cur_def_msgs); + } + + printf("\n\nTest series 1, behavior when no attr struct " + "passed to mq_open:\n"); + if (!default_settings) { + test_queue(NULL, &result); + printf("Given sane system settings, mq_open without an attr " + "struct succeeds:\tPASS\n"); + if (result.mq_maxmsg != cur_max_msgs || + result.mq_msgsize != cur_max_msgsize) { + printf("Kernel does not support setting the default " + "mq attributes,\nbut also doesn't tie the " + "defaults to the maximums:\t\t\tPASS\n"); + } else { + set(max_msgs, ++cur_max_msgs); + set(max_msgsize, ++cur_max_msgsize); + test_queue(NULL, &result); + if (result.mq_maxmsg == cur_max_msgs && + result.mq_msgsize == cur_max_msgsize) + printf("Kernel does not support setting the " + "default mq attributes and\n" + "also ties system wide defaults to " + "the system wide maximums:\t\t" + "FAIL\n"); + else + printf("Kernel does not support setting the " + "default mq attributes,\n" + "but also doesn't tie the defaults to " + "the maximums:\t\t\tPASS\n"); + } + } else { + printf("Kernel supports setting defaults separately from " + "maximums:\t\tPASS\n"); + /* + * While we are here, go ahead and test that the kernel + * properly follows the default settings + */ + test_queue(NULL, &result); + printf("Given sane values, mq_open without an attr struct " + "succeeds:\t\tPASS\n"); + if (result.mq_maxmsg != cur_def_msgs || + result.mq_msgsize != cur_def_msgsize) + printf("Kernel supports setting defaults, but does " + "not actually honor them:\tFAIL\n\n"); + else { + set(def_msgs, ++cur_def_msgs); + set(def_msgsize, ++cur_def_msgsize); + /* In case max was the same as the default */ + set(max_msgs, ++cur_max_msgs); + set(max_msgsize, ++cur_max_msgsize); + test_queue(NULL, &result); + if (result.mq_maxmsg != cur_def_msgs || + result.mq_msgsize != cur_def_msgsize) + printf("Kernel supports setting defaults, but " + "does not actually honor them:\t" + "FAIL\n"); + else + printf("Kernel properly honors default setting " + "knobs:\t\t\t\tPASS\n"); + } + set(def_msgs, cur_max_msgs + 1); + cur_def_msgs = cur_max_msgs + 1; + set(def_msgsize, cur_max_msgsize + 1); + cur_def_msgsize = cur_max_msgsize + 1; + if (cur_def_msgs * (cur_def_msgsize + 2 * sizeof(void *)) >= + cur_limits.rlim_cur) { + cur_limits.rlim_cur = (cur_def_msgs + 2) * + (cur_def_msgsize + 2 * sizeof(void *)); + cur_limits.rlim_max = 2 * cur_limits.rlim_cur; + setr(RLIMIT_MSGQUEUE, &cur_limits); + } + if (test_queue_fail(NULL, &result)) { + if (result.mq_maxmsg == cur_max_msgs && + result.mq_msgsize == cur_max_msgsize) + printf("Kernel properly limits default values " + "to lesser of default/max:\t\tPASS\n"); + else + printf("Kernel does not properly set default " + "queue parameters when\ndefaults > " + "max:\t\t\t\t\t\t\t\tFAIL\n"); + } else + printf("Kernel fails to open mq because defaults are " + "greater than maximums:\tFAIL\n"); + set(def_msgs, --cur_def_msgs); + set(def_msgsize, --cur_def_msgsize); + cur_limits.rlim_cur = cur_limits.rlim_max = cur_def_msgs * + cur_def_msgsize; + setr(RLIMIT_MSGQUEUE, &cur_limits); + if (test_queue_fail(NULL, &result)) + printf("Kernel creates queue even though defaults " + "would exceed\nrlimit setting:" + "\t\t\t\t\t\t\t\tFAIL\n"); + else + printf("Kernel properly fails to create queue when " + "defaults would\nexceed rlimit:" + "\t\t\t\t\t\t\t\tPASS\n"); + } + + /* + * Test #2 - open with an attr struct that exceeds rlimit + */ + printf("\n\nTest series 2, behavior when attr struct is " + "passed to mq_open:\n"); + cur_max_msgs = 32; + cur_max_msgsize = cur_limits.rlim_max >> 4; + set(max_msgs, cur_max_msgs); + set(max_msgsize, cur_max_msgsize); + attr.mq_maxmsg = cur_max_msgs; + attr.mq_msgsize = cur_max_msgsize; + if (test_queue_fail(&attr, &result)) + printf("Queue open in excess of rlimit max when euid = 0 " + "succeeded:\t\tFAIL\n"); + else + printf("Queue open in excess of rlimit max when euid = 0 " + "failed:\t\tPASS\n"); + attr.mq_maxmsg = cur_max_msgs + 1; + attr.mq_msgsize = 10; + if (test_queue_fail(&attr, &result)) + printf("Queue open with mq_maxmsg > limit when euid = 0 " + "succeeded:\t\tPASS\n"); + else + printf("Queue open with mq_maxmsg > limit when euid = 0 " + "failed:\t\tFAIL\n"); + attr.mq_maxmsg = 1; + attr.mq_msgsize = cur_max_msgsize + 1; + if (test_queue_fail(&attr, &result)) + printf("Queue open with mq_msgsize > limit when euid = 0 " + "succeeded:\t\tPASS\n"); + else + printf("Queue open with mq_msgsize > limit when euid = 0 " + "failed:\t\tFAIL\n"); + attr.mq_maxmsg = 65536; + attr.mq_msgsize = 65536; + if (test_queue_fail(&attr, &result)) + printf("Queue open with total size > 2GB when euid = 0 " + "succeeded:\t\tFAIL\n"); + else + printf("Queue open with total size > 2GB when euid = 0 " + "failed:\t\t\tPASS\n"); + seteuid(99); + attr.mq_maxmsg = cur_max_msgs; + attr.mq_msgsize = cur_max_msgsize; + if (test_queue_fail(&attr, &result)) + printf("Queue open in excess of rlimit max when euid = 99 " + "succeeded:\t\tFAIL\n"); + else + printf("Queue open in excess of rlimit max when euid = 99 " + "failed:\t\tPASS\n"); + attr.mq_maxmsg = cur_max_msgs + 1; + attr.mq_msgsize = 10; + if (test_queue_fail(&attr, &result)) + printf("Queue open with mq_maxmsg > limit when euid = 99 " + "succeeded:\t\tFAIL\n"); + else + printf("Queue open with mq_maxmsg > limit when euid = 99 " + "failed:\t\tPASS\n"); + attr.mq_maxmsg = 1; + attr.mq_msgsize = cur_max_msgsize + 1; + if (test_queue_fail(&attr, &result)) + printf("Queue open with mq_msgsize > limit when euid = 99 " + "succeeded:\t\tFAIL\n"); + else + printf("Queue open with mq_msgsize > limit when euid = 99 " + "failed:\t\tPASS\n"); + attr.mq_maxmsg = 65536; + attr.mq_msgsize = 65536; + if (test_queue_fail(&attr, &result)) + printf("Queue open with total size > 2GB when euid = 99 " + "succeeded:\t\tFAIL\n"); + else + printf("Queue open with total size > 2GB when euid = 99 " + "failed:\t\t\tPASS\n"); + + shutdown(0,"",0); +} -- cgit v1.2.3-59-g8ed1b From 7820b0715b6fb1378fab41b27fb7aa3950852cb7 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 31 May 2012 16:26:37 -0700 Subject: tools/selftests: add mq_perf_tests Add the mq_perf_tests tool I used when creating my mq performance patch. Also add a local .gitignore to keep the binaries from showing up in git status output. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Doug Ledford Cc: Stephen Rothwell Cc: Manfred Spraul Cc: Frederic Weisbecker Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/mqueue/.gitignore | 2 + tools/testing/selftests/mqueue/Makefile | 4 +- tools/testing/selftests/mqueue/mq_perf_tests.c | 741 +++++++++++++++++++++++++ 3 files changed, 746 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/mqueue/.gitignore create mode 100644 tools/testing/selftests/mqueue/mq_perf_tests.c (limited to 'tools/testing') diff --git a/tools/testing/selftests/mqueue/.gitignore b/tools/testing/selftests/mqueue/.gitignore new file mode 100644 index 000000000000..d8d42377205a --- /dev/null +++ b/tools/testing/selftests/mqueue/.gitignore @@ -0,0 +1,2 @@ +mq_open_tests +mq_perf_tests diff --git a/tools/testing/selftests/mqueue/Makefile b/tools/testing/selftests/mqueue/Makefile index bd74142a173f..54c0aad2b47c 100644 --- a/tools/testing/selftests/mqueue/Makefile +++ b/tools/testing/selftests/mqueue/Makefile @@ -1,8 +1,10 @@ all: gcc -O2 -lrt mq_open_tests.c -o mq_open_tests + gcc -O2 -lrt -lpthread -lpopt -o mq_perf_tests mq_perf_tests.c run_tests: ./mq_open_tests /test1 + ./mq_perf_tests clean: - rm -f mq_open_tests + rm -f mq_open_tests mq_perf_tests diff --git a/tools/testing/selftests/mqueue/mq_perf_tests.c b/tools/testing/selftests/mqueue/mq_perf_tests.c new file mode 100644 index 000000000000..2fadd4b97045 --- /dev/null +++ b/tools/testing/selftests/mqueue/mq_perf_tests.c @@ -0,0 +1,741 @@ +/* + * This application is Copyright 2012 Red Hat, Inc. + * Doug Ledford + * + * mq_perf_tests is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3. + * + * mq_perf_tests is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * For the full text of the license, see . + * + * mq_perf_tests.c + * Tests various types of message queue workloads, concentrating on those + * situations that invole large message sizes, large message queue depths, + * or both, and reports back useful metrics about kernel message queue + * performance. + * + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static char *usage = +"Usage:\n" +" %s [-c #[,#..] -f] path\n" +"\n" +" -c # Skip most tests and go straight to a high queue depth test\n" +" and then run that test continuously (useful for running at\n" +" the same time as some other workload to see how much the\n" +" cache thrashing caused by adding messages to a very deep\n" +" queue impacts the performance of other programs). The number\n" +" indicates which CPU core we should bind the process to during\n" +" the run. If you have more than one physical CPU, then you\n" +" will need one copy per physical CPU package, and you should\n" +" specify the CPU cores to pin ourself to via a comma separated\n" +" list of CPU values.\n" +" -f Only usable with continuous mode. Pin ourself to the CPUs\n" +" as requested, then instead of looping doing a high mq\n" +" workload, just busy loop. This will allow us to lock up a\n" +" single CPU just like we normally would, but without actually\n" +" thrashing the CPU cache. This is to make it easier to get\n" +" comparable numbers from some other workload running on the\n" +" other CPUs. One set of numbers with # CPUs locked up running\n" +" an mq workload, and another set of numbers with those same\n" +" CPUs locked away from the test workload, but not doing\n" +" anything to trash the cache like the mq workload might.\n" +" path Path name of the message queue to create\n" +"\n" +" Note: this program must be run as root in order to enable all tests\n" +"\n"; + +char *MAX_MSGS = "/proc/sys/fs/mqueue/msg_max"; +char *MAX_MSGSIZE = "/proc/sys/fs/mqueue/msgsize_max"; + +#define min(a, b) ((a) < (b) ? (a) : (b)) +#define MAX_CPUS 64 +char *cpu_option_string; +int cpus_to_pin[MAX_CPUS]; +int num_cpus_to_pin; +pthread_t cpu_threads[MAX_CPUS]; +pthread_t main_thread; +cpu_set_t *cpu_set; +int cpu_set_size; +int cpus_online; + +#define MSG_SIZE 16 +#define TEST1_LOOPS 10000000 +#define TEST2_LOOPS 100000 +int continuous_mode; +int continuous_mode_fake; + +struct rlimit saved_limits, cur_limits; +int saved_max_msgs, saved_max_msgsize; +int cur_max_msgs, cur_max_msgsize; +FILE *max_msgs, *max_msgsize; +int cur_nice; +char *queue_path = "/mq_perf_tests"; +mqd_t queue = -1; +struct mq_attr result; +int mq_prio_max; + +const struct poptOption options[] = { + { + .longName = "continuous", + .shortName = 'c', + .argInfo = POPT_ARG_STRING, + .arg = &cpu_option_string, + .val = 'c', + .descrip = "Run continuous tests at a high queue depth in " + "order to test the effects of cache thrashing on " + "other tasks on the system. This test is intended " + "to be run on one core of each physical CPU while " + "some other CPU intensive task is run on all the other " + "cores of that same physical CPU and the other task " + "is timed. It is assumed that the process of adding " + "messages to the message queue in a tight loop will " + "impact that other task to some degree. Once the " + "tests are performed in this way, you should then " + "re-run the tests using fake mode in order to check " + "the difference in time required to perform the CPU " + "intensive task", + .argDescrip = "cpu[,cpu]", + }, + { + .longName = "fake", + .shortName = 'f', + .argInfo = POPT_ARG_NONE, + .arg = &continuous_mode_fake, + .val = 0, + .descrip = "Tie up the CPUs that we would normally tie up in" + "continuous mode, but don't actually do any mq stuff, " + "just keep the CPU busy so it can't be used to process " + "system level tasks as this would free up resources on " + "the other CPU cores and skew the comparison between " + "the no-mqueue work and mqueue work tests", + .argDescrip = NULL, + }, + { + .longName = "path", + .shortName = 'p', + .argInfo = POPT_ARG_STRING | POPT_ARGFLAG_SHOW_DEFAULT, + .arg = &queue_path, + .val = 'p', + .descrip = "The name of the path to use in the mqueue " + "filesystem for our tests", + .argDescrip = "pathname", + }, + POPT_AUTOHELP + POPT_TABLEEND +}; + +static inline void __set(FILE *stream, int value, char *err_msg); +void shutdown(int exit_val, char *err_cause, int line_no); +void sig_action_SIGUSR1(int signum, siginfo_t *info, void *context); +void sig_action(int signum, siginfo_t *info, void *context); +static inline int get(FILE *stream); +static inline void set(FILE *stream, int value); +static inline int try_set(FILE *stream, int value); +static inline void getr(int type, struct rlimit *rlim); +static inline void setr(int type, struct rlimit *rlim); +static inline void open_queue(struct mq_attr *attr); +void increase_limits(void); + +static inline void __set(FILE *stream, int value, char *err_msg) +{ + rewind(stream); + if (fprintf(stream, "%d", value) < 0) + perror(err_msg); +} + + +void shutdown(int exit_val, char *err_cause, int line_no) +{ + static int in_shutdown = 0; + int errno_at_shutdown = errno; + int i; + + /* In case we get called by multiple threads or from an sighandler */ + if (in_shutdown++) + return; + + for (i = 0; i < num_cpus_to_pin; i++) + if (cpu_threads[i]) { + pthread_kill(cpu_threads[i], SIGUSR1); + pthread_join(cpu_threads[i], NULL); + } + + if (queue != -1) + if (mq_close(queue)) + perror("mq_close() during shutdown"); + if (queue_path) + /* + * Be silent if this fails, if we cleaned up already it's + * expected to fail + */ + mq_unlink(queue_path); + if (saved_max_msgs) + __set(max_msgs, saved_max_msgs, + "failed to restore saved_max_msgs"); + if (saved_max_msgsize) + __set(max_msgsize, saved_max_msgsize, + "failed to restore saved_max_msgsize"); + if (exit_val) + error(exit_val, errno_at_shutdown, "%s at %d", + err_cause, line_no); + exit(0); +} + +void sig_action_SIGUSR1(int signum, siginfo_t *info, void *context) +{ + if (pthread_self() != main_thread) + pthread_exit(0); + else { + fprintf(stderr, "Caught signal %d in SIGUSR1 handler, " + "exiting\n", signum); + shutdown(0, "", 0); + fprintf(stderr, "\n\nReturned from shutdown?!?!\n\n"); + exit(0); + } +} + +void sig_action(int signum, siginfo_t *info, void *context) +{ + if (pthread_self() != main_thread) + pthread_kill(main_thread, signum); + else { + fprintf(stderr, "Caught signal %d, exiting\n", signum); + shutdown(0, "", 0); + fprintf(stderr, "\n\nReturned from shutdown?!?!\n\n"); + exit(0); + } +} + +static inline int get(FILE *stream) +{ + int value; + rewind(stream); + if (fscanf(stream, "%d", &value) != 1) + shutdown(4, "Error reading /proc entry", __LINE__); + return value; +} + +static inline void set(FILE *stream, int value) +{ + int new_value; + + rewind(stream); + if (fprintf(stream, "%d", value) < 0) + return shutdown(5, "Failed writing to /proc file", __LINE__); + new_value = get(stream); + if (new_value != value) + return shutdown(5, "We didn't get what we wrote to /proc back", + __LINE__); +} + +static inline int try_set(FILE *stream, int value) +{ + int new_value; + + rewind(stream); + fprintf(stream, "%d", value); + new_value = get(stream); + return new_value == value; +} + +static inline void getr(int type, struct rlimit *rlim) +{ + if (getrlimit(type, rlim)) + shutdown(6, "getrlimit()", __LINE__); +} + +static inline void setr(int type, struct rlimit *rlim) +{ + if (setrlimit(type, rlim)) + shutdown(7, "setrlimit()", __LINE__); +} + +/** + * open_queue - open the global queue for testing + * @attr - An attr struct specifying the desired queue traits + * @result - An attr struct that lists the actual traits the queue has + * + * This open is not allowed to fail, failure will result in an orderly + * shutdown of the program. The global queue_path is used to set what + * queue to open, the queue descriptor is saved in the global queue + * variable. + */ +static inline void open_queue(struct mq_attr *attr) +{ + int flags = O_RDWR | O_EXCL | O_CREAT | O_NONBLOCK; + int perms = DEFFILEMODE; + + queue = mq_open(queue_path, flags, perms, attr); + if (queue == -1) + shutdown(1, "mq_open()", __LINE__); + if (mq_getattr(queue, &result)) + shutdown(1, "mq_getattr()", __LINE__); + printf("\n\tQueue %s created:\n", queue_path); + printf("\t\tmq_flags:\t\t\t%s\n", result.mq_flags & O_NONBLOCK ? + "O_NONBLOCK" : "(null)"); + printf("\t\tmq_maxmsg:\t\t\t%d\n", result.mq_maxmsg); + printf("\t\tmq_msgsize:\t\t\t%d\n", result.mq_msgsize); + printf("\t\tmq_curmsgs:\t\t\t%d\n", result.mq_curmsgs); +} + +void *fake_cont_thread(void *arg) +{ + int i; + + for (i = 0; i < num_cpus_to_pin; i++) + if (cpu_threads[i] == pthread_self()) + break; + printf("\tStarted fake continuous mode thread %d on CPU %d\n", i, + cpus_to_pin[i]); + while (1) + ; +} + +void *cont_thread(void *arg) +{ + char buff[MSG_SIZE]; + int i, priority; + + for (i = 0; i < num_cpus_to_pin; i++) + if (cpu_threads[i] == pthread_self()) + break; + printf("\tStarted continuous mode thread %d on CPU %d\n", i, + cpus_to_pin[i]); + while (1) { + while (mq_send(queue, buff, sizeof(buff), 0) == 0) + ; + mq_receive(queue, buff, sizeof(buff), &priority); + } +} + +#define drain_queue() \ + while (mq_receive(queue, buff, MSG_SIZE, &prio_in) == MSG_SIZE) + +#define do_untimed_send() \ + do { \ + if (mq_send(queue, buff, MSG_SIZE, prio_out)) \ + shutdown(3, "Test send failure", __LINE__); \ + } while (0) + +#define do_send_recv() \ + do { \ + clock_gettime(clock, &start); \ + if (mq_send(queue, buff, MSG_SIZE, prio_out)) \ + shutdown(3, "Test send failure", __LINE__); \ + clock_gettime(clock, &middle); \ + if (mq_receive(queue, buff, MSG_SIZE, &prio_in) != MSG_SIZE) \ + shutdown(3, "Test receive failure", __LINE__); \ + clock_gettime(clock, &end); \ + nsec = ((middle.tv_sec - start.tv_sec) * 1000000000) + \ + (middle.tv_nsec - start.tv_nsec); \ + send_total.tv_nsec += nsec; \ + if (send_total.tv_nsec >= 1000000000) { \ + send_total.tv_sec++; \ + send_total.tv_nsec -= 1000000000; \ + } \ + nsec = ((end.tv_sec - middle.tv_sec) * 1000000000) + \ + (end.tv_nsec - middle.tv_nsec); \ + recv_total.tv_nsec += nsec; \ + if (recv_total.tv_nsec >= 1000000000) { \ + recv_total.tv_sec++; \ + recv_total.tv_nsec -= 1000000000; \ + } \ + } while (0) + +struct test { + char *desc; + void (*func)(int *); +}; + +void const_prio(int *prio) +{ + return; +} + +void inc_prio(int *prio) +{ + if (++*prio == mq_prio_max) + *prio = 0; +} + +void dec_prio(int *prio) +{ + if (--*prio < 0) + *prio = mq_prio_max - 1; +} + +void random_prio(int *prio) +{ + *prio = random() % mq_prio_max; +} + +struct test test2[] = { + {"\n\tTest #2a: Time send/recv message, queue full, constant prio\n", + const_prio}, + {"\n\tTest #2b: Time send/recv message, queue full, increasing prio\n", + inc_prio}, + {"\n\tTest #2c: Time send/recv message, queue full, decreasing prio\n", + dec_prio}, + {"\n\tTest #2d: Time send/recv message, queue full, random prio\n", + random_prio}, + {NULL, NULL} +}; + +/** + * Tests to perform (all done with MSG_SIZE messages): + * + * 1) Time to add/remove message with 0 messages on queue + * 1a) with constant prio + * 2) Time to add/remove message when queue close to capacity: + * 2a) with constant prio + * 2b) with increasing prio + * 2c) with decreasing prio + * 2d) with random prio + * 3) Test limits of priorities honored (double check _SC_MQ_PRIO_MAX) + */ +void *perf_test_thread(void *arg) +{ + char buff[MSG_SIZE]; + int prio_out, prio_in; + int i; + clockid_t clock; + pthread_t *t; + struct timespec res, start, middle, end, send_total, recv_total; + unsigned long long nsec; + struct test *cur_test; + + t = &cpu_threads[0]; + printf("\n\tStarted mqueue performance test thread on CPU %d\n", + cpus_to_pin[0]); + mq_prio_max = sysconf(_SC_MQ_PRIO_MAX); + if (mq_prio_max == -1) + shutdown(2, "sysconf(_SC_MQ_PRIO_MAX)", __LINE__); + if (pthread_getcpuclockid(cpu_threads[0], &clock) != 0) + shutdown(2, "pthread_getcpuclockid", __LINE__); + + if (clock_getres(clock, &res)) + shutdown(2, "clock_getres()", __LINE__); + + printf("\t\tMax priorities:\t\t\t%d\n", mq_prio_max); + printf("\t\tClock resolution:\t\t%d nsec%s\n", res.tv_nsec, + res.tv_nsec > 1 ? "s" : ""); + + + + printf("\n\tTest #1: Time send/recv message, queue empty\n"); + printf("\t\t(%d iterations)\n", TEST1_LOOPS); + prio_out = 0; + send_total.tv_sec = 0; + send_total.tv_nsec = 0; + recv_total.tv_sec = 0; + recv_total.tv_nsec = 0; + for (i = 0; i < TEST1_LOOPS; i++) + do_send_recv(); + printf("\t\tSend msg:\t\t\t%d.%ds total time\n", + send_total.tv_sec, send_total.tv_nsec); + nsec = ((unsigned long long)send_total.tv_sec * 1000000000 + + send_total.tv_nsec) / TEST1_LOOPS; + printf("\t\t\t\t\t\t%d nsec/msg\n", nsec); + printf("\t\tRecv msg:\t\t\t%d.%ds total time\n", + recv_total.tv_sec, recv_total.tv_nsec); + nsec = ((unsigned long long)recv_total.tv_sec * 1000000000 + + recv_total.tv_nsec) / TEST1_LOOPS; + printf("\t\t\t\t\t\t%d nsec/msg\n", nsec); + + + for (cur_test = test2; cur_test->desc != NULL; cur_test++) { + printf(cur_test->desc); + printf("\t\t(%d iterations)\n", TEST2_LOOPS); + prio_out = 0; + send_total.tv_sec = 0; + send_total.tv_nsec = 0; + recv_total.tv_sec = 0; + recv_total.tv_nsec = 0; + printf("\t\tFilling queue..."); + fflush(stdout); + clock_gettime(clock, &start); + for (i = 0; i < result.mq_maxmsg - 1; i++) { + do_untimed_send(); + cur_test->func(&prio_out); + } + clock_gettime(clock, &end); + nsec = ((unsigned long long)(end.tv_sec - start.tv_sec) * + 1000000000) + (end.tv_nsec - start.tv_nsec); + printf("done.\t\t%lld.%llds\n", nsec / 1000000000, + nsec % 1000000000); + printf("\t\tTesting..."); + fflush(stdout); + for (i = 0; i < TEST2_LOOPS; i++) { + do_send_recv(); + cur_test->func(&prio_out); + } + printf("done.\n"); + printf("\t\tSend msg:\t\t\t%d.%ds total time\n", + send_total.tv_sec, send_total.tv_nsec); + nsec = ((unsigned long long)send_total.tv_sec * 1000000000 + + send_total.tv_nsec) / TEST2_LOOPS; + printf("\t\t\t\t\t\t%d nsec/msg\n", nsec); + printf("\t\tRecv msg:\t\t\t%d.%ds total time\n", + recv_total.tv_sec, recv_total.tv_nsec); + nsec = ((unsigned long long)recv_total.tv_sec * 1000000000 + + recv_total.tv_nsec) / TEST2_LOOPS; + printf("\t\t\t\t\t\t%d nsec/msg\n", nsec); + printf("\t\tDraining queue..."); + fflush(stdout); + clock_gettime(clock, &start); + drain_queue(); + clock_gettime(clock, &end); + nsec = ((unsigned long long)(end.tv_sec - start.tv_sec) * + 1000000000) + (end.tv_nsec - start.tv_nsec); + printf("done.\t\t%lld.%llds\n", nsec / 1000000000, + nsec % 1000000000); + } + return 0; +} + +void increase_limits(void) +{ + cur_limits.rlim_cur = RLIM_INFINITY; + cur_limits.rlim_max = RLIM_INFINITY; + setr(RLIMIT_MSGQUEUE, &cur_limits); + while (try_set(max_msgs, cur_max_msgs += 10)) + ; + cur_max_msgs = get(max_msgs); + while (try_set(max_msgsize, cur_max_msgsize += 1024)) + ; + cur_max_msgsize = get(max_msgsize); + if (setpriority(PRIO_PROCESS, 0, -20) != 0) + shutdown(2, "setpriority()", __LINE__); + cur_nice = -20; +} + +int main(int argc, char *argv[]) +{ + struct mq_attr attr; + char *option, *next_option; + int i, cpu; + struct sigaction sa; + poptContext popt_context; + char rc; + void *retval; + + main_thread = pthread_self(); + num_cpus_to_pin = 0; + + if (sysconf(_SC_NPROCESSORS_ONLN) == -1) { + perror("sysconf(_SC_NPROCESSORS_ONLN)"); + exit(1); + } + cpus_online = min(MAX_CPUS, sysconf(_SC_NPROCESSORS_ONLN)); + cpu_set = CPU_ALLOC(cpus_online); + if (cpu_set == NULL) { + perror("CPU_ALLOC()"); + exit(1); + } + cpu_set_size = CPU_ALLOC_SIZE(cpus_online); + CPU_ZERO_S(cpu_set_size, cpu_set); + + popt_context = poptGetContext(NULL, argc, (const char **)argv, + options, 0); + + while ((rc = poptGetNextOpt(popt_context)) > 0) { + switch (rc) { + case 'c': + continuous_mode = 1; + option = cpu_option_string; + do { + next_option = strchr(option, ','); + if (next_option) + *next_option = '\0'; + cpu = atoi(option); + if (cpu >= cpus_online) + fprintf(stderr, "CPU %d exceeds " + "cpus online, ignoring.\n", + cpu); + else + cpus_to_pin[num_cpus_to_pin++] = cpu; + if (next_option) + option = ++next_option; + } while (next_option && num_cpus_to_pin < MAX_CPUS); + /* Double check that they didn't give us the same CPU + * more than once */ + for (cpu = 0; cpu < num_cpus_to_pin; cpu++) { + if (CPU_ISSET_S(cpus_to_pin[cpu], cpu_set_size, + cpu_set)) { + fprintf(stderr, "Any given CPU may " + "only be given once.\n"); + exit(1); + } else + CPU_SET_S(cpus_to_pin[cpu], + cpu_set_size, cpu_set); + } + break; + case 'p': + /* + * Although we can create a msg queue with a + * non-absolute path name, unlink will fail. So, + * if the name doesn't start with a /, add one + * when we save it. + */ + option = queue_path; + if (*option != '/') { + queue_path = malloc(strlen(option) + 2); + if (!queue_path) { + perror("malloc()"); + exit(1); + } + queue_path[0] = '/'; + queue_path[1] = 0; + strcat(queue_path, option); + free(option); + } + break; + } + } + + if (continuous_mode && num_cpus_to_pin == 0) { + fprintf(stderr, "Must pass at least one CPU to continuous " + "mode.\n"); + poptPrintUsage(popt_context, stderr, 0); + exit(1); + } else if (!continuous_mode) { + num_cpus_to_pin = 1; + cpus_to_pin[0] = cpus_online - 1; + } + + if (getuid() != 0) { + fprintf(stderr, "Not running as root, but almost all tests " + "require root in order to modify\nsystem settings. " + "Exiting.\n"); + exit(1); + } + + max_msgs = fopen(MAX_MSGS, "r+"); + max_msgsize = fopen(MAX_MSGSIZE, "r+"); + if (!max_msgs) + shutdown(2, "Failed to open msg_max", __LINE__); + if (!max_msgsize) + shutdown(2, "Failed to open msgsize_max", __LINE__); + + /* Load up the current system values for everything we can */ + getr(RLIMIT_MSGQUEUE, &saved_limits); + cur_limits = saved_limits; + saved_max_msgs = cur_max_msgs = get(max_msgs); + saved_max_msgsize = cur_max_msgsize = get(max_msgsize); + errno = 0; + cur_nice = getpriority(PRIO_PROCESS, 0); + if (errno) + shutdown(2, "getpriority()", __LINE__); + + /* Tell the user our initial state */ + printf("\nInitial system state:\n"); + printf("\tUsing queue path:\t\t\t%s\n", queue_path); + printf("\tRLIMIT_MSGQUEUE(soft):\t\t\t%d\n", saved_limits.rlim_cur); + printf("\tRLIMIT_MSGQUEUE(hard):\t\t\t%d\n", saved_limits.rlim_max); + printf("\tMaximum Message Size:\t\t\t%d\n", saved_max_msgsize); + printf("\tMaximum Queue Size:\t\t\t%d\n", saved_max_msgs); + printf("\tNice value:\t\t\t\t%d\n", cur_nice); + printf("\n"); + + increase_limits(); + + printf("Adjusted system state for testing:\n"); + if (cur_limits.rlim_cur == RLIM_INFINITY) { + printf("\tRLIMIT_MSGQUEUE(soft):\t\t\t(unlimited)\n"); + printf("\tRLIMIT_MSGQUEUE(hard):\t\t\t(unlimited)\n"); + } else { + printf("\tRLIMIT_MSGQUEUE(soft):\t\t\t%d\n", + cur_limits.rlim_cur); + printf("\tRLIMIT_MSGQUEUE(hard):\t\t\t%d\n", + cur_limits.rlim_max); + } + printf("\tMaximum Message Size:\t\t\t%d\n", cur_max_msgsize); + printf("\tMaximum Queue Size:\t\t\t%d\n", cur_max_msgs); + printf("\tNice value:\t\t\t\t%d\n", cur_nice); + printf("\tContinuous mode:\t\t\t(%s)\n", continuous_mode ? + (continuous_mode_fake ? "fake mode" : "enabled") : + "disabled"); + printf("\tCPUs to pin:\t\t\t\t%d", cpus_to_pin[0]); + for (cpu = 1; cpu < num_cpus_to_pin; cpu++) + printf(",%d", cpus_to_pin[cpu]); + printf("\n"); + + sa.sa_sigaction = sig_action_SIGUSR1; + sigemptyset(&sa.sa_mask); + sigaddset(&sa.sa_mask, SIGHUP); + sigaddset(&sa.sa_mask, SIGINT); + sigaddset(&sa.sa_mask, SIGQUIT); + sigaddset(&sa.sa_mask, SIGTERM); + sa.sa_flags = SA_SIGINFO; + if (sigaction(SIGUSR1, &sa, NULL) == -1) + shutdown(1, "sigaction(SIGUSR1)", __LINE__); + sa.sa_sigaction = sig_action; + if (sigaction(SIGHUP, &sa, NULL) == -1) + shutdown(1, "sigaction(SIGHUP)", __LINE__); + if (sigaction(SIGINT, &sa, NULL) == -1) + shutdown(1, "sigaction(SIGINT)", __LINE__); + if (sigaction(SIGQUIT, &sa, NULL) == -1) + shutdown(1, "sigaction(SIGQUIT)", __LINE__); + if (sigaction(SIGTERM, &sa, NULL) == -1) + shutdown(1, "sigaction(SIGTERM)", __LINE__); + + if (!continuous_mode_fake) { + attr.mq_flags = O_NONBLOCK; + attr.mq_maxmsg = cur_max_msgs; + attr.mq_msgsize = MSG_SIZE; + open_queue(&attr); + } + for (i = 0; i < num_cpus_to_pin; i++) { + pthread_attr_t thread_attr; + void *thread_func; + + if (continuous_mode_fake) + thread_func = &fake_cont_thread; + else if (continuous_mode) + thread_func = &cont_thread; + else + thread_func = &perf_test_thread; + + CPU_ZERO_S(cpu_set_size, cpu_set); + CPU_SET_S(cpus_to_pin[i], cpu_set_size, cpu_set); + pthread_attr_init(&thread_attr); + pthread_attr_setaffinity_np(&thread_attr, cpu_set_size, + cpu_set); + if (pthread_create(&cpu_threads[i], &thread_attr, thread_func, + NULL)) + shutdown(1, "pthread_create()", __LINE__); + pthread_attr_destroy(&thread_attr); + } + + if (!continuous_mode) { + pthread_join(cpu_threads[0], &retval); + shutdown((long)retval, "perf_test_thread()", __LINE__); + } else { + while (1) + sleep(1); + } + shutdown(0, "", 0); +} -- cgit v1.2.3-59-g8ed1b From d97b46a64674a267bc41c9e16132ee2a98c3347d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:44 -0700 Subject: syscalls, x86: add __NR_kcmp syscall While doing the checkpoint-restore in the user space one need to determine whether various kernel objects (like mm_struct-s of file_struct-s) are shared between tasks and restore this state. The 2nd step can be solved by using appropriate CLONE_ flags and the unshare syscall, while there's currently no ways for solving the 1st one. One of the ways for checking whether two tasks share e.g. mm_struct is to provide some mm_struct ID of a task to its proc file, but showing such info considered to be not that good for security reasons. Thus after some debates we end up in conclusion that using that named 'comparison' syscall might be the best candidate. So here is it -- __NR_kcmp. It takes up to 5 arguments - the pids of the two tasks (which characteristics should be compared), the comparison type and (in case of comparison of files) two file descriptors. Lookups for pids are done in the caller's PID namespace only. At moment only x86 is supported and tested. [akpm@linux-foundation.org: fix up selftests, warnings] [akpm@linux-foundation.org: include errno.h] [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Cyrill Gorcunov Acked-by: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Glauber Costa Cc: Andi Kleen Cc: Tejun Heo Cc: Matt Helsley Cc: Pekka Enberg Cc: Eric Dumazet Cc: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Valdis.Kletnieks@vt.edu Cc: Michal Marek Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 2 + include/linux/kcmp.h | 17 +++ include/linux/syscalls.h | 2 + kernel/Makefile | 3 + kernel/kcmp.c | 196 +++++++++++++++++++++++++++++++ kernel/sys_ni.c | 3 + tools/testing/selftests/Makefile | 2 +- tools/testing/selftests/kcmp/Makefile | 29 +++++ tools/testing/selftests/kcmp/kcmp_test.c | 94 +++++++++++++++ 10 files changed, 348 insertions(+), 1 deletion(-) create mode 100644 include/linux/kcmp.h create mode 100644 kernel/kcmp.c create mode 100644 tools/testing/selftests/kcmp/Makefile create mode 100644 tools/testing/selftests/kcmp/kcmp_test.c (limited to 'tools/testing') diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 29f9f0554f7d..7a35a6e71d44 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -355,3 +355,4 @@ 346 i386 setns sys_setns 347 i386 process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv 348 i386 process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +349 i386 kcmp sys_kcmp diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index dd29a9ea27c5..51171aeff0dc 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -318,6 +318,8 @@ 309 common getcpu sys_getcpu 310 64 process_vm_readv sys_process_vm_readv 311 64 process_vm_writev sys_process_vm_writev +312 64 kcmp sys_kcmp + # # x32-specific system call numbers start at 512 to avoid cache impact # for native 64-bit operation. diff --git a/include/linux/kcmp.h b/include/linux/kcmp.h new file mode 100644 index 000000000000..2dcd1b3aafc8 --- /dev/null +++ b/include/linux/kcmp.h @@ -0,0 +1,17 @@ +#ifndef _LINUX_KCMP_H +#define _LINUX_KCMP_H + +/* Comparison type */ +enum kcmp_type { + KCMP_FILE, + KCMP_VM, + KCMP_FILES, + KCMP_FS, + KCMP_SIGHAND, + KCMP_IO, + KCMP_SYSVSEM, + + KCMP_TYPES, +}; + +#endif /* _LINUX_KCMP_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3de3acb84a95..19439c75c5b2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -858,4 +858,6 @@ asmlinkage long sys_process_vm_writev(pid_t pid, unsigned long riovcnt, unsigned long flags); +asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type, + unsigned long idx1, unsigned long idx2); #endif diff --git a/kernel/Makefile b/kernel/Makefile index 6c07f30fa9b7..80be6ca0cc75 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -25,6 +25,9 @@ endif obj-y += sched/ obj-y += power/ +ifeq ($(CONFIG_CHECKPOINT_RESTORE),y) +obj-$(CONFIG_X86) += kcmp.o +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/kernel/kcmp.c b/kernel/kcmp.c new file mode 100644 index 000000000000..30b7b225306c --- /dev/null +++ b/kernel/kcmp.c @@ -0,0 +1,196 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * We don't expose the real in-memory order of objects for security reasons. + * But still the comparison results should be suitable for sorting. So we + * obfuscate kernel pointers values and compare the production instead. + * + * The obfuscation is done in two steps. First we xor the kernel pointer with + * a random value, which puts pointer into a new position in a reordered space. + * Secondly we multiply the xor production with a large odd random number to + * permute its bits even more (the odd multiplier guarantees that the product + * is unique ever after the high bits are truncated, since any odd number is + * relative prime to 2^n). + * + * Note also that the obfuscation itself is invisible to userspace and if needed + * it can be changed to an alternate scheme. + */ +static unsigned long cookies[KCMP_TYPES][2] __read_mostly; + +static long kptr_obfuscate(long v, int type) +{ + return (v ^ cookies[type][0]) * cookies[type][1]; +} + +/* + * 0 - equal, i.e. v1 = v2 + * 1 - less than, i.e. v1 < v2 + * 2 - greater than, i.e. v1 > v2 + * 3 - not equal but ordering unavailable (reserved for future) + */ +static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) +{ + long ret; + + ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + + return (ret < 0) | ((ret > 0) << 1); +} + +/* The caller must have pinned the task */ +static struct file * +get_file_raw_ptr(struct task_struct *task, unsigned int idx) +{ + struct file *file = NULL; + + task_lock(task); + rcu_read_lock(); + + if (task->files) + file = fcheck_files(task->files, idx); + + rcu_read_unlock(); + task_unlock(task); + + return file; +} + +static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +{ + if (likely(m2 != m1)) + mutex_unlock(m2); + mutex_unlock(m1); +} + +static int kcmp_lock(struct mutex *m1, struct mutex *m2) +{ + int err; + + if (m2 > m1) + swap(m1, m2); + + err = mutex_lock_killable(m1); + if (!err && likely(m1 != m2)) { + err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + if (err) + mutex_unlock(m1); + } + + return err; +} + +SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, + unsigned long, idx1, unsigned long, idx2) +{ + struct task_struct *task1, *task2; + int ret; + + rcu_read_lock(); + + /* + * Tasks are looked up in caller's PID namespace only. + */ + task1 = find_task_by_vpid(pid1); + task2 = find_task_by_vpid(pid2); + if (!task1 || !task2) + goto err_no_task; + + get_task_struct(task1); + get_task_struct(task2); + + rcu_read_unlock(); + + /* + * One should have enough rights to inspect task details. + */ + ret = kcmp_lock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); + if (ret) + goto err; + if (!ptrace_may_access(task1, PTRACE_MODE_READ) || + !ptrace_may_access(task2, PTRACE_MODE_READ)) { + ret = -EPERM; + goto err_unlock; + } + + switch (type) { + case KCMP_FILE: { + struct file *filp1, *filp2; + + filp1 = get_file_raw_ptr(task1, idx1); + filp2 = get_file_raw_ptr(task2, idx2); + + if (filp1 && filp2) + ret = kcmp_ptr(filp1, filp2, KCMP_FILE); + else + ret = -EBADF; + break; + } + case KCMP_VM: + ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM); + break; + case KCMP_FILES: + ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES); + break; + case KCMP_FS: + ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS); + break; + case KCMP_SIGHAND: + ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND); + break; + case KCMP_IO: + ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO); + break; + case KCMP_SYSVSEM: +#ifdef CONFIG_SYSVIPC + ret = kcmp_ptr(task1->sysvsem.undo_list, + task2->sysvsem.undo_list, + KCMP_SYSVSEM); +#else + ret = -EOPNOTSUPP; +#endif + break; + default: + ret = -EINVAL; + break; + } + +err_unlock: + kcmp_unlock(&task1->signal->cred_guard_mutex, + &task2->signal->cred_guard_mutex); +err: + put_task_struct(task1); + put_task_struct(task2); + + return ret; + +err_no_task: + rcu_read_unlock(); + return -ESRCH; +} + +static __init int kcmp_cookies_init(void) +{ + int i; + + get_random_bytes(cookies, sizeof(cookies)); + + for (i = 0; i < KCMP_TYPES; i++) + cookies[i][1] |= (~(~0UL >> 1) | 1); + + return 0; +} +arch_initcall(kcmp_cookies_init); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 47bfa16430d7..dbff751e4086 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -203,3 +203,6 @@ cond_syscall(sys_fanotify_mark); cond_syscall(sys_name_to_handle_at); cond_syscall(sys_open_by_handle_at); cond_syscall(compat_sys_open_by_handle_at); + +/* compare kernel pointers */ +cond_syscall(sys_kcmp); diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 14972017a43e..a4162e15c25f 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -1,4 +1,4 @@ -TARGETS = breakpoints mqueue vm +TARGETS = breakpoints kcmp mqueue vm all: for TARGET in $(TARGETS); do \ diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile new file mode 100644 index 000000000000..dc79b86ea65c --- /dev/null +++ b/tools/testing/selftests/kcmp/Makefile @@ -0,0 +1,29 @@ +uname_M := $(shell uname -m 2>/dev/null || echo not) +ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/) +ifeq ($(ARCH),i386) + ARCH := X86 + CFLAGS := -DCONFIG_X86_32 -D__i386__ +endif +ifeq ($(ARCH),x86_64) + ARCH := X86 + CFLAGS := -DCONFIG_X86_64 -D__x86_64__ +endif + +CFLAGS += -I../../../../arch/x86/include/generated/ +CFLAGS += -I../../../../include/ +CFLAGS += -I../../../../usr/include/ +CFLAGS += -I../../../../arch/x86/include/ + +all: +ifeq ($(ARCH),X86) + gcc $(CFLAGS) kcmp_test.c -o run_test +else + echo "Not an x86 target, can't build kcmp selftest" +endif + +run-tests: all + ./kcmp_test + +clean: + rm -fr ./run_test + rm -fr ./test-file diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c new file mode 100644 index 000000000000..358cc6bfa35d --- /dev/null +++ b/tools/testing/selftests/kcmp/kcmp_test.c @@ -0,0 +1,94 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +static long sys_kcmp(int pid1, int pid2, int type, int fd1, int fd2) +{ + return syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2); +} + +int main(int argc, char **argv) +{ + const char kpath[] = "kcmp-test-file"; + int pid1, pid2; + int fd1, fd2; + int status; + + fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644); + pid1 = getpid(); + + if (fd1 < 0) { + perror("Can't create file"); + exit(1); + } + + pid2 = fork(); + if (pid2 < 0) { + perror("fork failed"); + exit(1); + } + + if (!pid2) { + int pid2 = getpid(); + int ret; + + fd2 = open(kpath, O_RDWR, 0644); + if (fd2 < 0) { + perror("Can't open file"); + exit(1); + } + + /* An example of output and arguments */ + printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld " + "FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld " + "INV: %2ld\n", + pid1, pid2, + sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd2), + sys_kcmp(pid1, pid2, KCMP_FILES, 0, 0), + sys_kcmp(pid1, pid2, KCMP_VM, 0, 0), + sys_kcmp(pid1, pid2, KCMP_FS, 0, 0), + sys_kcmp(pid1, pid2, KCMP_SIGHAND, 0, 0), + sys_kcmp(pid1, pid2, KCMP_IO, 0, 0), + sys_kcmp(pid1, pid2, KCMP_SYSVSEM, 0, 0), + + /* This one should fail */ + sys_kcmp(pid1, pid2, KCMP_TYPES + 1, 0, 0)); + + /* This one should return same fd */ + ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1); + if (ret) { + printf("FAIL: 0 expected but %d returned\n", ret); + ret = -1; + } else + printf("PASS: 0 returned as expected\n"); + + /* Compare with self */ + ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0); + if (ret) { + printf("FAIL: 0 expected but %li returned\n", ret); + ret = -1; + } else + printf("PASS: 0 returned as expected\n"); + + exit(ret); + } + + waitpid(pid2, &status, P_ALL); + + return 0; +} -- cgit v1.2.3-59-g8ed1b