From 6b314d0e11924c803bf8cd944e87fd58cdb5088c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Dec 2009 18:58:05 +0100 Subject: sched: Remove sysctl.sched_features Since we've had a much saner debugfs interface to this, remove the sysctl one. Signed-off-by: Peter Zijlstra LKML-Reference: [ v2: build fix ] Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4dbf93a52ee9..e5cc53514caa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -314,14 +314,6 @@ static struct ctl_table kern_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_features", - .data = &sysctl_sched_features, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_migration_cost", -- cgit v1.3-8-gc7d7 From 1983a922a1bc843806b9a36cf3a370b242783140 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:47 +0100 Subject: sched: Make tunable scaling style configurable As scaling now takes place on all kind of cpu add/remove events a user that configures values via proc should be able to configure if his set values are still rescaled or kept whatever happens. As the comments state that log2 was just a second guess that worked the interface is not just designed for on/off, but to choose a scaling type. Currently this allows none, log and linear, but more important it allwos us to keep the interface even if someone has an even better idea how to scale the values. Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-3-git-send-email-ehrhardt@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 ++++++++++- kernel/sched.c | 15 ++++++++++++++- kernel/sched_debug.c | 10 ++++++++++ kernel/sched_fair.c | 13 +++++++++++++ kernel/sysctl.c | 14 ++++++++++++++ 5 files changed, 61 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b1ebd3280c6..ee9f200d12d3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1902,13 +1902,22 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_shares_ratelimit; extern unsigned int sysctl_sched_shares_thresh; extern unsigned int sysctl_sched_child_runs_first; + +enum sched_tunable_scaling { + SCHED_TUNABLESCALING_NONE, + SCHED_TUNABLESCALING_LOG, + SCHED_TUNABLESCALING_LINEAR, + SCHED_TUNABLESCALING_END, +}; +extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; + #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; -int sched_nr_latency_handler(struct ctl_table *table, int write, +int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif diff --git a/kernel/sched.c b/kernel/sched.c index b54ecf84b6be..116efed962c6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7033,7 +7033,20 @@ cpumask_var_t nohz_cpu_mask; static void update_sysctl(void) { unsigned int cpus = min(num_online_cpus(), 8U); - unsigned int factor = 1 + ilog2(cpus); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5fda66615fee..0fc5287fe80f 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu) print_rq(m, rq, cpu); } +static const char *sched_tunable_scaling_names[] = { + "none", + "logaritmic", + "linear" +}; + static int sched_debug_show(struct seq_file *m, void *v) { u64 now = ktime_to_ns(ktime_get()); @@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v) #undef PN #undef P + SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, + sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + for_each_online_cpu(cpu) print_cpu(m, cpu); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 71b3458245e5..455106d318a8 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -21,6 +21,7 @@ */ #include +#include /* * Targeted preemption latency for CPU-bound tasks: @@ -37,6 +38,18 @@ unsigned int sysctl_sched_latency = 5000000ULL; unsigned int normalized_sysctl_sched_latency = 5000000ULL; +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; + /* * Minimal preemption granularity for CPU-bound tasks: * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e5cc53514caa..d10406e5fdfe 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -251,6 +251,8 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; #endif static struct ctl_table kern_table[] = { @@ -304,6 +306,18 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, + }, + { .ctl_name = CTL_UNNUMBERED, .procname = "sched_shares_thresh", -- cgit v1.3-8-gc7d7 From acb4a848da821a095ae9e4d8b22ae2d9633ba5cd Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Mon, 30 Nov 2009 12:16:48 +0100 Subject: sched: Update normalized values on user updates via proc The normalized values are also recalculated in case the scaling factor changes. This patch updates the internally used scheduler tuning values that are normalized to one cpu in case a user sets new values via sysfs. Together with patch 2 of this series this allows to let user configured values scale (or not) to cpu add/remove events taking place later. Signed-off-by: Christian Ehrhardt Signed-off-by: Peter Zijlstra LKML-Reference: <1259579808-11357-4-git-send-email-ehrhardt@linux.vnet.ibm.com> [ v2: fix warning ] Signed-off-by: Ingo Molnar --- kernel/sched.c | 12 ++++++++++-- kernel/sched_fair.c | 11 ++++++++++- kernel/sysctl.c | 14 +++++++++----- 3 files changed, 29 insertions(+), 8 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sched.c b/kernel/sched.c index 116efed962c6..0a60e8e9b094 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1816,6 +1816,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_active(struct rq *this_rq); static void update_sysctl(void); +static int get_update_sysctl_factor(void); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -7030,9 +7031,9 @@ cpumask_var_t nohz_cpu_mask; * * This idea comes from the SD scheduler of Con Kolivas: */ -static void update_sysctl(void) +static int get_update_sysctl_factor(void) { - unsigned int cpus = min(num_online_cpus(), 8U); + unsigned int cpus = min(num_online_cpus(), 8); unsigned int factor; switch (sysctl_sched_tunable_scaling) { @@ -7048,6 +7049,13 @@ static void update_sysctl(void) break; } + return factor; +} + +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); + #define SET_SYSCTL(name) \ (sysctl_##name = (factor) * normalized_sysctl_##name) SET_SYSCTL(sched_min_granularity); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 455106d318a8..804a411838f1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -399,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ #ifdef CONFIG_SCHED_DEBUG -int sched_nr_latency_handler(struct ctl_table *table, int write, +int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); if (ret || !write) return ret; @@ -411,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, sysctl_sched_min_granularity); +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + WRT_SYSCTL(sched_wakeup_granularity); + WRT_SYSCTL(sched_shares_ratelimit); +#undef WRT_SYSCTL + return 0; } #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d10406e5fdfe..b9e5a45f1e28 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -253,6 +253,8 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +static int min_sched_shares_ratelimit = 100000; /* 100 usec */ +static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { @@ -271,7 +273,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, @@ -282,7 +284,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &sched_nr_latency_handler, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, @@ -293,7 +295,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_wakeup_granularity_ns, .extra2 = &max_wakeup_granularity_ns, @@ -304,7 +306,9 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_shares_ratelimit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &sched_proc_update_handler, + .extra1 = &min_sched_shares_ratelimit, + .extra2 = &max_sched_shares_ratelimit, }, { .ctl_name = CTL_UNNUMBERED, @@ -312,7 +316,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_sched_tunable_scaling, .maxlen = sizeof(enum sched_tunable_scaling), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, + .proc_handler = &sched_proc_update_handler, .strategy = &sysctl_intvec, .extra1 = &min_sched_tunable_scaling, .extra2 = &max_sched_tunable_scaling, -- cgit v1.3-8-gc7d7 From 06808b0827e1cd14eedc96bac2655d5b37ac246c Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 14 Dec 2009 17:58:21 -0800 Subject: hugetlb: derive huge pages nodes allowed from task mempolicy This patch derives a "nodes_allowed" node mask from the numa mempolicy of the task modifying the number of persistent huge pages to control the allocation, freeing and adjusting of surplus huge pages when the pool page count is modified via the new sysctl or sysfs attribute "nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows: * For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is produced. This will cause the hugetlb subsystem to use node_online_map as the "nodes_allowed". This preserves the behavior before this patch. * For "preferred" mempolicy, including explicit local allocation, a nodemask with the single preferred node will be produced. "local" policy will NOT track any internode migrations of the task adjusting nr_hugepages. * For "bind" and "interleave" policy, the mempolicy's nodemask will be used. * Other than to inform the construction of the nodes_allowed node mask, the actual mempolicy mode is ignored. That is, all modes behave like interleave over the resulting nodes_allowed mask with no "fallback". See the updated documentation [next patch] for more information about the implications of this patch. Examples: Starting with: Node 0 HugePages_Total: 0 Node 1 HugePages_Total: 0 Node 2 HugePages_Total: 0 Node 3 HugePages_Total: 0 Default behavior [with or without this patch] balances persistent hugepage allocation across nodes [with sufficient contiguous memory]: sysctl vm.nr_hugepages[_mempolicy]=32 yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 8 Node 3 HugePages_Total: 8 Of course, we only have nr_hugepages_mempolicy with the patch, but with default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages. Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind' because it allows multiple nodes to be specified and it's easy to type]--we can allocate huge pages on individual nodes or sets of nodes. So, starting from the condition above, with 8 huge pages per node, add 8 more to node 2 using: numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40 This yields: Node 0 HugePages_Total: 8 Node 1 HugePages_Total: 8 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The incremental 8 huge pages were restricted to node 2 by the specified mempolicy. Similarly, we can use mempolicy to free persistent huge pages from specified nodes: numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32 yields: Node 0 HugePages_Total: 4 Node 1 HugePages_Total: 4 Node 2 HugePages_Total: 16 Node 3 HugePages_Total: 8 The 8 huge pages freed were balanced over nodes 0 and 1. [rientjes@google.com: accomodate reworked NODEMASK_ALLOC] Signed-off-by: David Rientjes Signed-off-by: Lee Schermerhorn Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: KAMEZAWA Hiroyuki Cc: Randy Dunlap Cc: Nishanth Aravamudan Cc: Adam Litke Cc: Andy Whitcroft Cc: Eric Whitney Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 +++ include/linux/mempolicy.h | 3 ++ kernel/sysctl.c | 15 +++++++- mm/hugetlb.c | 97 ++++++++++++++++++++++++++++++++++++++++------- mm/mempolicy.c | 47 +++++++++++++++++++++++ 5 files changed, 153 insertions(+), 15 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 41a59afc70fa..78b4bc64c006 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -23,6 +23,12 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); + +#ifdef CONFIG_NUMA +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); +#endif + int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 085c903fe0f1..1cc966cd3e5f 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -201,6 +201,7 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p); extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); +extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern unsigned slab_node(struct mempolicy *policy); extern enum zone_type policy_zone; @@ -328,6 +329,8 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, return node_zonelist(0, gfp_flags); } +static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } + static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 554ac4894f0f..60fc93131095 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1051,7 +1051,7 @@ static struct ctl_table vm_table[] = { .extra2 = &one_hundred, }, #ifdef CONFIG_HUGETLB_PAGE - { + { .procname = "nr_hugepages", .data = NULL, .maxlen = sizeof(unsigned long), @@ -1059,7 +1059,18 @@ static struct ctl_table vm_table[] = { .proc_handler = hugetlb_sysctl_handler, .extra1 = (void *)&hugetlb_zero, .extra2 = (void *)&hugetlb_infinity, - }, + }, +#ifdef CONFIG_NUMA + { + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#endif { .procname = "hugetlb_shm_group", .data = &sysctl_hugetlb_shm_group, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 324d1abae876..1125d818ea06 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1330,29 +1330,71 @@ static struct hstate *kobj_to_hstate(struct kobject *kobj) return NULL; } -static ssize_t nr_hugepages_show(struct kobject *kobj, +static ssize_t nr_hugepages_show_common(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj); return sprintf(buf, "%lu\n", h->nr_huge_pages); } -static ssize_t nr_hugepages_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) { int err; - unsigned long input; + unsigned long count; struct hstate *h = kobj_to_hstate(kobj); + NODEMASK_ALLOC(nodemask_t, nodes_allowed); - err = strict_strtoul(buf, 10, &input); + err = strict_strtoul(buf, 10, &count); if (err) return 0; - h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map); + if (!(obey_mempolicy && init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_online_map; + } + h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); - return count; + if (nodes_allowed != &node_online_map) + NODEMASK_FREE(nodes_allowed); + + return len; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(false, kobj, attr, buf, len); } HSTATE_ATTR(nr_hugepages); +#ifdef CONFIG_NUMA + +/* + * hstate attribute for optionally mempolicy-based constraint on persistent + * huge page alloc/free. + */ +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(true, kobj, attr, buf, len); +} +HSTATE_ATTR(nr_hugepages_mempolicy); +#endif + + static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1408,6 +1450,9 @@ static struct attribute *hstate_attrs[] = { &free_hugepages_attr.attr, &resv_hugepages_attr.attr, &surplus_hugepages_attr.attr, +#ifdef CONFIG_NUMA + &nr_hugepages_mempolicy_attr.attr, +#endif NULL, }; @@ -1574,9 +1619,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) } #ifdef CONFIG_SYSCTL -int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; @@ -1588,13 +1633,39 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, buffer, length, ppos); - if (write) - h->max_huge_pages = set_max_huge_pages(h, tmp, - &node_online_map); + if (write) { + NODEMASK_ALLOC(nodemask_t, nodes_allowed); + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_states[N_HIGH_MEMORY]; + } + h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); + + if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + NODEMASK_FREE(nodes_allowed); + } return 0; } +int hugetlb_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + + return hugetlb_sysctl_handler_common(false, table, write, + buffer, length, ppos); +} + +#ifdef CONFIG_NUMA +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + return hugetlb_sysctl_handler_common(true, table, write, + buffer, length, ppos); +} +#endif /* CONFIG_NUMA */ + int hugetlb_treat_movable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0f89eabbaf3e..f11fdad06204 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1568,6 +1568,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, } return zl; } + +/* + * init_nodemask_of_mempolicy + * + * If the current task's mempolicy is "default" [NULL], return 'false' + * to indicate default policy. Otherwise, extract the policy nodemask + * for 'bind' or 'interleave' policy into the argument nodemask, or + * initialize the argument nodemask to contain the single node for + * 'preferred' or 'local' policy and return 'true' to indicate presence + * of non-default mempolicy. + * + * We don't bother with reference counting the mempolicy [mpol_get/put] + * because the current task is examining it's own mempolicy and a task's + * mempolicy is only ever changed by the task itself. + * + * N.B., it is the caller's responsibility to free a returned nodemask. + */ +bool init_nodemask_of_mempolicy(nodemask_t *mask) +{ + struct mempolicy *mempolicy; + int nid; + + if (!(mask && current->mempolicy)) + return false; + + mempolicy = current->mempolicy; + switch (mempolicy->mode) { + case MPOL_PREFERRED: + if (mempolicy->flags & MPOL_F_LOCAL) + nid = numa_node_id(); + else + nid = mempolicy->v.preferred_node; + init_nodemask_of_node(mask, nid); + break; + + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + *mask = mempolicy->v.nodes; + break; + + default: + BUG(); + } + + return true; +} #endif /* Allocate a page in interleaved policy. -- cgit v1.3-8-gc7d7 From 70da2340fbc68e91e701762f785479ab495a0869 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 14 Dec 2009 17:59:52 -0800 Subject: 'sysctl_max_map_count' should be non-negative Jan Engelhardt reported we have this problem: setting max_map_count to a value large enough results in programs dying at first try. This is on 2.6.31.6: 15:59 borg:/proc/sys/vm # echo $[1<<31-1] >max_map_count 15:59 borg:/proc/sys/vm # cat max_map_count 1073741824 15:59 borg:/proc/sys/vm # echo $[1<<31] >max_map_count 15:59 borg:/proc/sys/vm # cat max_map_count Killed This is because we have a chance to make 'max_map_count' negative. but it's meaningless. Make it only accept non-negative values. Reported-by: Jan Engelhardt Signed-off-by: WANG Cong Cc: Ingo Molnar Cc: Peter Zijlstra Cc: James Morris Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 60fc93131095..45e4bef0012a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1131,7 +1131,8 @@ static struct ctl_table vm_table[] = { .data = &sysctl_max_map_count, .maxlen = sizeof(sysctl_max_map_count), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec, + .extra1 = &zero, }, #else { -- cgit v1.3-8-gc7d7