From 34c06254ff82a815fdccdfae7517a06c9b768cee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Nov 2015 00:12:24 -0500 Subject: cgroup: fix cftype->file_offset handling 6f60eade2433 ("cgroup: generalize obtaining the handles of and notifying cgroup files") introduced cftype->file_offset so that the handles for per-css file instances can be recorded. These handles then can be used, for example, to generate file modified notifications. Unfortunately, it made the wrong assumption that files are created once for a given css and removed on its destruction. Due to the dependencies among subsystems, a css may be hidden from userland and then later shown again. This is implemented by removing and re-creating the affected files, so the associated kernfs_node for a given cgroup file may change over time. This incorrect assumption led to the corruption of css->files lists. Reimplement cftype->file_offset handling so that cgroup_file->kn is protected by a lock and updated as files are created and destroyed. This also makes keeping them on per-cgroup list unnecessary. Signed-off-by: Tejun Heo Reported-by: James Sedgwick Fixes: 6f60eade2433 ("cgroup: generalize obtaining the handles of and notifying cgroup files") Acked-by: Johannes Weiner Acked-by: Zefan Li --- include/linux/cgroup-defs.h | 4 ---- include/linux/cgroup.h | 14 +------------- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d44b26276d..869fd4a3d28e 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -90,7 +90,6 @@ enum { */ struct cgroup_file { /* do not access any fields from outside cgroup core */ - struct list_head node; /* anchored at css->files */ struct kernfs_node *kn; }; @@ -134,9 +133,6 @@ struct cgroup_subsys_state { */ u64 serial_nr; - /* all cgroup_files associated with this css */ - struct list_head files; - /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 22e3754f89c5..f64083030ad5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -88,6 +88,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); +void cgroup_file_notify(struct cgroup_file *cfile); char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); @@ -516,19 +517,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp) pr_cont_kernfs_path(cgrp->kn); } -/** - * cgroup_file_notify - generate a file modified event for a cgroup_file - * @cfile: target cgroup_file - * - * @cfile must have been obtained by setting cftype->file_offset. - */ -static inline void cgroup_file_notify(struct cgroup_file *cfile) -{ - /* might not have been created due to one of the CFTYPE selector flags */ - if (cfile->kn) - kernfs_notify(cfile->kn); -} - #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; -- cgit v1.2.3-59-g8ed1b From 614e4c4ebc75517295bccd29b20ddbc5b52af6fc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 12 Nov 2015 11:00:04 +0100 Subject: perf/core: Robustify the perf_cgroup_from_task() RCU checks This patch reinforces the lockdep checks performed by perf_cgroup_from_tsk() by passing the perf_event_context whenever possible. It is okay to not hold the RCU read lock when we know we hold the ctx->lock. This patch makes sure this property holds. In some functions, such as perf_cgroup_sched_in(), we do not pass the context because we are sure we are holding the RCU read lock. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: edumazet@google.com Link: http://lkml.kernel.org/r/1447322404-10920-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 2 +- include/linux/perf_event.h | 6 ++++-- kernel/events/core.c | 20 +++++++++++++------- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 377e8f8ed391..a316ca96f1b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -298,7 +298,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.target); + return perf_cgroup_from_task(event->hw.target, event->ctx); return event->cgrp; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d841d33bcdc9..f9828a48f16a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -697,9 +697,11 @@ struct perf_cgroup { * if there is no cgroup event for the current CPU context. */ static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) +perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) { - return container_of(task_css(task, perf_event_cgrp_id), + return container_of(task_css_check(task, perf_event_cgrp_id, + ctx ? lockdep_is_held(&ctx->lock) + : true), struct perf_cgroup, css); } #endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 60e71ca42c22..1ac857aff7b0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -435,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current); + cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ @@ -458,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, if (!task || !ctx->nr_cgroups) return; - cgrp = perf_cgroup_from_task(task); + cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; } @@ -521,8 +521,10 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * set cgrp before ctxsw in to allow * event_filter_match() to not have to pass * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu */ - cpuctx->cgrp = perf_cgroup_from_task(task); + cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); @@ -542,15 +544,17 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* * next is NULL when called from perf_event_enable_on_exec() * that will systematically cause a cgroup_switch() */ if (next) - cgrp2 = perf_cgroup_from_task(next); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -572,11 +576,13 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); + cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing -- cgit v1.2.3-59-g8ed1b From 90eec103b96e30401c0b846045bf8a1c7159b6da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2015 11:08:45 +0100 Subject: treewide: Remove old email address There were still a number of references to my old Red Hat email address in the kernel source. Remove these while keeping the Red Hat copyright notices intact. Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Signed-off-by: Ingo Molnar --- arch/blackfin/kernel/perf_event.c | 2 +- arch/sh/kernel/perf_event.c | 2 +- arch/sparc/kernel/perf_event.c | 2 +- arch/tile/kernel/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.h | 2 +- arch/x86/kernel/irq_work.c | 2 +- include/asm-generic/tlb.h | 2 +- include/linux/jump_label.h | 2 +- include/linux/lockdep.h | 2 +- include/linux/proportions.h | 2 +- include/linux/uprobes.h | 2 +- kernel/events/callchain.c | 2 +- kernel/events/core.c | 2 +- kernel/events/ring_buffer.c | 2 +- kernel/events/uprobes.c | 2 +- kernel/irq_work.c | 2 +- kernel/jump_label.c | 2 +- kernel/locking/lockdep.c | 2 +- kernel/locking/lockdep_proc.c | 2 +- kernel/sched/clock.c | 2 +- kernel/sched/fair.c | 2 +- kernel/trace/trace_event_perf.c | 2 +- lib/btree.c | 2 +- lib/proportions.c | 2 +- mm/page-writeback.c | 2 +- 26 files changed, 26 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/blackfin/kernel/perf_event.c b/arch/blackfin/kernel/perf_event.c index 1e9c8b0bf486..170d786807c4 100644 --- a/arch/blackfin/kernel/perf_event.c +++ b/arch/blackfin/kernel/perf_event.c @@ -14,7 +14,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * * ppc: diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 7cfd7f153966..4dca18347ee9 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -10,7 +10,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * * ppc: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index b0da5aedb336..3091267c5cc3 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -9,7 +9,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/arch/tile/kernel/perf_event.c b/arch/tile/kernel/perf_event.c index bb509cee3b59..8767060d70fb 100644 --- a/arch/tile/kernel/perf_event.c +++ b/arch/tile/kernel/perf_event.c @@ -21,7 +21,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian */ diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 4562cf070c27..2bf79d7c97df 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index ffa7a92025e1..ab18b8a91583 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -5,7 +5,7 @@ * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, * Copyright (C) 2009 Google, Inc., Stephane Eranian * diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c index dc5fa6a1e8d6..3512ba607361 100644 --- a/arch/x86/kernel/irq_work.c +++ b/arch/x86/kernel/irq_work.c @@ -1,7 +1,7 @@ /* * x86 specific code for irq_work * - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index db284bff29dc..9dbb739cafa0 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -5,7 +5,7 @@ * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * - * Copyright 2011 Red Hat, Inc., Peter Zijlstra + * Copyright 2011 Red Hat, Inc., Peter Zijlstra * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 8dde55974f18..0536524bb9eb 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -5,7 +5,7 @@ * Jump label support * * Copyright (C) 2009-2012 Jason Baron - * Copyright (C) 2011-2012 Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra * * DEPRECATED API: * diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 70400dc7660f..c57e424d914b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -2,7 +2,7 @@ * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.txt for more details. */ diff --git a/include/linux/proportions.h b/include/linux/proportions.h index 5440f64d2942..21221338ad18 100644 --- a/include/linux/proportions.h +++ b/include/linux/proportions.h @@ -1,7 +1,7 @@ /* * FLoating proportions * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * This file contains the public data structure and API definitions. */ diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 0bdc72f36905..4a29c75b146e 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -21,7 +21,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index d659487254d5..9c418002b8c1 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/core.c b/kernel/events/core.c index 1ac857aff7b0..5854fcf7f05a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b5d1ea79c595..adfdc0536117 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e5e9798aa0c..7dad84913abf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/irq_work.c b/kernel/irq_work.c index cbf9fb899d92..bcf107ce0854 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. diff --git a/kernel/jump_label.c b/kernel/jump_label.c index f7dd15d537f9..05254eeb4b4e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,7 +2,7 @@ * jump label support * * Copyright (C) 2009 Jason Baron - * Copyright (C) 2011 Peter Zijlstra + * Copyright (C) 2011 Peter Zijlstra * */ #include diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index deae3907ac1e..60ace56618f6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index d83d798bef95..dbb61a302548 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c0a205101c23..caf4041f5b0a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,7 +1,7 @@ /* * sched_clock for unstable cpu clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f04fda8f669c..90e26b11deaa 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -17,7 +17,7 @@ * Copyright (C) 2007, Thomas Gleixner * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index abfc903e741e..cc9f7a9319be 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,7 +1,7 @@ /* * trace event based perf event profiling/tracing * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker */ diff --git a/lib/btree.c b/lib/btree.c index 4264871ea1a0..f93a945274af 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -5,7 +5,7 @@ * * Copyright (c) 2007-2008 Joern Engel * Bits and pieces stolen from Peter Zijlstra's code, which is - * Copyright 2007, Red Hat Inc. Peter Zijlstra + * Copyright 2007, Red Hat Inc. Peter Zijlstra * GPLv2 * * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch diff --git a/lib/proportions.c b/lib/proportions.c index 6f724298f67a..efa54f259ea9 100644 --- a/lib/proportions.c +++ b/lib/proportions.c @@ -1,7 +1,7 @@ /* * Floating proportions * - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Description: * diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3e4d65445fa7..d15d88c8efa1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2,7 +2,7 @@ * mm/page-writeback.c * * Copyright (C) 2002, Linus Torvalds. - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Contains functions related to writing back dirty pages at the * address_space level. -- cgit v1.2.3-59-g8ed1b From ca369d51b3e1649be4a72addd6d6a168cfb3f537 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 13 Nov 2015 16:46:48 -0500 Subject: block/sd: Fix device-imposed transfer length limits Commit 4f258a46346c ("sd: Fix maximum I/O size for BLOCK_PC requests") had the unfortunate side-effect of removing an implicit clamp to BLK_DEF_MAX_SECTORS for REQ_TYPE_FS requests in the block layer code. This caused problems for some SMR drives. Debugging this issue revealed a few problems with the existing infrastructure since the block layer didn't know how to deal with device-imposed limits, only limits set by the I/O controller. - Introduce a new queue limit, max_dev_sectors, which is used by the ULD to signal the maximum sectors for a REQ_TYPE_FS request. - Ensure that max_dev_sectors is correctly stacked and taken into account when overriding max_sectors through sysfs. - Rework sd_read_block_limits() so it saves the max_xfer and opt_xfer values for later processing. - In sd_revalidate() set the queue's max_dev_sectors based on the MAXIMUM TRANSFER LENGTH value in the Block Limits VPD. If this value is not reported, fall back to a cap based on the CDB TRANSFER LENGTH field size. - In sd_revalidate(), use OPTIMAL TRANSFER LENGTH from the Block Limits VPD--if reported and sane--to signal the preferred device transfer size for FS requests. Otherwise use BLK_DEF_MAX_SECTORS. - blk_limits_max_hw_sectors() is no longer used and can be removed. Signed-off-by: Martin K. Petersen Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=93581 Reviewed-by: Christoph Hellwig Tested-by: sweeneygj@gmx.com Tested-by: Arzeets Tested-by: David Eisner Tested-by: Mario Kicherer Signed-off-by: Martin K. Petersen --- block/blk-settings.c | 36 ++++++++++++++++-------------------- block/blk-sysfs.c | 3 +++ drivers/scsi/sd.c | 46 ++++++++++++++++++++++++++++++---------------- drivers/scsi/sd.h | 1 + include/linux/blkdev.h | 2 +- 5 files changed, 51 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/block/blk-settings.c b/block/blk-settings.c index 7d8f129a1516..dd4973583978 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -91,7 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors = + BLK_SAFE_MAX_SECTORS; lim->chunk_sectors = 0; lim->max_write_same_sectors = 0; lim->max_discard_sectors = 0; @@ -127,6 +128,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_hw_sectors = UINT_MAX; lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; + lim->max_dev_sectors = UINT_MAX; lim->max_write_same_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -214,8 +216,8 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request - * @limits: the queue limits + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: @@ -224,13 +226,19 @@ EXPORT_SYMBOL(blk_queue_bounce_limit); * the device driver based upon the capabilities of the I/O * controller. * + * max_dev_sectors is a hard limit imposed by the storage device for + * READ/WRITE requests. It is set by the disk driver. + * * max_sectors is a soft limit imposed by the block layer for * filesystem type requests. This value can be overridden on a * per-device basis in /sys/block//queue/max_sectors_kb. * The soft limit can not exceed max_hw_sectors. **/ -void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { + struct queue_limits *limits = &q->limits; + unsigned int max_sectors; + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", @@ -238,22 +246,9 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ } limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); -} -EXPORT_SYMBOL(blk_limits_max_hw_sectors); - -/** - * blk_queue_max_hw_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_hw_sectors: max hardware sectors in the usual 512b unit - * - * Description: - * See description for blk_limits_max_hw_sectors(). - **/ -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) -{ - blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); + max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + limits->max_sectors = max_sectors; } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -527,6 +522,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_same_sectors = min(t->max_write_same_sectors, b->max_write_same_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3e44a9da2a13..55c637b9c42b 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -205,6 +205,9 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + q->limits.max_dev_sectors >> 1); + if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e868d39c39bb..7af47ed10d90 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2238,11 +2238,8 @@ got_data: } } - if (sdkp->capacity > 0xffffffff) { + if (sdkp->capacity > 0xffffffff) sdp->use_16_for_rw = 1; - sdkp->max_xfer_blocks = SD_MAX_XFER_BLOCKS; - } else - sdkp->max_xfer_blocks = SD_DEF_XFER_BLOCKS; /* Rescale capacity to 512-byte units */ if (sector_size == 4096) @@ -2559,7 +2556,6 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) { unsigned int sector_sz = sdkp->device->sector_size; const int vpd_len = 64; - u32 max_xfer_length; unsigned char *buffer = kmalloc(vpd_len, GFP_KERNEL); if (!buffer || @@ -2567,14 +2563,11 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) scsi_get_vpd_page(sdkp->device, 0xb0, buffer, vpd_len)) goto out; - max_xfer_length = get_unaligned_be32(&buffer[8]); - if (max_xfer_length) - sdkp->max_xfer_blocks = max_xfer_length; - blk_queue_io_min(sdkp->disk->queue, get_unaligned_be16(&buffer[6]) * sector_sz); - blk_queue_io_opt(sdkp->disk->queue, - get_unaligned_be32(&buffer[12]) * sector_sz); + + sdkp->max_xfer_blocks = get_unaligned_be32(&buffer[8]); + sdkp->opt_xfer_blocks = get_unaligned_be32(&buffer[12]); if (buffer[3] == 0x3c) { unsigned int lba_count, desc_count; @@ -2723,6 +2716,11 @@ static int sd_try_extended_inquiry(struct scsi_device *sdp) return 0; } +static inline u32 logical_to_sectors(struct scsi_device *sdev, u32 blocks) +{ + return blocks << (ilog2(sdev->sector_size) - 9); +} + /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. @@ -2732,8 +2730,9 @@ static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; + struct request_queue *q = sdkp->disk->queue; unsigned char *buffer; - unsigned int max_xfer; + unsigned int dev_max, rw_max; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); @@ -2781,11 +2780,26 @@ static int sd_revalidate_disk(struct gendisk *disk) */ sd_set_flush_flag(sdkp); - max_xfer = sdkp->max_xfer_blocks; - max_xfer <<= ilog2(sdp->sector_size) - 9; + /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ + dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; + + /* Some devices report a maximum block count for READ/WRITE requests. */ + dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); + q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); + + /* + * Use the device's preferred I/O size for reads and writes + * unless the reported value is unreasonably large (or garbage). + */ + if (sdkp->opt_xfer_blocks && sdkp->opt_xfer_blocks <= dev_max && + sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS) + rw_max = q->limits.io_opt = + logical_to_sectors(sdp, sdkp->opt_xfer_blocks); + else + rw_max = BLK_DEF_MAX_SECTORS; - sdkp->disk->queue->limits.max_sectors = - min_not_zero(queue_max_hw_sectors(sdkp->disk->queue), max_xfer); + /* Combine with controller limits */ + q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q)); set_capacity(disk, sdkp->capacity); sd_config_write_same(sdkp); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 63ba5ca7f9a1..5f2a84aff29f 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -67,6 +67,7 @@ struct scsi_disk { atomic_t openers; sector_t capacity; /* size in 512-byte sectors */ u32 max_xfer_blocks; + u32 opt_xfer_blocks; u32 max_ws_blocks; u32 max_unmap_blocks; u32 unmap_granularity; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 38a5ff772a37..9dacb745fa96 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -253,6 +253,7 @@ struct queue_limits { unsigned long virt_boundary_mask; unsigned int max_hw_sectors; + unsigned int max_dev_sectors; unsigned int chunk_sectors; unsigned int max_sectors; unsigned int max_segment_size; @@ -948,7 +949,6 @@ extern struct request_queue *blk_init_allocated_queue(struct request_queue *, extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); -extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); -- cgit v1.2.3-59-g8ed1b From 64031e3e8a5c042840c5123af695eec89f9e6a24 Mon Sep 17 00:00:00 2001 From: Hanjun Guo Date: Wed, 2 Dec 2015 15:44:22 +0800 Subject: ACPI / property: fix compile error for acpi_node_get_property_reference() when CONFIG_ACPI=n In commit 60ba032ed76e ("ACPI / property: Drop size_prop from acpi_dev_get_property_reference()"), the argument "const char *cells_name" was dropped, but forgot to update the stub function in no-ACPI case, it will lead to compile error when CONFIG_ACPI=n, easliy remove "const char *cells_name" to fix it. Fixes: 60ba032ed76e "ACPI / property: Drop size_prop from acpi_dev_get_property_reference()" Reported-by: Kejian Yan Signed-off-by: Hanjun Guo Acked-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- include/linux/acpi.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 865d948c60e6..9e6f4bb4692f 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -782,8 +782,8 @@ static inline int acpi_dev_get_property(struct acpi_device *adev, } static inline int acpi_node_get_property_reference(struct fwnode_handle *fwnode, - const char *name, const char *cells_name, - size_t index, struct acpi_reference_args *args) + const char *name, size_t index, + struct acpi_reference_args *args) { return -ENXIO; } -- cgit v1.2.3-59-g8ed1b From 69030dd1c3671625c6f766af0b64a4bb4409ac3b Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Tue, 1 Dec 2015 16:52:14 -0800 Subject: cpufreq: use last policy after online for drivers with ->setpolicy For cpufreq drivers which use setpolicy interface, after offline->online the policy is set to default. This can be reproduced by setting the default policy of intel_pstate or longrun to ondemand and then change to "performance". After offline and online, the setpolicy will be called with the policy=ondemand. For drivers using governors this condition is handled by storing last_governor, during offline and restoring during online. The same should be done for drivers using setpolicy interface. Storing last_policy during offline and restoring during online. Signed-off-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 14 ++++++++++---- include/linux/cpufreq.h | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index a83c995a62df..8412ce5f93a7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -976,10 +976,14 @@ static int cpufreq_init_policy(struct cpufreq_policy *policy) new_policy.governor = gov; - /* Use the default policy if its valid. */ - if (cpufreq_driver->setpolicy) - cpufreq_parse_governor(gov->name, &new_policy.policy, NULL); - + /* Use the default policy if there is no last_policy. */ + if (cpufreq_driver->setpolicy) { + if (policy->last_policy) + new_policy.policy = policy->last_policy; + else + cpufreq_parse_governor(gov->name, &new_policy.policy, + NULL); + } /* set default policy */ return cpufreq_set_policy(policy, &new_policy); } @@ -1330,6 +1334,8 @@ static void cpufreq_offline_prepare(unsigned int cpu) if (has_target()) strncpy(policy->last_governor, policy->governor->name, CPUFREQ_NAME_LEN); + else + policy->last_policy = policy->policy; } else if (cpu == policy->cpu) { /* Nominate new CPU */ policy->cpu = cpumask_any(policy->cpus); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index ef4c5b1a860f..177c7680c1a8 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -77,6 +77,7 @@ struct cpufreq_policy { unsigned int suspend_freq; /* freq to set during suspend */ unsigned int policy; /* see above */ + unsigned int last_policy; /* policy before unplug */ struct cpufreq_governor *governor; /* see below */ void *governor_data; bool governor_enabled; /* governor start/stop flag */ -- cgit v1.2.3-59-g8ed1b From 1f7dd3e5a6e4f093017fff12232572ee1aa4639b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Dec 2015 10:18:21 -0500 Subject: cgroup: fix handling of multi-destination migration from subtree_control enabling Consider the following v2 hierarchy. P0 (+memory) --- P1 (-memory) --- A \- B P0 has memory enabled in its subtree_control while P1 doesn't. If both A and B contain processes, they would belong to the memory css of P1. Now if memory is enabled on P1's subtree_control, memory csses should be created on both A and B and A's processes should be moved to the former and B's processes the latter. IOW, enabling controllers can cause atomic migrations into different csses. The core cgroup migration logic has been updated accordingly but the controller migration methods haven't and still assume that all tasks migrate to a single target css; furthermore, the methods were fed the css in which subtree_control was updated which is the parent of the target csses. pids controller depends on the migration methods to move charges and this made the controller attribute charges to the wrong csses often triggering the following warning by driving a counter negative. WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40() Modules linked in: CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29 ... ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000 ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00 ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8 Call Trace: [] dump_stack+0x4e/0x82 [] warn_slowpath_common+0x82/0xc0 [] warn_slowpath_null+0x1a/0x20 [] pids_cancel.constprop.6+0x31/0x40 [] pids_can_attach+0x6d/0xf0 [] cgroup_taskset_migrate+0x6c/0x330 [] cgroup_migrate+0xf5/0x190 [] cgroup_attach_task+0x176/0x200 [] __cgroup_procs_write+0x2ad/0x460 [] cgroup_procs_write+0x14/0x20 [] cgroup_file_write+0x35/0x1c0 [] kernfs_fop_write+0x141/0x190 [] __vfs_write+0x28/0xe0 [] vfs_write+0xac/0x1a0 [] SyS_write+0x49/0xb0 [] entry_SYSCALL_64_fastpath+0x12/0x76 This patch fixes the bug by removing @css parameter from the three migration methods, ->can_attach, ->cancel_attach() and ->attach() and updating cgroup_taskset iteration helpers also return the destination css in addition to the task being migrated. All controllers are updated accordingly. * Controllers which don't care whether there are one or multiple target csses can be converted trivially. cpu, io, freezer, perf, netclassid and netprio fall in this category. * cpuset's current implementation assumes that there's single source and destination and thus doesn't support v2 hierarchy already. The only change made by this patchset is how that single destination css is obtained. * memory migration path already doesn't do anything on v2. How the single destination css is obtained is updated and the prep stage of mem_cgroup_can_attach() is reordered to accomodate the change. * pids is the only controller which was affected by this bug. It now correctly handles multi-destination migrations and no longer causes counter underflow from incorrect accounting. Signed-off-by: Tejun Heo Reported-and-tested-by: Daniel Wagner Cc: Aleksa Sarai --- block/blk-cgroup.c | 6 +++--- include/linux/cgroup-defs.h | 9 +++------ include/linux/cgroup.h | 33 +++++++++++++++++++++----------- kernel/cgroup.c | 43 +++++++++++++++++++++++++++++++++--------- kernel/cgroup_freezer.c | 6 +++--- kernel/cgroup_pids.c | 16 ++++++++-------- kernel/cpuset.c | 33 ++++++++++++++++++++------------ kernel/events/core.c | 6 +++--- kernel/sched/core.c | 12 ++++++------ mm/memcontrol.c | 45 ++++++++++++++++++++++---------------------- net/core/netclassid_cgroup.c | 11 ++++++----- net/core/netprio_cgroup.c | 9 +++++---- 12 files changed, 137 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5bcdfc10c23a..5a37188b559f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1127,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 869fd4a3d28e..06b77f9dd3f2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -422,12 +422,9 @@ struct cgroup_subsys { void (*css_reset)(struct cgroup_subsys_state *css); void (*css_e_css_changed)(struct cgroup_subsys_state *css); - int (*can_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset); + int (*can_attach)(struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_taskset *tset); + void (*attach)(struct cgroup_taskset *tset); int (*can_fork)(struct task_struct *task, void **priv_p); void (*cancel_fork)(struct task_struct *task, void *priv); void (*fork)(struct task_struct *task, void *priv); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f64083030ad5..cb91b44f5f78 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -120,8 +120,10 @@ struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp); +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it); @@ -236,30 +238,39 @@ void css_task_iter_end(struct css_task_iter *it); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor + * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple - * processes. When there are multiple tasks in @tset, if a task of a - * process is in @tset, all tasks of the process are in @tset. Also, all - * are guaranteed to share the same source and destination csses. + * processes. + * + * On the v2 hierarchy, there may be tasks from multiple processes and they + * may not share the source or destination csses. + * + * On traditional hierarchies, when there are multiple tasks in @tset, if a + * task of a process is in @tset, all tasks of the process are in @tset. + * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ -#define cgroup_taskset_for_each(task, tset) \ - for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) +#define cgroup_taskset_for_each(task, dst_css, tset) \ + for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ + (task); \ + (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor + * @dst_css: the destination css * @tset: takset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ -#define cgroup_taskset_for_each_leader(leader, tset) \ - for ((leader) = cgroup_taskset_first((tset)); (leader); \ - (leader) = cgroup_taskset_next((tset))) \ +#define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ + for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ + (leader); \ + (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cea63fe4095..470f6536b9e8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2237,6 +2237,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the subsys currently being processed */ + int ssid; + /* * Fields for cgroup_taskset_*() iteration. * @@ -2299,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task, /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; - return cgroup_taskset_next(tset); + return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; @@ -2332,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + return task; } @@ -2367,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, /* check that we can legitimately attach to the cgroup */ for_each_e_css(css, i, dst_cgrp) { if (css->ss->can_attach) { - ret = css->ss->can_attach(css, tset); + tset->ssid = i; + ret = css->ss->can_attach(tset); if (ret) { failed_css = css; goto out_cancel_attach; @@ -2400,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) - if (css->ss->attach) - css->ss->attach(css, tset); + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->attach) { + tset->ssid = i; + css->ss->attach(tset); + } + } ret = 0; goto out_release_tset; @@ -2411,8 +2434,10 @@ out_cancel_attach: for_each_e_css(css, i, dst_cgrp) { if (css == failed_css) break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, tset); + if (css->ss->cancel_attach) { + tset->ssid = i; + css->ss->cancel_attach(tset); + } } out_release_tset: spin_lock_bh(&css_set_lock); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ff02a8e51bb3..2d3df82c54f2 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -155,10 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); @@ -172,7 +172,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, new_css, tset) { struct freezer *freezer = css_freezer(new_css); if (!(freezer->state & CGROUP_FREEZING)) { diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index de3359a48dbb..8e27fc5dbb20 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -162,13 +162,13 @@ revert: return -EAGAIN; } -static int pids_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int pids_can_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css, return 0; } -static void pids_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void pids_cancel_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10ae73611d80..02a8ea5c9963 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1429,15 +1429,16 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_taskset *tset) { - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1447,7 +1448,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->cpus_allowed); if (ret) goto out_unlock; @@ -1467,9 +1468,14 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_cancel_attach(struct cgroup_taskset *tset) { + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1482,16 +1488,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1502,7 +1511,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1518,7 +1527,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 36babfd20648..026305dfe523 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9456,12 +9456,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9319e..a9db4819e586 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8217,12 +8217,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8235,12 +8235,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9acfb165eb52..c92a65b2b4ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4779,23 +4779,18 @@ static void mem_cgroup_clear_mc(void) spin_unlock(&mc.lock); } -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; struct mem_cgroup *from; struct task_struct *leader, *p; struct mm_struct *mm; unsigned long move_flags; int ret = 0; - /* - * We are now commited to this value whatever it is. Changes in this - * tunable will only affect upcoming migrations, not the current one. - * So we need to save it, and keep it going. - */ - move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (!move_flags) + /* charge immigration isn't supported on the default hierarchy */ + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; /* @@ -4805,13 +4800,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, * multiple. */ p = NULL; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { WARN_ON_ONCE(p); p = leader; + memcg = mem_cgroup_from_css(css); } if (!p) return 0; + /* + * We are now commited to this value whatever it is. Changes in this + * tunable will only affect upcoming migrations, not the current one. + * So we need to save it, and keep it going. + */ + move_flags = READ_ONCE(memcg->move_charge_at_immigrate); + if (!move_flags) + return 0; + from = mem_cgroup_from_task(p); VM_BUG_ON(from == memcg); @@ -4842,8 +4847,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { if (mc.to) mem_cgroup_clear_mc(); @@ -4985,10 +4989,10 @@ retry: atomic_dec(&mc.from->moving_account); } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { - struct task_struct *p = cgroup_taskset_first(tset); + struct cgroup_subsys_state *css; + struct task_struct *p = cgroup_taskset_first(tset, &css); struct mm_struct *mm = get_task_mm(p); if (mm) { @@ -5000,17 +5004,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup_taskset *tset) { } #endif diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index 6441f47b1a8f..81cb3c72efe8 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -67,14 +67,15 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_taskset *tset) { - struct cgroup_cls_state *cs = css_cls_state(css); - void *v = (void *)(unsigned long)cs->classid; struct task_struct *p; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + struct cgroup_cls_state *cs = css_cls_state(css); + void *v = (void *)(unsigned long)cs->classid; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_classid, v); task_unlock(p); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index cbd0a199bf52..40fd09fe06ae 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -218,13 +218,14 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_taskset *tset) { struct task_struct *p; - void *v = (void *)(unsigned long)css->cgroup->id; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(p, css, tset) { + void *v = (void *)(unsigned long)css->cgroup->id; - cgroup_taskset_for_each(p, tset) { task_lock(p); iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); -- cgit v1.2.3-59-g8ed1b From ae5515d66362b9d96cdcfce504567f0b8b7bd83e Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 4 Dec 2015 08:38:42 -0700 Subject: Revert: "vfio: Include No-IOMMU mode" Revert commit 033291eccbdb ("vfio: Include No-IOMMU mode") due to lack of a user. This was originally intended to fill a need for the DPDK driver, but uptake has been slow so rather than support an unproven kernel interface revert it and revisit when userspace catches up. Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 15 ---- drivers/vfio/pci/vfio_pci.c | 8 +- drivers/vfio/vfio.c | 186 ++------------------------------------------ include/linux/vfio.h | 3 - include/uapi/linux/vfio.h | 7 -- 5 files changed, 10 insertions(+), 209 deletions(-) (limited to 'include/linux') diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index da6e2ce77495..850d86ca685b 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -31,21 +31,6 @@ menuconfig VFIO If you don't know what to do here, say N. -menuconfig VFIO_NOIOMMU - bool "VFIO No-IOMMU support" - depends on VFIO - help - VFIO is built on the ability to isolate devices using the IOMMU. - Only with an IOMMU can userspace access to DMA capable devices be - considered secure. VFIO No-IOMMU mode enables IOMMU groups for - devices without IOMMU backing for the purpose of re-using the VFIO - infrastructure in a non-secure mode. Use of this mode will result - in an unsupportable kernel and will therefore taint the kernel. - Device assignment to virtual machines is also not possible with - this mode since there is no IOMMU to provide DMA translation. - - If you don't know what to do here, say N. - source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 2760a7ba3f30..56bf6dbb93db 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -940,13 +940,13 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; - group = vfio_iommu_group_get(&pdev->dev); + group = iommu_group_get(&pdev->dev); if (!group) return -EINVAL; vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); if (!vdev) { - vfio_iommu_group_put(group, &pdev->dev); + iommu_group_put(group); return -ENOMEM; } @@ -957,7 +957,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); if (ret) { - vfio_iommu_group_put(group, &pdev->dev); + iommu_group_put(group); kfree(vdev); return ret; } @@ -993,7 +993,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) if (!vdev) return; - vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); + iommu_group_put(pdev->dev.iommu_group); kfree(vdev); if (vfio_pci_is_vga(pdev)) { diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 9da0703e09d0..6070b793cbcb 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -62,7 +62,6 @@ struct vfio_container { struct rw_semaphore group_lock; struct vfio_iommu_driver *iommu_driver; void *iommu_data; - bool noiommu; }; struct vfio_unbound_dev { @@ -85,7 +84,6 @@ struct vfio_group { struct list_head unbound_list; struct mutex unbound_lock; atomic_t opened; - bool noiommu; }; struct vfio_device { @@ -97,147 +95,6 @@ struct vfio_device { void *device_data; }; -#ifdef CONFIG_VFIO_NOIOMMU -static bool noiommu __read_mostly; -module_param_named(enable_unsafe_noiommu_support, - noiommu, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); -#endif - -/* - * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe - * and remove functions, any use cases other than acquiring the first - * reference for the purpose of calling vfio_add_group_dev() or removing - * that symmetric reference after vfio_del_group_dev() should use the raw - * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() - * removes the device from the dummy group and cannot be nested. - */ -struct iommu_group *vfio_iommu_group_get(struct device *dev) -{ - struct iommu_group *group; - int __maybe_unused ret; - - group = iommu_group_get(dev); - -#ifdef CONFIG_VFIO_NOIOMMU - /* - * With noiommu enabled, an IOMMU group will be created for a device - * that doesn't already have one and doesn't have an iommu_ops on their - * bus. We use iommu_present() again in the main code to detect these - * fake groups. - */ - if (group || !noiommu || iommu_present(dev->bus)) - return group; - - group = iommu_group_alloc(); - if (IS_ERR(group)) - return NULL; - - iommu_group_set_name(group, "vfio-noiommu"); - ret = iommu_group_add_device(group, dev); - iommu_group_put(group); - if (ret) - return NULL; - - /* - * Where to taint? At this point we've added an IOMMU group for a - * device that is not backed by iommu_ops, therefore any iommu_ - * callback using iommu_ops can legitimately Oops. So, while we may - * be about to give a DMA capable device to a user without IOMMU - * protection, which is clearly taint-worthy, let's go ahead and do - * it here. - */ - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); -#endif - - return group; -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_get); - -void vfio_iommu_group_put(struct iommu_group *group, struct device *dev) -{ -#ifdef CONFIG_VFIO_NOIOMMU - if (!iommu_present(dev->bus)) - iommu_group_remove_device(dev); -#endif - - iommu_group_put(group); -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_put); - -#ifdef CONFIG_VFIO_NOIOMMU -static void *vfio_noiommu_open(unsigned long arg) -{ - if (arg != VFIO_NOIOMMU_IOMMU) - return ERR_PTR(-EINVAL); - if (!capable(CAP_SYS_RAWIO)) - return ERR_PTR(-EPERM); - - return NULL; -} - -static void vfio_noiommu_release(void *iommu_data) -{ -} - -static long vfio_noiommu_ioctl(void *iommu_data, - unsigned int cmd, unsigned long arg) -{ - if (cmd == VFIO_CHECK_EXTENSION) - return arg == VFIO_NOIOMMU_IOMMU ? 1 : 0; - - return -ENOTTY; -} - -static int vfio_iommu_present(struct device *dev, void *unused) -{ - return iommu_present(dev->bus) ? 1 : 0; -} - -static int vfio_noiommu_attach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ - return iommu_group_for_each_dev(iommu_group, NULL, - vfio_iommu_present) ? -EINVAL : 0; -} - -static void vfio_noiommu_detach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ -} - -static struct vfio_iommu_driver_ops vfio_noiommu_ops = { - .name = "vfio-noiommu", - .owner = THIS_MODULE, - .open = vfio_noiommu_open, - .release = vfio_noiommu_release, - .ioctl = vfio_noiommu_ioctl, - .attach_group = vfio_noiommu_attach_group, - .detach_group = vfio_noiommu_detach_group, -}; - -static struct vfio_iommu_driver vfio_noiommu_driver = { - .ops = &vfio_noiommu_ops, -}; - -/* - * Wrap IOMMU drivers, the noiommu driver is the one and only driver for - * noiommu groups (and thus containers) and not available for normal groups. - */ -#define vfio_for_each_iommu_driver(con, pos) \ - for (pos = con->noiommu ? &vfio_noiommu_driver : \ - list_first_entry(&vfio.iommu_drivers_list, \ - struct vfio_iommu_driver, vfio_next); \ - (con->noiommu ? pos != NULL : \ - &pos->vfio_next != &vfio.iommu_drivers_list); \ - pos = con->noiommu ? NULL : list_next_entry(pos, vfio_next)) -#else -#define vfio_for_each_iommu_driver(con, pos) \ - list_for_each_entry(pos, &vfio.iommu_drivers_list, vfio_next) -#endif - - /** * IOMMU driver registration */ @@ -342,8 +199,7 @@ static void vfio_group_unlock_and_free(struct vfio_group *group) /** * Group objects - create, release, get, put, search */ -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - bool noiommu) +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) { struct vfio_group *group, *tmp; struct device *dev; @@ -361,7 +217,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, atomic_set(&group->container_users, 0); atomic_set(&group->opened, 0); group->iommu_group = iommu_group; - group->noiommu = noiommu; group->nb.notifier_call = vfio_iommu_group_notifier; @@ -397,8 +252,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.group_devt), minor), - group, "%s%d", noiommu ? "noiommu-" : "", - iommu_group_id(iommu_group)); + group, "%d", iommu_group_id(iommu_group)); if (IS_ERR(dev)) { vfio_free_group_minor(minor); vfio_group_unlock_and_free(group); @@ -786,8 +640,7 @@ int vfio_add_group_dev(struct device *dev, group = vfio_group_get_from_iommu(iommu_group); if (!group) { - group = vfio_create_group(iommu_group, - !iommu_present(dev->bus)); + group = vfio_create_group(iommu_group); if (IS_ERR(group)) { iommu_group_put(iommu_group); return PTR_ERR(group); @@ -999,7 +852,8 @@ static long vfio_ioctl_check_extension(struct vfio_container *container, */ if (!driver) { mutex_lock(&vfio.iommu_drivers_lock); - vfio_for_each_iommu_driver(container, driver) { + list_for_each_entry(driver, &vfio.iommu_drivers_list, + vfio_next) { if (!try_module_get(driver->ops->owner)) continue; @@ -1068,7 +922,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, } mutex_lock(&vfio.iommu_drivers_lock); - vfio_for_each_iommu_driver(container, driver) { + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { void *data; if (!try_module_get(driver->ops->owner)) @@ -1333,9 +1187,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) if (atomic_read(&group->container_users)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) - return -EPERM; - f = fdget(container_fd); if (!f.file) return -EBADF; @@ -1351,13 +1202,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) down_write(&container->group_lock); - /* Real groups and fake groups cannot mix */ - if (!list_empty(&container->group_list) && - container->noiommu != group->noiommu) { - ret = -EPERM; - goto unlock_out; - } - driver = container->iommu_driver; if (driver) { ret = driver->ops->attach_group(container->iommu_data, @@ -1367,7 +1211,6 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) } group->container = container; - container->noiommu = group->noiommu; list_add(&group->container_next, &container->group_list); /* Get a reference on the container and mark a user within the group */ @@ -1398,9 +1241,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) !group->container->iommu_driver || !vfio_group_viable(group)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) - return -EPERM; - device = vfio_device_get_from_name(group, buf); if (!device) return -ENODEV; @@ -1443,10 +1283,6 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) fd_install(ret, filep); - if (group->noiommu) - dev_warn(device->dev, "vfio-noiommu device opened by user " - "(%s:%d)\n", current->comm, task_pid_nr(current)); - return ret; } @@ -1535,11 +1371,6 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) if (!group) return -ENODEV; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) { - vfio_group_put(group); - return -EPERM; - } - /* Do we need multiple instances of the group open? Seems not. */ opened = atomic_cmpxchg(&group->opened, 0, 1); if (opened) { @@ -1702,11 +1533,6 @@ struct vfio_group *vfio_group_get_external_user(struct file *filep) if (!atomic_inc_not_zero(&group->container_users)) return ERR_PTR(-EINVAL); - if (group->noiommu) { - atomic_dec(&group->container_users); - return ERR_PTR(-EPERM); - } - if (!group->container->iommu_driver || !vfio_group_viable(group)) { atomic_dec(&group->container_users); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 610a86a892b8..ddb440975382 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -44,9 +44,6 @@ struct vfio_device_ops { void (*request)(void *device_data, unsigned int count); }; -extern struct iommu_group *vfio_iommu_group_get(struct device *dev); -extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); - extern int vfio_add_group_dev(struct device *dev, const struct vfio_device_ops *ops, void *device_data); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 751b69f858c8..9fd7b5d8df2f 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -38,13 +38,6 @@ #define VFIO_SPAPR_TCE_v2_IOMMU 7 -/* - * The No-IOMMU IOMMU offers no translation or isolation for devices and - * supports no ioctls outside of VFIO_CHECK_EXTENSION. Use of VFIO's No-IOMMU - * code will taint the host kernel and should be used with extreme caution. - */ -#define VFIO_NOIOMMU_IOMMU 8 - /* * The IOCTL interface is designed for extensibility by embedding the * structure length (argsz) and flags into structures passed between -- cgit v1.2.3-59-g8ed1b From 3cf92222a39cc7842c373dd90a0c204fa7d7cced Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 3 Dec 2015 20:41:29 +0800 Subject: rhashtable: Prevent spurious EBUSY errors on insertion Thomas and Phil observed that under stress rhashtable insertion sometimes failed with EBUSY, even though this error should only ever been seen when we're under attack and our hash chain length has grown to an unacceptable level, even after a rehash. It turns out that the logic for detecting whether there is an existing rehash is faulty. In particular, when two threads both try to grow the same table at the same time, one of them may see the newly grown table and thus erroneously conclude that it had been rehashed. This is what leads to the EBUSY error. This patch fixes this by remembering the current last table we used during insertion so that rhashtable_insert_rehash can detect when another thread has also done a resize/rehash. When this is detected we will give up our resize/rehash and simply retry the insertion with the new table. Reported-by: Thomas Graf Reported-by: Phil Sutter Signed-off-by: Herbert Xu Tested-by: Phil Sutter Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 18 +++++++++++------- lib/rhashtable.c | 45 ++++++++++++++++++++++++++++++--------------- 2 files changed, 41 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 843ceca9a21e..e50b31d18462 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -339,10 +340,11 @@ static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, int rhashtable_init(struct rhashtable *ht, const struct rhashtable_params *params); -int rhashtable_insert_slow(struct rhashtable *ht, const void *key, - struct rhash_head *obj, - struct bucket_table *old_tbl); -int rhashtable_insert_rehash(struct rhashtable *ht); +struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, + const void *key, + struct rhash_head *obj, + struct bucket_table *old_tbl); +int rhashtable_insert_rehash(struct rhashtable *ht, struct bucket_table *tbl); int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); @@ -598,9 +600,11 @@ restart: new_tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(new_tbl)) { - err = rhashtable_insert_slow(ht, key, obj, new_tbl); - if (err == -EAGAIN) + tbl = rhashtable_insert_slow(ht, key, obj, new_tbl); + if (!IS_ERR_OR_NULL(tbl)) goto slow_path; + + err = PTR_ERR(tbl); goto out; } @@ -611,7 +615,7 @@ restart: if (unlikely(rht_grow_above_100(ht, tbl))) { slow_path: spin_unlock_bh(lock); - err = rhashtable_insert_rehash(ht); + err = rhashtable_insert_rehash(ht, tbl); rcu_read_unlock(); if (err) return err; diff --git a/lib/rhashtable.c b/lib/rhashtable.c index a54ff8949f91..2ff7ed91663a 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -389,33 +389,31 @@ static bool rhashtable_check_elasticity(struct rhashtable *ht, return false; } -int rhashtable_insert_rehash(struct rhashtable *ht) +int rhashtable_insert_rehash(struct rhashtable *ht, + struct bucket_table *tbl) { struct bucket_table *old_tbl; struct bucket_table *new_tbl; - struct bucket_table *tbl; unsigned int size; int err; old_tbl = rht_dereference_rcu(ht->tbl, ht); - tbl = rhashtable_last_table(ht, old_tbl); size = tbl->size; + err = -EBUSY; + if (rht_grow_above_75(ht, tbl)) size *= 2; /* Do not schedule more than one rehash */ else if (old_tbl != tbl) - return -EBUSY; + goto fail; + + err = -ENOMEM; new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC); - if (new_tbl == NULL) { - /* Schedule async resize/rehash to try allocation - * non-atomic context. - */ - schedule_work(&ht->run_work); - return -ENOMEM; - } + if (new_tbl == NULL) + goto fail; err = rhashtable_rehash_attach(ht, tbl, new_tbl); if (err) { @@ -426,12 +424,24 @@ int rhashtable_insert_rehash(struct rhashtable *ht) schedule_work(&ht->run_work); return err; + +fail: + /* Do not fail the insert if someone else did a rehash. */ + if (likely(rcu_dereference_raw(tbl->future_tbl))) + return 0; + + /* Schedule async rehash to retry allocation in process context. */ + if (err == -ENOMEM) + schedule_work(&ht->run_work); + + return err; } EXPORT_SYMBOL_GPL(rhashtable_insert_rehash); -int rhashtable_insert_slow(struct rhashtable *ht, const void *key, - struct rhash_head *obj, - struct bucket_table *tbl) +struct bucket_table *rhashtable_insert_slow(struct rhashtable *ht, + const void *key, + struct rhash_head *obj, + struct bucket_table *tbl) { struct rhash_head *head; unsigned int hash; @@ -467,7 +477,12 @@ int rhashtable_insert_slow(struct rhashtable *ht, const void *key, exit: spin_unlock(rht_bucket_lock(tbl, hash)); - return err; + if (err == 0) + return NULL; + else if (err == -EAGAIN) + return tbl; + else + return ERR_PTR(err); } EXPORT_SYMBOL_GPL(rhashtable_insert_slow); -- cgit v1.2.3-59-g8ed1b From 326fcfa5acca446b3f71e99f6d19881145556e5c Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 5 Dec 2015 13:58:11 +0100 Subject: net: remove unnecessary semicolon in netdev_alloc_pcpu_stats() This semicolon causes a build error if the function call is wrapped in parentheses. Fixes: aabc92bbe3cf ("net: add __netdev_alloc_pcpu_stats() to indicate gfp flags") Reported-by: Imre Kaloz Signed-off-by: Felix Fietkau Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3b5d134e945a..3143c847bddb 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2084,7 +2084,7 @@ struct pcpu_sw_netstats { }) #define netdev_alloc_pcpu_stats(type) \ - __netdev_alloc_pcpu_stats(type, GFP_KERNEL); + __netdev_alloc_pcpu_stats(type, GFP_KERNEL) #include -- cgit v1.2.3-59-g8ed1b From ea013a9b205b47b1fcbc72522146fad560af0712 Mon Sep 17 00:00:00 2001 From: Andreas Werner Date: Fri, 4 Dec 2015 18:12:49 +0100 Subject: libata-eh.c: Introduce new ata port flag for controller which lockup on read log page Some controller lockup on a ata_read_log_page. Add new ata port flag ATA_FLAG_NO_LOG_PAGE which can used to blacklist a controller. If this flag is set, any attempt to read a log page returns an error without actually issuing the command. Signed-off-by: Andreas Werner Signed-off-by: Tejun Heo --- drivers/ata/libata-eh.c | 8 ++++++++ include/linux/libata.h | 1 + 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c index cb0508af1459..961acc788f44 100644 --- a/drivers/ata/libata-eh.c +++ b/drivers/ata/libata-eh.c @@ -1505,12 +1505,20 @@ static const char *ata_err_string(unsigned int err_mask) unsigned int ata_read_log_page(struct ata_device *dev, u8 log, u8 page, void *buf, unsigned int sectors) { + unsigned long ap_flags = dev->link->ap->flags; struct ata_taskfile tf; unsigned int err_mask; bool dma = false; DPRINTK("read log page - log 0x%x, page 0x%x\n", log, page); + /* + * Return error without actually issuing the command on controllers + * which e.g. lockup on a read log page. + */ + if (ap_flags & ATA_FLAG_NO_LOG_PAGE) + return AC_ERR_DEV; + retry: ata_tf_init(dev, &tf); if (dev->dma_mode && ata_id_has_read_log_dma_ext(dev->id) && diff --git a/include/linux/libata.h b/include/linux/libata.h index 83577f8fd15b..600c1e0626a5 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -210,6 +210,7 @@ enum { ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ /* (doesn't imply presence) */ ATA_FLAG_SATA = (1 << 1), + ATA_FLAG_NO_LOG_PAGE = (1 << 5), /* do not issue log page read */ ATA_FLAG_NO_ATAPI = (1 << 6), /* No ATAPI support */ ATA_FLAG_PIO_DMA = (1 << 7), /* PIO cmds via DMA */ ATA_FLAG_PIO_LBA48 = (1 << 8), /* Host DMA engine is LBA28 only */ -- cgit v1.2.3-59-g8ed1b From 57b4bd06ff0372fe1e3617889c4b37fbd500364a Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Sun, 6 Dec 2015 11:25:47 +0100 Subject: lightnvm: comments on constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is not obvious what NVM_IO_* and NVM_BLK_T_* are used for. Make sure to comment them appropriately as the other constants. Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- include/linux/lightnvm.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index c6916aec43b6..935ef3844c05 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -50,9 +50,16 @@ enum { NVM_IO_DUAL_ACCESS = 0x1, NVM_IO_QUAD_ACCESS = 0x2, + /* NAND Access Modes */ NVM_IO_SUSPEND = 0x80, NVM_IO_SLC_MODE = 0x100, NVM_IO_SCRAMBLE_DISABLE = 0x200, + + /* Block Types */ + NVM_BLK_T_FREE = 0x0, + NVM_BLK_T_BAD = 0x1, + NVM_BLK_T_DEV = 0x2, + NVM_BLK_T_HOST = 0x4, }; struct nvm_id_group { -- cgit v1.2.3-59-g8ed1b From 16f26c3aa9b9c36a9d1092ae3258461d1008481e Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Sun, 6 Dec 2015 11:25:48 +0100 Subject: lightnvm: replace req queue with nvmdev for lld MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case where a request queue is passed to the low lever lightnvm device drive integration, the device driver might pass its admin commands through another queue. Instead pass nvm_dev, and let the low level drive the appropriate queue. Reported-by: Christoph Hellwig Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 9 +++++---- drivers/lightnvm/core.c | 7 +++---- drivers/lightnvm/gennvm.c | 8 ++++---- drivers/lightnvm/rrpc.c | 2 +- drivers/nvme/host/lightnvm.c | 24 +++++++++++++----------- include/linux/lightnvm.h | 14 +++++++------- 6 files changed, 33 insertions(+), 31 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 0c3940ec5e62..7981b7407305 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -444,8 +444,9 @@ static void null_lnvm_end_io(struct request *rq, int error) blk_put_request(rq); } -static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct request *rq; struct bio *bio = rqd->bio; @@ -470,7 +471,7 @@ static int null_lnvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) return 0; } -static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) +static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) { sector_t size = gb * 1024 * 1024 * 1024ULL; sector_t blksize; @@ -523,7 +524,7 @@ static int null_lnvm_id(struct request_queue *q, struct nvm_id *id) return 0; } -static void *null_lnvm_create_dma_pool(struct request_queue *q, char *name) +static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name) { mempool_t *virtmem_pool; @@ -541,7 +542,7 @@ static void null_lnvm_destroy_dma_pool(void *pool) mempool_destroy(pool); } -static void *null_lnvm_dev_dma_alloc(struct request_queue *q, void *pool, +static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, gfp_t mem_flags, dma_addr_t *dma_handler) { return mempool_alloc(pool, mem_flags); diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 86ce887b2ed6..4a8d1fe34c4e 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -74,7 +74,7 @@ EXPORT_SYMBOL(nvm_unregister_target); void *nvm_dev_dma_alloc(struct nvm_dev *dev, gfp_t mem_flags, dma_addr_t *dma_handler) { - return dev->ops->dev_dma_alloc(dev->q, dev->ppalist_pool, mem_flags, + return dev->ops->dev_dma_alloc(dev, dev->ppalist_pool, mem_flags, dma_handler); } EXPORT_SYMBOL(nvm_dev_dma_alloc); @@ -246,7 +246,7 @@ static int nvm_init(struct nvm_dev *dev) if (!dev->q || !dev->ops) return ret; - if (dev->ops->identity(dev->q, &dev->identity)) { + if (dev->ops->identity(dev, &dev->identity)) { pr_err("nvm: device could not be identified\n"); goto err; } @@ -326,8 +326,7 @@ int nvm_register(struct request_queue *q, char *disk_name, } if (dev->ops->max_phys_sect > 1) { - dev->ppalist_pool = dev->ops->create_dma_pool(dev->q, - "ppalist"); + dev->ppalist_pool = dev->ops->create_dma_pool(dev, "ppalist"); if (!dev->ppalist_pool) { pr_err("nvm: could not create ppa pool\n"); ret = -ENOMEM; diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index ce6025487a5c..52b513a65946 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -195,7 +195,7 @@ static int gennvm_blocks_init(struct nvm_dev *dev, struct gen_nvm *gn) } if (dev->ops->get_l2p_tbl) { - ret = dev->ops->get_l2p_tbl(dev->q, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, gennvm_block_map, dev); if (ret) { pr_err("gennvm: could not read L2P table.\n"); @@ -346,7 +346,7 @@ static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) gennvm_generic_to_addr_mode(dev, rqd); rqd->dev = dev; - return dev->ops->submit_io(dev->q, rqd); + return dev->ops->submit_io(dev, rqd); } static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, @@ -382,7 +382,7 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) if (!dev->ops->set_bb_tbl) return; - if (dev->ops->set_bb_tbl(dev->q, rqd, 1)) + if (dev->ops->set_bb_tbl(dev, rqd, 1)) return; gennvm_addr_to_generic_mode(dev, rqd); @@ -450,7 +450,7 @@ static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, gennvm_generic_to_addr_mode(dev, &rqd); - ret = dev->ops->erase_block(dev->q, &rqd); + ret = dev->ops->erase_block(dev, &rqd); if (plane_cnt) nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list); diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index cf1a4a515b76..134e4faba482 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -1016,7 +1016,7 @@ static int rrpc_map_init(struct rrpc *rrpc) return 0; /* Bring up the mapping table from device */ - ret = dev->ops->get_l2p_tbl(dev->q, 0, dev->total_pages, + ret = dev->ops->get_l2p_tbl(dev, 0, dev->total_pages, rrpc_l2p_update, rrpc); if (ret) { pr_err("nvm: rrpc: could not read L2P table.\n"); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 762c9a7cbfa6..15f2acb4d5cd 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -271,9 +271,9 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) return 0; } -static int nvme_nvm_identity(struct request_queue *q, struct nvm_id *nvm_id) +static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; @@ -308,10 +308,10 @@ out: return ret; } -static int nvme_nvm_get_l2p_tbl(struct request_queue *q, u64 slba, u32 nlb, +static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; u32 len = queue_max_hw_sectors(dev->admin_q) << 9; @@ -415,10 +415,10 @@ out: return ret; } -static int nvme_nvm_set_bb_tbl(struct request_queue *q, struct nvm_rq *rqd, +static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, int type) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; int ret = 0; @@ -463,8 +463,9 @@ static void nvme_nvm_end_io(struct request *rq, int error) blk_mq_free_request(rq); } -static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct request *rq; struct bio *bio = rqd->bio; @@ -502,8 +503,9 @@ static int nvme_nvm_submit_io(struct request_queue *q, struct nvm_rq *rqd) return 0; } -static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) +static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) { + struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct nvme_nvm_command c = {}; @@ -515,9 +517,9 @@ static int nvme_nvm_erase_block(struct request_queue *q, struct nvm_rq *rqd) return nvme_submit_sync_cmd(q, (struct nvme_command *)&c, NULL, 0); } -static void *nvme_nvm_create_dma_pool(struct request_queue *q, char *name) +static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { - struct nvme_ns *ns = q->queuedata; + struct nvme_ns *ns = nvmdev->q->queuedata; struct nvme_dev *dev = ns->dev; return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); @@ -530,7 +532,7 @@ static void nvme_nvm_destroy_dma_pool(void *pool) dma_pool_destroy(dma_pool); } -static void *nvme_nvm_dev_dma_alloc(struct request_queue *q, void *pool, +static void *nvme_nvm_dev_dma_alloc(struct nvm_dev *dev, void *pool, gfp_t mem_flags, dma_addr_t *dma_handler) { return dma_pool_alloc(pool, mem_flags, dma_handler); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 935ef3844c05..034117b3be5f 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -183,17 +183,17 @@ struct nvm_block; typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); -typedef int (nvm_id_fn)(struct request_queue *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct request_queue *, u64, u32, +typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); +typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, nvm_l2p_update_fn *, void *); typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, nvm_bb_update_fn *, void *); -typedef int (nvm_op_set_bb_fn)(struct request_queue *, struct nvm_rq *, int); -typedef int (nvm_submit_io_fn)(struct request_queue *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct request_queue *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct request_queue *, char *); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct request_queue *, void *, gfp_t, +typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, dma_addr_t *); typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); -- cgit v1.2.3-59-g8ed1b From 4639d60d2bfb7f5007b5d93788fd93c19b63f000 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 7 Dec 2015 06:25:56 -0500 Subject: qed: Fix corner case for chain in-between pages The amount of chain next pointer elements between the producer and the consumer indices depends on which pages they currently point to. The current calculation is based only on their difference, and it can lead to a number of free elements which is higher by 1 than the actual value. Signed-off-by: Tomer Tayar Signed-off-by: Manish Chopra Signed-off-by: David S. Miller --- include/linux/qed/qed_chain.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/qed/qed_chain.h b/include/linux/qed/qed_chain.h index b920c3605c46..41b9049b57e2 100644 --- a/include/linux/qed/qed_chain.h +++ b/include/linux/qed/qed_chain.h @@ -111,7 +111,8 @@ static inline u16 qed_chain_get_elem_left(struct qed_chain *p_chain) used = ((u32)0x10000u + (u32)(p_chain->prod_idx)) - (u32)p_chain->cons_idx; if (p_chain->mode == QED_CHAIN_MODE_NEXT_PTR) - used -= (used / p_chain->elem_per_page); + used -= p_chain->prod_idx / p_chain->elem_per_page - + p_chain->cons_idx / p_chain->elem_per_page; return p_chain->capacity - used; } -- cgit v1.2.3-59-g8ed1b From 76a9a3642a0b72d5687d680150580d55b6ea9804 Mon Sep 17 00:00:00 2001 From: Tomer Tayar Date: Mon, 7 Dec 2015 06:25:57 -0500 Subject: qed: fix handling of concurrent ramrods. Concurrent non-blocking slowpath ramrods can be completed out-of-order on the completion chain. Recycling completed elements, while previously sent elements are still completion pending, can lead to overriding of active elements on the chain. Furthermore, sending pending slowpath ramrods currently lacks the update of the chain element physical pointer. This patch: * Ensures that ramrods are sent to the FW with consecutive echo values. * Handles out-of-order completions by freeing only first successive completed entries. * Updates the chain element physical pointer when copying a pending element into a free element for sending. Signed-off-by: Tomer Tayar Signed-off-by: Manish Chopra Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_sp.h | 8 +++- drivers/net/ethernet/qlogic/qed/qed_spq.c | 63 +++++++++++++++++++++++-------- include/linux/qed/common_hsi.h | 2 + 3 files changed, 56 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/qed/qed_sp.h b/drivers/net/ethernet/qlogic/qed/qed_sp.h index 31a1f1eb4f56..287fadfab52d 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_sp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_sp.h @@ -124,8 +124,12 @@ struct qed_spq { dma_addr_t p_phys; struct qed_spq_entry *p_virt; - /* Used as index for completions (returns on EQ by FW) */ - u16 echo_idx; +#define SPQ_RING_SIZE \ + (CORE_SPQE_PAGE_SIZE_BYTES / sizeof(struct slow_path_element)) + + /* Bitmap for handling out-of-order completions */ + DECLARE_BITMAP(p_comp_bitmap, SPQ_RING_SIZE); + u8 comp_bitmap_idx; /* Statistics */ u32 unlimited_pending_count; diff --git a/drivers/net/ethernet/qlogic/qed/qed_spq.c b/drivers/net/ethernet/qlogic/qed/qed_spq.c index 7c0b8459666e..3dd548ab8df1 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_spq.c +++ b/drivers/net/ethernet/qlogic/qed/qed_spq.c @@ -112,8 +112,6 @@ static int qed_spq_fill_entry(struct qed_hwfn *p_hwfn, struct qed_spq_entry *p_ent) { - p_ent->elem.hdr.echo = 0; - p_hwfn->p_spq->echo_idx++; p_ent->flags = 0; switch (p_ent->comp_mode) { @@ -195,10 +193,12 @@ static int qed_spq_hw_post(struct qed_hwfn *p_hwfn, struct qed_spq *p_spq, struct qed_spq_entry *p_ent) { - struct qed_chain *p_chain = &p_hwfn->p_spq->chain; + struct qed_chain *p_chain = &p_hwfn->p_spq->chain; + u16 echo = qed_chain_get_prod_idx(p_chain); struct slow_path_element *elem; struct core_db_data db; + p_ent->elem.hdr.echo = cpu_to_le16(echo); elem = qed_chain_produce(p_chain); if (!elem) { DP_NOTICE(p_hwfn, "Failed to produce from SPQ chain\n"); @@ -437,7 +437,9 @@ void qed_spq_setup(struct qed_hwfn *p_hwfn) p_spq->comp_count = 0; p_spq->comp_sent_count = 0; p_spq->unlimited_pending_count = 0; - p_spq->echo_idx = 0; + + bitmap_zero(p_spq->p_comp_bitmap, SPQ_RING_SIZE); + p_spq->comp_bitmap_idx = 0; /* SPQ cid, cannot fail */ qed_cxt_acquire_cid(p_hwfn, PROTOCOLID_CORE, &p_spq->cid); @@ -582,26 +584,32 @@ qed_spq_add_entry(struct qed_hwfn *p_hwfn, struct qed_spq *p_spq = p_hwfn->p_spq; if (p_ent->queue == &p_spq->unlimited_pending) { - struct qed_spq_entry *p_en2; if (list_empty(&p_spq->free_pool)) { list_add_tail(&p_ent->list, &p_spq->unlimited_pending); p_spq->unlimited_pending_count++; return 0; - } + } else { + struct qed_spq_entry *p_en2; - p_en2 = list_first_entry(&p_spq->free_pool, - struct qed_spq_entry, - list); - list_del(&p_en2->list); + p_en2 = list_first_entry(&p_spq->free_pool, + struct qed_spq_entry, + list); + list_del(&p_en2->list); + + /* Copy the ring element physical pointer to the new + * entry, since we are about to override the entire ring + * entry and don't want to lose the pointer. + */ + p_ent->elem.data_ptr = p_en2->elem.data_ptr; - /* Strcut assignment */ - *p_en2 = *p_ent; + *p_en2 = *p_ent; - kfree(p_ent); + kfree(p_ent); - p_ent = p_en2; + p_ent = p_en2; + } } /* entry is to be placed in 'pending' queue */ @@ -777,13 +785,38 @@ int qed_spq_completion(struct qed_hwfn *p_hwfn, list_for_each_entry_safe(p_ent, tmp, &p_spq->completion_pending, list) { if (p_ent->elem.hdr.echo == echo) { + u16 pos = le16_to_cpu(echo) % SPQ_RING_SIZE; + list_del(&p_ent->list); - qed_chain_return_produced(&p_spq->chain); + /* Avoid overriding of SPQ entries when getting + * out-of-order completions, by marking the completions + * in a bitmap and increasing the chain consumer only + * for the first successive completed entries. + */ + bitmap_set(p_spq->p_comp_bitmap, pos, SPQ_RING_SIZE); + + while (test_bit(p_spq->comp_bitmap_idx, + p_spq->p_comp_bitmap)) { + bitmap_clear(p_spq->p_comp_bitmap, + p_spq->comp_bitmap_idx, + SPQ_RING_SIZE); + p_spq->comp_bitmap_idx++; + qed_chain_return_produced(&p_spq->chain); + } + p_spq->comp_count++; found = p_ent; break; } + + /* This is relatively uncommon - depends on scenarios + * which have mutliple per-PF sent ramrods. + */ + DP_VERBOSE(p_hwfn, QED_MSG_SPQ, + "Got completion for echo %04x - doesn't match echo %04x in completion pending list\n", + le16_to_cpu(echo), + le16_to_cpu(p_ent->elem.hdr.echo)); } /* Release lock before callback, as callback may post diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 6a4347639c03..1d1ba2c5ee7a 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -9,6 +9,8 @@ #ifndef __COMMON_HSI__ #define __COMMON_HSI__ +#define CORE_SPQE_PAGE_SIZE_BYTES 4096 + #define FW_MAJOR_VERSION 8 #define FW_MINOR_VERSION 4 #define FW_REVISION_VERSION 2 -- cgit v1.2.3-59-g8ed1b From a5e14ba334e202c58e45ef47414ec94c585c1a8c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 28 Oct 2015 13:28:15 +0200 Subject: mlx4: Expose correct max_sge_rd limit mlx4 devices (ConnectX-2, ConnectX-3) has a limitation where rdma read work queue entries cannot exceed 512 bytes. A rdma_read wqe needs to fit in 512 bytes: - wqe control segment (16 bytes) - rdma segment (16 bytes) - scatter elements (16 bytes each) So max_sge_rd should be: (512 - 16 - 16) / 16 = 30. Signed-off-by: Sagi Grimberg Reviewed-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 2 +- include/linux/mlx4/device.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f567160a4a56..97d6878f9938 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -456,7 +456,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); - props->max_sge_rd = props->max_sge; + props->max_sge_rd = MLX4_MAX_SGE_RD; props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 7501626ab529..d3133be12d92 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -426,6 +426,17 @@ enum { MLX4_MAX_FAST_REG_PAGES = 511, }; +enum { + /* + * Max wqe size for rdma read is 512 bytes, so this + * limits our max_sge_rd as the wqe needs to fit: + * - ctrl segment (16 bytes) + * - rdma segment (16 bytes) + * - scatter elements (16 bytes each) + */ + MLX4_MAX_SGE_RD = (512 - 16 - 16) / 16 +}; + enum { MLX4_DEV_PMC_SUBTYPE_GUID_INFO = 0x14, MLX4_DEV_PMC_SUBTYPE_PORT_INFO = 0x15, -- cgit v1.2.3-59-g8ed1b From 4c3141e09cfa6460bfcd5e90f73e498db654c917 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Tue, 1 Dec 2015 17:24:17 +0100 Subject: of/irq: Export of_irq_find_parent again of_irq_find_parent was made static since it had no users outside of of_irq.c. Export it again since we are going to use it again. Signed-off-by: Carlo Caione [robh: move of_irq_find_parent to correct ifdef section] Signed-off-by: Rob Herring --- drivers/of/irq.c | 3 ++- include/linux/of_irq.h | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/of/irq.c b/drivers/of/irq.c index 902b89be7217..4fa916dffc91 100644 --- a/drivers/of/irq.c +++ b/drivers/of/irq.c @@ -53,7 +53,7 @@ EXPORT_SYMBOL_GPL(irq_of_parse_and_map); * Returns a pointer to the interrupt parent node, or NULL if the interrupt * parent could not be determined. */ -static struct device_node *of_irq_find_parent(struct device_node *child) +struct device_node *of_irq_find_parent(struct device_node *child) { struct device_node *p; const __be32 *parp; @@ -77,6 +77,7 @@ static struct device_node *of_irq_find_parent(struct device_node *child) return p; } +EXPORT_SYMBOL_GPL(of_irq_find_parent); /** * of_irq_parse_raw - Low level interrupt tree parsing diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 039f2eec49ce..f648acf27ed7 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -46,6 +46,7 @@ extern int of_irq_get(struct device_node *dev, int index); extern int of_irq_get_byname(struct device_node *dev, const char *name); extern int of_irq_to_resource_table(struct device_node *dev, struct resource *res, int nr_irqs); +extern struct device_node *of_irq_find_parent(struct device_node *child); extern struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token); @@ -70,6 +71,11 @@ static inline int of_irq_to_resource_table(struct device_node *dev, { return 0; } +static inline void *of_irq_find_parent(struct device_node *child) +{ + return NULL; +} + static inline struct irq_domain *of_msi_get_domain(struct device *dev, struct device_node *np, enum irq_domain_bus_token token) -- cgit v1.2.3-59-g8ed1b From eaddb5725357e9f05ffe5d271630f8197d089da4 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 9 Dec 2015 09:11:10 -0600 Subject: of/irq: move of_msi_map_rid declaration to the correct ifdef section In checking fixes for of_irq_find_parent declaration location, I found that of_msi_map_rid is also wrong. of_msi_map_rid is not implemented for Sparc, so it should not be in the Sparc specific section of the header. Move it to just depend on OF_IRQ. Cc: Frank Rowand Signed-off-by: Rob Herring --- include/linux/of_irq.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index f648acf27ed7..1e0deb8e8494 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -53,6 +53,7 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev, extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid); extern void of_msi_configure(struct device *dev, struct device_node *np); +u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else static inline int of_irq_count(struct device_node *dev) { @@ -90,6 +91,11 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev static inline void of_msi_configure(struct device *dev, struct device_node *np) { } +static inline u32 of_msi_map_rid(struct device *dev, + struct device_node *msi_np, u32 rid_in) +{ + return rid_in; +} #endif #if defined(CONFIG_OF_IRQ) || defined(CONFIG_SPARC) @@ -99,7 +105,6 @@ static inline void of_msi_configure(struct device *dev, struct device_node *np) * so declare it here regardless of the CONFIG_OF_IRQ setting. */ extern unsigned int irq_of_parse_and_map(struct device_node *node, int index); -u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in); #else /* !CONFIG_OF && !CONFIG_SPARC */ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, @@ -107,12 +112,6 @@ static inline unsigned int irq_of_parse_and_map(struct device_node *dev, { return 0; } - -static inline u32 of_msi_map_rid(struct device *dev, - struct device_node *msi_np, u32 rid_in) -{ - return rid_in; -} #endif /* !CONFIG_OF */ #endif /* __OF_IRQ_H */ -- cgit v1.2.3-59-g8ed1b From d7e35dfa2531b53618b9e6edcd8752ce988ac555 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Dec 2015 22:04:01 -0500 Subject: bitops.h: correctly handle rol32 with 0 byte shift ROL on a 32 bit integer with a shift of 32 or more is undefined and the result is arch-dependent. Avoid this by handling the trivial case of roling by 0 correctly. The trivial solution of checking if shift is 0 breaks gcc's detection of this code as a ROL instruction, which is unacceptable. This bug was reported and fixed in GCC (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57157): The standard rotate idiom, (x << n) | (x >> (32 - n)) is recognized by gcc (for concreteness, I discuss only the case that x is an uint32_t here). However, this is portable C only for n in the range 0 < n < 32. For n == 0, we get x >> 32 which gives undefined behaviour according to the C standard (6.5.7, Bitwise shift operators). To portably support n == 0, one has to write the rotate as something like (x << n) | (x >> ((-n) & 31)) And this is apparently not recognized by gcc. Note that this is broken on older GCCs and will result in slower ROL. Acked-by: Linus Torvalds Signed-off-by: Sasha Levin Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 2b8ed123ad36..defeaac0745f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -107,7 +107,7 @@ static inline __u64 ror64(__u64 word, unsigned int shift) */ static inline __u32 rol32(__u32 word, unsigned int shift) { - return (word << shift) | (word >> (32 - shift)); + return (word << shift) | (word >> ((-shift) & 31)); } /** -- cgit v1.2.3-59-g8ed1b From ecb7deceff2a51d3be50518969bc06411f485a62 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 9 Dec 2015 10:18:10 +0200 Subject: dmaengine: edma: DT: Change memcpy channel array from 16bit to 32bit type This change makes the DT file to be easier to read since the memcpy channels array does not need the '/bits/ 16' to be specified, which might confuse some people. Signed-off-by: Peter Ujfalusi Acked-by: Arnd Bergmann Acked-by: Rob Herring Acked-by: Tony Lindgren Signed-off-by: Vinod Koul --- Documentation/devicetree/bindings/dma/ti-edma.txt | 5 ++--- drivers/dma/edma.c | 22 ++++++++++------------ include/linux/platform_data/edma.h | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/dma/ti-edma.txt b/Documentation/devicetree/bindings/dma/ti-edma.txt index d3d0a4fb1c73..ae8b8f1d6e69 100644 --- a/Documentation/devicetree/bindings/dma/ti-edma.txt +++ b/Documentation/devicetree/bindings/dma/ti-edma.txt @@ -22,8 +22,7 @@ Required properties: Optional properties: - ti,hwmods: Name of the hwmods associated to the eDMA CC - ti,edma-memcpy-channels: List of channels allocated to be used for memcpy, iow - these channels will be SW triggered channels. The list must - contain 16 bits numbers, see example. + these channels will be SW triggered channels. See example. - ti,edma-reserved-slot-ranges: PaRAM slot ranges which should not be used by the driver, they are allocated to be used by for example the DSP. See example. @@ -56,7 +55,7 @@ edma: edma@49000000 { ti,tptcs = <&edma_tptc0 7>, <&edma_tptc1 7>, <&edma_tptc2 0>; /* Channel 20 and 21 is allocated for memcpy */ - ti,edma-memcpy-channels = /bits/ 16 <20 21>; + ti,edma-memcpy-channels = <20 21>; /* The following PaRAM slots are reserved: 35-45 and 100-110 */ ti,edma-reserved-slot-ranges = /bits/ 16 <35 10>, /bits/ 16 <100 10>; diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c index 6b03e4e84e6b..3da20291db56 100644 --- a/drivers/dma/edma.c +++ b/drivers/dma/edma.c @@ -1752,16 +1752,14 @@ static enum dma_status edma_tx_status(struct dma_chan *chan, return ret; } -static bool edma_is_memcpy_channel(int ch_num, u16 *memcpy_channels) +static bool edma_is_memcpy_channel(int ch_num, s32 *memcpy_channels) { - s16 *memcpy_ch = memcpy_channels; - if (!memcpy_channels) return false; - while (*memcpy_ch != -1) { - if (*memcpy_ch == ch_num) + while (*memcpy_channels != -1) { + if (*memcpy_channels == ch_num) return true; - memcpy_ch++; + memcpy_channels++; } return false; } @@ -1775,7 +1773,7 @@ static void edma_dma_init(struct edma_cc *ecc, bool legacy_mode) { struct dma_device *s_ddev = &ecc->dma_slave; struct dma_device *m_ddev = NULL; - s16 *memcpy_channels = ecc->info->memcpy_channels; + s32 *memcpy_channels = ecc->info->memcpy_channels; int i, j; dma_cap_zero(s_ddev->cap_mask); @@ -1996,16 +1994,16 @@ static struct edma_soc_info *edma_setup_info_from_dt(struct device *dev, prop = of_find_property(dev->of_node, "ti,edma-memcpy-channels", &sz); if (prop) { const char pname[] = "ti,edma-memcpy-channels"; - size_t nelm = sz / sizeof(s16); - s16 *memcpy_ch; + size_t nelm = sz / sizeof(s32); + s32 *memcpy_ch; - memcpy_ch = devm_kcalloc(dev, nelm + 1, sizeof(s16), + memcpy_ch = devm_kcalloc(dev, nelm + 1, sizeof(s32), GFP_KERNEL); if (!memcpy_ch) return ERR_PTR(-ENOMEM); - ret = of_property_read_u16_array(dev->of_node, pname, - (u16 *)memcpy_ch, nelm); + ret = of_property_read_u32_array(dev->of_node, pname, + (u32 *)memcpy_ch, nelm); if (ret) return ERR_PTR(ret); diff --git a/include/linux/platform_data/edma.h b/include/linux/platform_data/edma.h index e2878baeb90e..4299f4ba03bd 100644 --- a/include/linux/platform_data/edma.h +++ b/include/linux/platform_data/edma.h @@ -72,7 +72,7 @@ struct edma_soc_info { struct edma_rsv_info *rsv; /* List of channels allocated for memcpy, terminated with -1 */ - s16 *memcpy_channels; + s32 *memcpy_channels; s8 (*queue_priority_mapping)[2]; const s16 (*xbar_chans)[2]; -- cgit v1.2.3-59-g8ed1b From 633c9a840d0bf1cce690f3165bdacd8ab412949e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 9 Dec 2015 12:08:26 +0100 Subject: netfilter: nfnetlink: avoid recurrent netns lookups in call_batch Pass the net pointer to the call_batch callback functions so we can skip recurrent lookups. Signed-off-by: Pablo Neira Ayuso Tested-by: Arturo Borrero Gonzalez --- include/linux/netfilter/nfnetlink.h | 2 +- net/netfilter/nf_tables_api.c | 96 +++++++++++++++++-------------------- net/netfilter/nfnetlink.c | 2 +- 3 files changed, 47 insertions(+), 53 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 249d1bb01e03..5646b24bfc64 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -14,7 +14,7 @@ struct nfnl_callback { int (*call_rcu)(struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]); - int (*call_batch)(struct sock *nl, struct sk_buff *skb, + int (*call_batch)(struct net *net, struct sock *nl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const cda[]); const struct nla_policy *policy; /* netlink attribute policy */ diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 93cc4737018f..f1002dcfa1c9 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -89,6 +89,7 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload) } static void nft_ctx_init(struct nft_ctx *ctx, + struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, struct nft_af_info *afi, @@ -96,7 +97,7 @@ static void nft_ctx_init(struct nft_ctx *ctx, struct nft_chain *chain, const struct nlattr * const *nla) { - ctx->net = sock_net(skb->sk); + ctx->net = net; ctx->afi = afi; ctx->table = table; ctx->chain = chain; @@ -672,15 +673,14 @@ err: return ret; } -static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newtable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nlattr *name; struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; u32 flags = 0; struct nft_ctx ctx; @@ -706,7 +706,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); return nf_tables_updtable(&ctx); } @@ -730,7 +730,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb, INIT_LIST_HEAD(&table->sets); table->flags = flags; - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) goto err3; @@ -810,18 +810,17 @@ out: return err; } -static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_deltable(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; - nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla); if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL) return nft_flush(&ctx, family); @@ -1221,8 +1220,8 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) } } -static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -1232,7 +1231,6 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, struct nft_chain *chain; struct nft_base_chain *basechain = NULL; struct nlattr *ha[NFTA_HOOK_MAX + 1]; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct net_device *dev = NULL; u8 policy = NF_ACCEPT; @@ -1313,7 +1311,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(stats); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, sizeof(struct nft_trans_chain)); if (trans == NULL) { @@ -1461,7 +1459,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (err < 0) goto err1; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN); if (err < 0) goto err2; @@ -1476,15 +1474,14 @@ err1: return err; } -static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delchain(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; struct nft_chain *chain; - struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nft_ctx ctx; @@ -1506,7 +1503,7 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb, if (chain->use > 0) return -EBUSY; - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); return nft_delchain(&ctx); } @@ -2010,13 +2007,12 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx, static struct nft_expr_info *info; -static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; @@ -2075,7 +2071,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(old_rule); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); n = 0; size = 0; @@ -2176,13 +2172,12 @@ err1: return err; } -static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delrule(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_chain *chain = NULL; struct nft_rule *rule; @@ -2205,7 +2200,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb, return PTR_ERR(chain); } - nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla); if (chain) { if (nla[NFTA_RULE_HANDLE]) { @@ -2344,12 +2339,11 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_SIZE] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi = NULL; struct nft_table *table = NULL; @@ -2371,7 +2365,7 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, return -ENOENT; } - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -2623,6 +2617,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_ctx ctx; struct sk_buff *skb2; @@ -2630,7 +2625,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb, int err; /* Verify existence before starting dump */ - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -2693,14 +2688,13 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, return 0; } -static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); const struct nft_set_ops *ops; struct nft_af_info *afi; - struct net *net = sock_net(skb->sk); struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; @@ -2798,7 +2792,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb, if (IS_ERR(table)) return PTR_ERR(table); - nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) { @@ -2882,8 +2876,8 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set nft_set_destroy(set); } -static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delset(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); @@ -2896,7 +2890,7 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_TABLE] == NULL) return -EINVAL; - err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla); + err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla); if (err < 0) return err; @@ -3024,7 +3018,7 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + [NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 }, }; -static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, +static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -3033,7 +3027,6 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, const struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nft_af_info *afi; struct nft_table *table; - struct net *net = sock_net(skb->sk); afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false); if (IS_ERR(afi)) @@ -3045,7 +3038,7 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, if (!trans && (table->flags & NFT_TABLE_INACTIVE)) return -ENOENT; - nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla); + nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla); return 0; } @@ -3135,6 +3128,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_set_dump_args args; struct nft_ctx ctx; @@ -3150,8 +3144,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (err < 0) return err; - err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla, - false); + err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh, + (void *)nla, false); if (err < 0) return err; @@ -3212,11 +3206,12 @@ static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { + struct net *net = sock_net(skb->sk); const struct nft_set *set; struct nft_ctx ctx; int err; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false); if (err < 0) return err; @@ -3528,11 +3523,10 @@ err1: return err; } -static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { - struct net *net = sock_net(skb->sk); const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; @@ -3541,7 +3535,7 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, true); if (err < 0) return err; @@ -3623,8 +3617,8 @@ err1: return err; } -static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, - const struct nlmsghdr *nlh, +static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, + struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[]) { const struct nlattr *attr; @@ -3635,7 +3629,7 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; - err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false); + err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla, false); if (err < 0) return err; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 46453ab318db..445590f2c673 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -381,7 +381,7 @@ replay: goto ack; if (nc->call_batch) { - err = nc->call_batch(net->nfnl, skb, nlh, + err = nc->call_batch(net, net->nfnl, skb, nlh, (const struct nlattr **)cda); } -- cgit v1.2.3-59-g8ed1b From 059393c5bdd1420bdf1bed2972f33196dff263ae Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 7 Dec 2015 10:11:11 +0000 Subject: irqchip/gic-v3: Add missing struct device_node declaration When the GICv3 header file is used in a C file that doesn't include any of the OF stuff, we end up with a bunch of ugly warnings. Let's keep GCC quiet by adding a forward declaration. Signed-off-by: Marc Zyngier Cc: Cc: Jason Cooper Link: http://lkml.kernel.org/r/1449483072-17694-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Thomas Gleixner --- include/linux/irqchip/arm-gic-v3.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index c9ae0c6ec050..d5d798b35c1f 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -330,6 +330,7 @@ struct rdists { }; struct irq_domain; +struct device_node; int its_cpu_init(void); int its_init(struct device_node *node, struct rdists *rdists, struct irq_domain *domain); -- cgit v1.2.3-59-g8ed1b From ad87e03213b552a5c33d5e1e7a19a73768397010 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 10 Dec 2015 15:27:21 -0500 Subject: USB: add quirk for devices with broken LPM Some USB device / host controller combinations seem to have problems with Link Power Management. For example, Steinar found that his xHCI controller wouldn't handle bandwidth calculations correctly for two video cards simultaneously when LPM was enabled, even though the bus had plenty of bandwidth available. This patch introduces a new quirk flag for devices that should remain disabled for LPM, and creates quirk entries for Steinar's devices. Signed-off-by: Alan Stern Reported-by: Steinar H. Gunderson Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/hub.c | 7 ++++++- drivers/usb/core/quirks.c | 6 ++++++ include/linux/usb/quirks.h | 3 +++ 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 585c3cb07da6..a5cc032ef77a 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -124,6 +124,10 @@ struct usb_hub *usb_hub_to_struct_hub(struct usb_device *hdev) int usb_device_supports_lpm(struct usb_device *udev) { + /* Some devices have trouble with LPM */ + if (udev->quirks & USB_QUIRK_NO_LPM) + return 0; + /* USB 2.1 (and greater) devices indicate LPM support through * their USB 2.0 Extended Capabilities BOS descriptor. */ @@ -4512,6 +4516,8 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1, goto fail; } + usb_detect_quirks(udev); + if (udev->wusb == 0 && le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0201) { retval = usb_get_bos_descriptor(udev); if (!retval) { @@ -4710,7 +4716,6 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, if (status < 0) goto loop; - usb_detect_quirks(udev); if (udev->quirks & USB_QUIRK_DELAY_INIT) msleep(1000); diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index fcd6ac0c667f..6dc810bce295 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -202,6 +202,12 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, + /* Blackmagic Design Intensity Shuttle */ + { USB_DEVICE(0x1edb, 0xbd3b), .driver_info = USB_QUIRK_NO_LPM }, + + /* Blackmagic Design UltraStudio SDI */ + { USB_DEVICE(0x1edb, 0xbd4f), .driver_info = USB_QUIRK_NO_LPM }, + { } /* terminating entry must be last */ }; diff --git a/include/linux/usb/quirks.h b/include/linux/usb/quirks.h index 9948c874e3f1..1d0043dc34e4 100644 --- a/include/linux/usb/quirks.h +++ b/include/linux/usb/quirks.h @@ -47,4 +47,7 @@ /* device generates spurious wakeup, ignore remote wakeup capability */ #define USB_QUIRK_IGNORE_REMOTE_WAKEUP BIT(9) +/* device can't handle Link Power Management */ +#define USB_QUIRK_NO_LPM BIT(10) + #endif /* __LINUX_USB_QUIRKS_H */ -- cgit v1.2.3-59-g8ed1b From 98e89cf02aed11166698dd53c6f14865613babb3 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Fri, 11 Dec 2015 13:40:43 -0800 Subject: mm: kmemleak: mark kmemleak_init prototype as __init The kmemleak_init() definition in mm/kmemleak.c is marked __init but its prototype in include/linux/kmemleak.h is marked __ref since commit a6186d89c913 ("kmemleak: Mark the early log buffer as __initdata"). This causes a section mismatch which is reported as a warning when building with clang -Wsection, because kmemleak_init() is declared in section .ref.text but defined in .init.text. Fix this by marking kmemleak_init() prototype __init. Signed-off-by: Nicolas Iooss Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmemleak.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index d0a1f99e24e3..4894c6888bc6 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -25,7 +25,7 @@ #ifdef CONFIG_DEBUG_KMEMLEAK -extern void kmemleak_init(void) __ref; +extern void kmemleak_init(void) __init; extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) __ref; extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, -- cgit v1.2.3-59-g8ed1b From 86fffe4a61dd972d5a4e23260d530be6da02f614 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 11 Dec 2015 13:40:46 -0800 Subject: kernel: remove stop_machine() Kconfig dependency Currently the full stop_machine() routine is only enabled on SMP if module unloading is enabled, or if the CPUs are hotpluggable. This leads to configurations where stop_machine() is broken as it will then only run the callback on the local CPU with irqs disabled, and not stop the other CPUs or run the callback on them. For example, this breaks MTRR setup on x86 in certain configs since ea8596bb2d8d379 ("kprobes/x86: Remove unused text_poke_smp() and text_poke_smp_batch() functions") as the MTRR is only established on the boot CPU. This patch removes the Kconfig option for STOP_MACHINE and uses the SMP and HOTPLUG_CPU config options to compile the correct stop_machine() for the architecture, removing the false dependency on MODULE_UNLOAD in the process. Link: https://lkml.org/lkml/2014/10/8/124 References: https://bugs.freedesktop.org/show_bug.cgi?id=84794 Signed-off-by: Chris Wilson Acked-by: Ingo Molnar Cc: "Paul E. McKenney" Cc: Pranith Kumar Cc: Michal Hocko Cc: Vladimir Davydov Cc: Johannes Weiner Cc: H. Peter Anvin Cc: Tejun Heo Cc: Iulia Manda Cc: Andy Lutomirski Cc: Rusty Russell Cc: Peter Zijlstra Cc: Chuck Ebbert Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/stop_machine.h | 6 +++--- init/Kconfig | 7 ------- kernel/stop_machine.c | 4 ++-- 3 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 0adedca24c5b..0e1b1540597a 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -99,7 +99,7 @@ static inline int try_stop_cpus(const struct cpumask *cpumask, * grabbing every spinlock (and more). So the "read" side to such a * lock is anything which disables preemption. */ -#if defined(CONFIG_STOP_MACHINE) && defined(CONFIG_SMP) +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) /** * stop_machine: freeze the machine on all CPUs and run this function @@ -118,7 +118,7 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus); -#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ static inline int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) @@ -137,5 +137,5 @@ static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return stop_machine(fn, data, cpus); } -#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ #endif /* _LINUX_STOP_MACHINE */ diff --git a/init/Kconfig b/init/Kconfig index c24b6f767bf0..235c7a2c0d20 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2030,13 +2030,6 @@ config INIT_ALL_POSSIBLE it was better to provide this option than to break all the archs and have several arch maintainers pursuing me down dark alleys. -config STOP_MACHINE - bool - default y - depends on (SMP && MODULE_UNLOAD) || HOTPLUG_CPU - help - Need stop_machine() primitive. - source "block/Kconfig" config PREEMPT_NOTIFIERS diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20e1ef1..a3bbaee77c58 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -531,7 +531,7 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { @@ -631,4 +631,4 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return ret ?: done.ret; } -#endif /* CONFIG_STOP_MACHINE */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ -- cgit v1.2.3-59-g8ed1b From dfd01f026058a59a513f8a365b439a0681b803af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 13 Dec 2015 22:11:16 +0100 Subject: sched/wait: Fix the signal handling fix Jan Stancek reported that I wrecked things for him by fixing things for Vladimir :/ His report was due to an UNINTERRUPTIBLE wait getting -EINTR, which should not be possible, however my previous patch made this possible by unconditionally checking signal_pending(). We cannot use current->state as was done previously, because the instruction after the store to that variable it can be changed. We must instead pass the initial state along and use that. Fixes: 68985633bccb ("sched/wait: Fix signal handling in bit wait helpers") Reported-by: Jan Stancek Reported-by: Chris Mason Tested-by: Jan Stancek Tested-by: Vladimir Murzin Tested-by: Chris Mason Reviewed-by: Paul Turner Cc: Ingo Molnar Cc: tglx@linutronix.de Cc: Oleg Nesterov Cc: hpa@zytor.com Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds --- fs/cifs/inode.c | 6 +++--- fs/nfs/inode.c | 6 +++--- fs/nfs/internal.h | 2 +- fs/nfs/pagelist.c | 2 +- fs/nfs/pnfs.c | 4 ++-- include/linux/wait.h | 10 +++++----- kernel/sched/wait.c | 20 ++++++++++---------- net/sunrpc/sched.c | 6 +++--- 8 files changed, 28 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 6b66dd5d1540..a329f5ba35aa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1831,11 +1831,11 @@ cifs_invalidate_mapping(struct inode *inode) * @word: long word containing the bit lock */ static int -cifs_wait_bit_killable(struct wait_bit_key *key) +cifs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 31b0a52223a7..c7e8b87da5b2 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -75,11 +75,11 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks * @word: long word containing the bit lock */ -int nfs_wait_bit_killable(struct wait_bit_key *key) +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 56cfde26fb9c..9dea85f7f918 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -379,7 +379,7 @@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); -extern int nfs_wait_bit_killable(struct wait_bit_key *key); +extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); /* super.c */ extern const struct super_operations nfs_sops; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index fe3ddd20ff89..452a011ba0d8 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -129,7 +129,7 @@ __nfs_iocounter_wait(struct nfs_io_counter *c) set_bit(NFS_IO_INPROGRESS, &c->flags); if (atomic_read(&c->io_count) == 0) break; - ret = nfs_wait_bit_killable(&q.key); + ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE); } while (atomic_read(&c->io_count) != 0 && !ret); finish_wait(wq, &q.wait); return ret; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 5a8ae2125b50..bec0384499f7 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1466,11 +1466,11 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, } /* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ -static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key) +static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode) { if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) return 1; - return nfs_wait_bit_killable(key); + return nfs_wait_bit_killable(key, mode); } static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) diff --git a/include/linux/wait.h b/include/linux/wait.h index 1e1bf9f963a9..513b36f04dfd 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -145,7 +145,7 @@ __remove_wait_queue(wait_queue_head_t *head, wait_queue_t *old) list_del(&old->task_list); } -typedef int wait_bit_action_f(struct wait_bit_key *); +typedef int wait_bit_action_f(struct wait_bit_key *, int mode); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); @@ -960,10 +960,10 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); } while (0) -extern int bit_wait(struct wait_bit_key *); -extern int bit_wait_io(struct wait_bit_key *); -extern int bit_wait_timeout(struct wait_bit_key *); -extern int bit_wait_io_timeout(struct wait_bit_key *); +extern int bit_wait(struct wait_bit_key *, int); +extern int bit_wait_io(struct wait_bit_key *, int); +extern int bit_wait_timeout(struct wait_bit_key *, int); +extern int bit_wait_io_timeout(struct wait_bit_key *, int); /** * wait_on_bit - wait for a bit to be cleared diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f10bd873e684..f15d6b6a538a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,43 +581,43 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { io_schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f14f24ee9983..73ad57a59989 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -250,11 +250,11 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue) } EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); -static int rpc_wait_bit_killable(struct wait_bit_key *key) +static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (fatal_signal_pending(current)) - return -ERESTARTSYS; freezable_schedule_unsafe(); + if (signal_pending_state(mode, current)) + return -ERESTARTSYS; return 0; } -- cgit v1.2.3-59-g8ed1b