From 4726e8fa1dcad533362475ebf91f70d5b6b6292f Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Thu, 9 May 2013 11:41:04 -0400
Subject: security: clarify cap_inode_getsecctx description

Make it clear that cap_inode_getsecctx shouldn't return success without
filling in the context data.

Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: James Morris <james.l.morris@oracle.com>
---
 include/linux/security.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/security.h b/include/linux/security.h
index 4686491852a7..40560f41e3d5 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1392,7 +1392,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  * 	@ctxlen contains the length of @ctx.
  *
  * @inode_getsecctx:
- *	Returns a string containing all relevant security context information
+ *	On success, returns 0 and fills out @ctx and @ctxlen with the security
+ *	context for the given @inode.
  *
  * 	@inode we wish to get the security context of.
  *	@ctx is a pointer in which to place the allocated security context.
-- 
cgit v1.3-7-g2ca7


From 4f3549d72d1b5c90ecc7e673402f38f4486d22c2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 2 May 2013 22:15:29 +0200
Subject: Driver core: Add offline/online device operations

In some cases, graceful hot-removal of devices is not possible,
although in principle the devices in question support hotplug.
For example, that may happen for the last CPU in the system or
for memory modules holding kernel memory.

In those cases it is nice to be able to check if the given device
can be gracefully hot-removed before triggering a removal procedure
that cannot be aborted or reversed.  Unfortunately, however, the
kernel currently doesn't provide any support for that.

To address that deficiency, introduce support for offline and
online operations that can be performed on devices, respectively,
before a hot-removal and in case when it is necessary (or convenient)
to put a device back online after a successful offline (that has not
been followed by removal).  The idea is that the offline will fail
whenever the given device cannot be gracefully removed from the
system and it will not be allowed to use the device after a
successful offline (until a subsequent online) in analogy with the
existing CPU offline/online mechanism.

For now, the offline and online operations are introduced at the
bus type level, as that should be sufficient for the most urgent use
cases (CPUs and memory modules).  In the future, however, the
approach may be extended to cover some more complicated device
offline/online scenarios involving device drivers etc.

The lock_device_hotplug() and unlock_device_hotplug() functions are
introduced because subsequent patches need to put larger pieces of
code under device_hotplug_lock to prevent race conditions between
device offline and removal from happening.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
---
 Documentation/ABI/testing/sysfs-devices-online |  20 ++++
 drivers/base/core.c                            | 130 +++++++++++++++++++++++++
 include/linux/device.h                         |  21 ++++
 3 files changed, 171 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-online

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-devices-online b/Documentation/ABI/testing/sysfs-devices-online
new file mode 100644
index 000000000000..f990026c0740
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-online
@@ -0,0 +1,20 @@
+What:		/sys/devices/.../online
+Date:		April 2013
+Contact:	Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Description:
+		The /sys/devices/.../online attribute is only present for
+		devices whose bus types provide .online() and .offline()
+		callbacks.  The number read from it (0 or 1) reflects the value
+		of the device's 'offline' field.  If that number is 1 and '0'
+		(or 'n', or 'N') is written to this file, the device bus type's
+		.offline() callback is executed for the device and (if
+		successful) its 'offline' field is updated accordingly.  In
+		turn, if that number is 0 and '1' (or 'y', or 'Y') is written to
+		this file, the device bus type's .online() callback is executed
+		for the device and (if successful) its 'offline' field is
+		updated as appropriate.
+
+		After a successful execution of the bus type's .offline()
+		callback the device cannot be used for any purpose until either
+		it is removed (i.e. device_del() is called for it), or its bus
+		type's .online() is exeucted successfully.
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 016312437577..60c975686089 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -403,6 +403,36 @@ static ssize_t store_uevent(struct device *dev, struct device_attribute *attr,
 static struct device_attribute uevent_attr =
 	__ATTR(uevent, S_IRUGO | S_IWUSR, show_uevent, store_uevent);
 
+static ssize_t show_online(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	bool val;
+
+	lock_device_hotplug();
+	val = !dev->offline;
+	unlock_device_hotplug();
+	return sprintf(buf, "%u\n", val);
+}
+
+static ssize_t store_online(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	bool val;
+	int ret;
+
+	ret = strtobool(buf, &val);
+	if (ret < 0)
+		return ret;
+
+	lock_device_hotplug();
+	ret = val ? device_online(dev) : device_offline(dev);
+	unlock_device_hotplug();
+	return ret < 0 ? ret : count;
+}
+
+static struct device_attribute online_attr =
+	__ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online);
+
 static int device_add_attributes(struct device *dev,
 				 struct device_attribute *attrs)
 {
@@ -516,6 +546,12 @@ static int device_add_attrs(struct device *dev)
 	if (error)
 		goto err_remove_type_groups;
 
+	if (device_supports_offline(dev) && !dev->offline_disabled) {
+		error = device_create_file(dev, &online_attr);
+		if (error)
+			goto err_remove_type_groups;
+	}
+
 	return 0;
 
  err_remove_type_groups:
@@ -536,6 +572,7 @@ static void device_remove_attrs(struct device *dev)
 	struct class *class = dev->class;
 	const struct device_type *type = dev->type;
 
+	device_remove_file(dev, &online_attr);
 	device_remove_groups(dev, dev->groups);
 
 	if (type)
@@ -1431,6 +1468,99 @@ EXPORT_SYMBOL_GPL(put_device);
 EXPORT_SYMBOL_GPL(device_create_file);
 EXPORT_SYMBOL_GPL(device_remove_file);
 
+static DEFINE_MUTEX(device_hotplug_lock);
+
+void lock_device_hotplug(void)
+{
+	mutex_lock(&device_hotplug_lock);
+}
+
+void unlock_device_hotplug(void)
+{
+	mutex_unlock(&device_hotplug_lock);
+}
+
+static int device_check_offline(struct device *dev, void *not_used)
+{
+	int ret;
+
+	ret = device_for_each_child(dev, NULL, device_check_offline);
+	if (ret)
+		return ret;
+
+	return device_supports_offline(dev) && !dev->offline ? -EBUSY : 0;
+}
+
+/**
+ * device_offline - Prepare the device for hot-removal.
+ * @dev: Device to be put offline.
+ *
+ * Execute the device bus type's .offline() callback, if present, to prepare
+ * the device for a subsequent hot-removal.  If that succeeds, the device must
+ * not be used until either it is removed or its bus type's .online() callback
+ * is executed.
+ *
+ * Call under device_hotplug_lock.
+ */
+int device_offline(struct device *dev)
+{
+	int ret;
+
+	if (dev->offline_disabled)
+		return -EPERM;
+
+	ret = device_for_each_child(dev, NULL, device_check_offline);
+	if (ret)
+		return ret;
+
+	device_lock(dev);
+	if (device_supports_offline(dev)) {
+		if (dev->offline) {
+			ret = 1;
+		} else {
+			ret = dev->bus->offline(dev);
+			if (!ret) {
+				kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
+				dev->offline = true;
+			}
+		}
+	}
+	device_unlock(dev);
+
+	return ret;
+}
+
+/**
+ * device_online - Put the device back online after successful device_offline().
+ * @dev: Device to be put back online.
+ *
+ * If device_offline() has been successfully executed for @dev, but the device
+ * has not been removed subsequently, execute its bus type's .online() callback
+ * to indicate that the device can be used again.
+ *
+ * Call under device_hotplug_lock.
+ */
+int device_online(struct device *dev)
+{
+	int ret = 0;
+
+	device_lock(dev);
+	if (device_supports_offline(dev)) {
+		if (dev->offline) {
+			ret = dev->bus->online(dev);
+			if (!ret) {
+				kobject_uevent(&dev->kobj, KOBJ_ONLINE);
+				dev->offline = false;
+			}
+		} else {
+			ret = 1;
+		}
+	}
+	device_unlock(dev);
+
+	return ret;
+}
+
 struct root_device {
 	struct device dev;
 	struct module *owner;
diff --git a/include/linux/device.h b/include/linux/device.h
index c0a126125325..eeb33315514c 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -71,6 +71,10 @@ extern void bus_remove_file(struct bus_type *, struct bus_attribute *);
  *		the specific driver's probe to initial the matched device.
  * @remove:	Called when a device removed from this bus.
  * @shutdown:	Called at shut-down time to quiesce the device.
+ *
+ * @online:	Called to put the device back online (after offlining it).
+ * @offline:	Called to put the device offline for hot-removal. May fail.
+ *
  * @suspend:	Called when a device on this bus wants to go to sleep mode.
  * @resume:	Called to bring a device on this bus out of sleep mode.
  * @pm:		Power management operations of this bus, callback the specific
@@ -104,6 +108,9 @@ struct bus_type {
 	int (*remove)(struct device *dev);
 	void (*shutdown)(struct device *dev);
 
+	int (*online)(struct device *dev);
+	int (*offline)(struct device *dev);
+
 	int (*suspend)(struct device *dev, pm_message_t state);
 	int (*resume)(struct device *dev);
 
@@ -648,6 +655,8 @@ struct acpi_dev_node {
  * @release:	Callback to free the device after all references have
  * 		gone away. This should be set by the allocator of the
  * 		device (i.e. the bus driver that discovered the device).
+ * @offline_disabled: If set, the device is permanently online.
+ * @offline:	Set after successful invocation of bus type's .offline().
  *
  * At the lowest level, every device in a Linux system is represented by an
  * instance of struct device. The device structure contains the information
@@ -720,6 +729,9 @@ struct device {
 
 	void	(*release)(struct device *dev);
 	struct iommu_group	*iommu_group;
+
+	bool			offline_disabled:1;
+	bool			offline:1;
 };
 
 static inline struct device *kobj_to_dev(struct kobject *kobj)
@@ -856,6 +868,15 @@ extern const char *device_get_devnode(struct device *dev,
 extern void *dev_get_drvdata(const struct device *dev);
 extern int dev_set_drvdata(struct device *dev, void *data);
 
+static inline bool device_supports_offline(struct device *dev)
+{
+	return dev->bus && dev->bus->offline && dev->bus->online;
+}
+
+extern void lock_device_hotplug(void);
+extern void unlock_device_hotplug(void);
+extern int device_offline(struct device *dev);
+extern int device_online(struct device *dev);
 /*
  * Root device objects for grouping under /sys/devices
  */
-- 
cgit v1.3-7-g2ca7


From e2ff39400d81233374e780b133496a2296643d7d Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 8 May 2013 00:29:49 +0200
Subject: ACPI / memhotplug: Bind removable memory blocks to ACPI device nodes

During ACPI memory hotplug configuration bind memory blocks residing
in modules removable through the standard ACPI mechanism to struct
acpi_device objects associated with ACPI namespace objects
representing those modules.  Accordingly, unbind those memory blocks
from the struct acpi_device objects when the memory modules in
question are being removed.

When "offline" operation for devices representing memory blocks is
introduced, this will allow the ACPI core's device hot-remove code to
use it to carry out remove_memory() for those memory blocks and check
the results of that before it actually removes the modules holding
them from the system.

Since walk_memory_range() is used for accessing all memory blocks
corresponding to a given ACPI namespace object, it is exported from
memory_hotplug.c so that the code in acpi_memhotplug.c can use it.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
---
 drivers/acpi/acpi_memhotplug.c | 53 +++++++++++++++++++++++++++++++++++++++---
 include/linux/memory_hotplug.h |  2 ++
 mm/memory_hotplug.c            |  4 +++-
 3 files changed, 55 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 5e6301e94920..5590db12028e 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -28,6 +28,7 @@
  */
 
 #include <linux/acpi.h>
+#include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 
 #include "internal.h"
@@ -166,13 +167,50 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
 	return 0;
 }
 
+static unsigned long acpi_meminfo_start_pfn(struct acpi_memory_info *info)
+{
+	return PFN_DOWN(info->start_addr);
+}
+
+static unsigned long acpi_meminfo_end_pfn(struct acpi_memory_info *info)
+{
+	return PFN_UP(info->start_addr + info->length-1);
+}
+
+static int acpi_bind_memblk(struct memory_block *mem, void *arg)
+{
+	return acpi_bind_one(&mem->dev, (acpi_handle)arg);
+}
+
+static int acpi_bind_memory_blocks(struct acpi_memory_info *info,
+				   acpi_handle handle)
+{
+	return walk_memory_range(acpi_meminfo_start_pfn(info),
+				 acpi_meminfo_end_pfn(info), (void *)handle,
+				 acpi_bind_memblk);
+}
+
+static int acpi_unbind_memblk(struct memory_block *mem, void *arg)
+{
+	acpi_unbind_one(&mem->dev);
+	return 0;
+}
+
+static void acpi_unbind_memory_blocks(struct acpi_memory_info *info,
+				      acpi_handle handle)
+{
+	walk_memory_range(acpi_meminfo_start_pfn(info),
+			  acpi_meminfo_end_pfn(info), NULL, acpi_unbind_memblk);
+}
+
 static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 {
+	acpi_handle handle = mem_device->device->handle;
 	int result, num_enabled = 0;
 	struct acpi_memory_info *info;
 	int node;
 
-	node = acpi_get_node(mem_device->device->handle);
+	node = acpi_get_node(handle);
 	/*
 	 * Tell the VM there is more memory here...
 	 * Note: Assume that this function returns zero on success
@@ -203,6 +241,12 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 		if (result && result != -EEXIST)
 			continue;
 
+		result = acpi_bind_memory_blocks(info, handle);
+		if (result) {
+			acpi_unbind_memory_blocks(info, handle);
+			return -ENODEV;
+		}
+
 		info->enabled = 1;
 
 		/*
@@ -229,10 +273,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 
 static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
+	acpi_handle handle = mem_device->device->handle;
 	int result = 0, nid;
 	struct acpi_memory_info *info, *n;
 
-	nid = acpi_get_node(mem_device->device->handle);
+	nid = acpi_get_node(handle);
 
 	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
 		if (!info->enabled)
@@ -240,6 +285,8 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 
 		if (nid < 0)
 			nid = memory_add_physaddr_to_nid(info->start_addr);
+
+		acpi_unbind_memory_blocks(info, handle);
 		result = remove_memory(nid, info->start_addr, info->length);
 		if (result)
 			return result;
@@ -300,7 +347,7 @@ static int acpi_memory_device_add(struct acpi_device *device,
 	if (result) {
 		dev_err(&device->dev, "acpi_memory_enable_device() error\n");
 		acpi_memory_device_free(mem_device);
-		return -ENODEV;
+		return result;
 	}
 
 	dev_dbg(&device->dev, "Memory device configured by ACPI\n");
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 3e622c610925..2975b7b2a9d8 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -245,6 +245,8 @@ static inline int is_mem_section_removable(unsigned long pfn,
 static inline void try_offline_node(int nid) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
+extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+		void *arg, int (*func)(struct memory_block *, void *));
 extern int mem_online_node(int nid);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a221fac1f47d..5ea1287ee91f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1618,6 +1618,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
 }
+#endif /* CONFIG_MEMORY_HOTREMOVE */
 
 /**
  * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
@@ -1631,7 +1632,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
  *
  * Returns the return value of func.
  */
-static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
+int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 		void *arg, int (*func)(struct memory_block *, void *))
 {
 	struct memory_block *mem = NULL;
@@ -1668,6 +1669,7 @@ static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 	return 0;
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 /**
  * offline_memory_block_cb - callback function for offlining memory block
  * @mem: the memory block to be offlined
-- 
cgit v1.3-7-g2ca7


From 4960e05e22604ee270a023f968e0e4f9bd0c6fef Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 8 May 2013 14:18:37 +0200
Subject: Driver core: Introduce offline/online callbacks for memory blocks

Introduce .offline() and .online() callbacks for memory_subsys
that will allow the generic device_offline() and device_online()
to be used with device objects representing memory blocks.  That,
in turn, allows the ACPI subsystem to use device_offline() to put
removable memory blocks offline, if possible, before removing
memory modules holding them.

The 'online' sysfs attribute of memory block devices will attempt to
put them offline if 0 is written to it and will attempt to apply the
previously used online type when onlining them (i.e. when 1 is
written to it).

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Vasilis Liaskovitis <vasilis.liaskovitis@profitbricks.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
---
 drivers/base/memory.c  | 112 ++++++++++++++++++++++++++++++++++++++-----------
 include/linux/memory.h |   1 +
 2 files changed, 88 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 14f8a6954da0..c8f3b63fcacd 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -37,9 +37,14 @@ static inline int base_memory_block_id(int section_nr)
 	return section_nr / sections_per_block;
 }
 
+static int memory_subsys_online(struct device *dev);
+static int memory_subsys_offline(struct device *dev);
+
 static struct bus_type memory_subsys = {
 	.name = MEMORY_CLASS_NAME,
 	.dev_name = MEMORY_CLASS_NAME,
+	.online = memory_subsys_online,
+	.offline = memory_subsys_offline,
 };
 
 static BLOCKING_NOTIFIER_HEAD(memory_chain);
@@ -88,6 +93,7 @@ int register_memory(struct memory_block *memory)
 	memory->dev.bus = &memory_subsys;
 	memory->dev.id = memory->start_section_nr / sections_per_block;
 	memory->dev.release = memory_block_release;
+	memory->dev.offline = memory->state == MEM_OFFLINE;
 
 	error = device_register(&memory->dev);
 	return error;
@@ -278,33 +284,70 @@ static int __memory_block_change_state(struct memory_block *mem,
 {
 	int ret = 0;
 
-	if (mem->state != from_state_req) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (mem->state != from_state_req)
+		return -EINVAL;
 
 	if (to_state == MEM_OFFLINE)
 		mem->state = MEM_GOING_OFFLINE;
 
 	ret = memory_block_action(mem->start_section_nr, to_state, online_type);
-
 	if (ret) {
 		mem->state = from_state_req;
-		goto out;
+	} else {
+		mem->state = to_state;
+		if (to_state == MEM_ONLINE)
+			mem->last_online = online_type;
 	}
+	return ret;
+}
 
-	mem->state = to_state;
-	switch (mem->state) {
-	case MEM_OFFLINE:
-		kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
-		break;
-	case MEM_ONLINE:
-		kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
-		break;
-	default:
-		break;
+static int memory_subsys_online(struct device *dev)
+{
+	struct memory_block *mem = container_of(dev, struct memory_block, dev);
+	int ret;
+
+	mutex_lock(&mem->state_mutex);
+
+	ret = mem->state == MEM_ONLINE ? 0 :
+		__memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE,
+					    mem->last_online);
+
+	mutex_unlock(&mem->state_mutex);
+	return ret;
+}
+
+static int memory_subsys_offline(struct device *dev)
+{
+	struct memory_block *mem = container_of(dev, struct memory_block, dev);
+	int ret;
+
+	mutex_lock(&mem->state_mutex);
+
+	ret = mem->state == MEM_OFFLINE ? 0 :
+		__memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1);
+
+	mutex_unlock(&mem->state_mutex);
+	return ret;
+}
+
+static int __memory_block_change_state_uevent(struct memory_block *mem,
+		unsigned long to_state, unsigned long from_state_req,
+		int online_type)
+{
+	int ret = __memory_block_change_state(mem, to_state, from_state_req,
+					      online_type);
+	if (!ret) {
+		switch (mem->state) {
+		case MEM_OFFLINE:
+			kobject_uevent(&mem->dev.kobj, KOBJ_OFFLINE);
+			break;
+		case MEM_ONLINE:
+			kobject_uevent(&mem->dev.kobj, KOBJ_ONLINE);
+			break;
+		default:
+			break;
+		}
 	}
-out:
 	return ret;
 }
 
@@ -315,8 +358,8 @@ static int memory_block_change_state(struct memory_block *mem,
 	int ret;
 
 	mutex_lock(&mem->state_mutex);
-	ret = __memory_block_change_state(mem, to_state, from_state_req,
-					  online_type);
+	ret = __memory_block_change_state_uevent(mem, to_state, from_state_req,
+						 online_type);
 	mutex_unlock(&mem->state_mutex);
 
 	return ret;
@@ -326,22 +369,34 @@ store_mem_state(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct memory_block *mem;
+	bool offline;
 	int ret = -EINVAL;
 
 	mem = container_of(dev, struct memory_block, dev);
 
-	if (!strncmp(buf, "online_kernel", min_t(int, count, 13)))
+	lock_device_hotplug();
+
+	if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) {
+		offline = false;
 		ret = memory_block_change_state(mem, MEM_ONLINE,
 						MEM_OFFLINE, ONLINE_KERNEL);
-	else if (!strncmp(buf, "online_movable", min_t(int, count, 14)))
+	} else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) {
+		offline = false;
 		ret = memory_block_change_state(mem, MEM_ONLINE,
 						MEM_OFFLINE, ONLINE_MOVABLE);
-	else if (!strncmp(buf, "online", min_t(int, count, 6)))
+	} else if (!strncmp(buf, "online", min_t(int, count, 6))) {
+		offline = false;
 		ret = memory_block_change_state(mem, MEM_ONLINE,
 						MEM_OFFLINE, ONLINE_KEEP);
-	else if(!strncmp(buf, "offline", min_t(int, count, 7)))
+	} else if(!strncmp(buf, "offline", min_t(int, count, 7))) {
+		offline = true;
 		ret = memory_block_change_state(mem, MEM_OFFLINE,
 						MEM_ONLINE, -1);
+	}
+	if (!ret)
+		dev->offline = offline;
+
+	unlock_device_hotplug();
 
 	if (ret)
 		return ret;
@@ -563,6 +618,7 @@ static int init_memory_block(struct memory_block **memory,
 			base_memory_block_id(scn_nr) * sections_per_block;
 	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
 	mem->state = state;
+	mem->last_online = ONLINE_KEEP;
 	mem->section_count++;
 	mutex_init(&mem->state_mutex);
 	start_pfn = section_nr_to_pfn(mem->start_section_nr);
@@ -681,14 +737,20 @@ int unregister_memory_section(struct mem_section *section)
 
 /*
  * offline one memory block. If the memory block has been offlined, do nothing.
+ *
+ * Call under device_hotplug_lock.
  */
 int offline_memory_block(struct memory_block *mem)
 {
 	int ret = 0;
 
 	mutex_lock(&mem->state_mutex);
-	if (mem->state != MEM_OFFLINE)
-		ret = __memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE, -1);
+	if (mem->state != MEM_OFFLINE) {
+		ret = __memory_block_change_state_uevent(mem, MEM_OFFLINE,
+							 MEM_ONLINE, -1);
+		if (!ret)
+			mem->dev.offline = true;
+	}
 	mutex_unlock(&mem->state_mutex);
 
 	return ret;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 85c31a8e2904..3d5346583022 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -26,6 +26,7 @@ struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long end_section_nr;
 	unsigned long state;
+	int last_online;
 	int section_count;
 
 	/*
-- 
cgit v1.3-7-g2ca7


From 416ad3c9c0066405b83ec875b75496523549be09 Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Mon, 6 May 2013 23:50:06 +0000
Subject: freezer: add unsafe versions of freezable helpers for NFS

NFS calls the freezable helpers with locks held, which is unsafe
and will cause lockdep warnings when 6aa9707 "lockdep: check
that no locks held at freeze time" is reapplied (it was reverted
in dbf520a).  NFS shouldn't be doing this, but it has
long-running syscalls that must hold a lock but also shouldn't
block suspend.  Until NFS freeze handling is rewritten to use a
signal to exit out of the critical section, add new *_unsafe
versions of the helpers that will not run the lockdep test when
6aa9707 is reapplied, and call them from NFS.

In practice the likley result of holding the lock while freezing
is that a second task blocked on the lock will never freeze,
aborting suspend, but it is possible to manufacture a case using
the cgroup freezer, the lock, and the suspend freezer to create
a deadlock.  Silencing the lockdep warning here will allow
problems to be found in other drivers that may have a more
serious deadlock risk, and prevent new problems from being added.

Signed-off-by: Colin Cross <ccross@android.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 fs/nfs/inode.c          |  2 +-
 fs/nfs/nfs3proc.c       |  2 +-
 fs/nfs/nfs4proc.c       |  4 ++--
 include/linux/freezer.h | 42 +++++++++++++++++++++++++++++++++++++++++-
 net/sunrpc/sched.c      |  2 +-
 5 files changed, 46 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c1c7a9d78722..ce727047ee87 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -79,7 +79,7 @@ int nfs_wait_bit_killable(void *word)
 {
 	if (fatal_signal_pending(current))
 		return -ERESTARTSYS;
-	freezable_schedule();
+	freezable_schedule_unsafe();
 	return 0;
 }
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 43ea96ced28c..ce90eb4775c2 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -33,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EJUKEBOX)
 			break;
-		freezable_schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
+		freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
 	} while (!fatal_signal_pending(current));
 	return res;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8fbc10054115..9b18af167815 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -268,7 +268,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 		*timeout = NFS4_POLL_RETRY_MIN;
 	if (*timeout > NFS4_POLL_RETRY_MAX)
 		*timeout = NFS4_POLL_RETRY_MAX;
-	freezable_schedule_timeout_killable(*timeout);
+	freezable_schedule_timeout_killable_unsafe(*timeout);
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
 	*timeout <<= 1;
@@ -4528,7 +4528,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-	freezable_schedule_timeout_killable(timeout);
+	freezable_schedule_timeout_killable_unsafe(timeout);
 	timeout <<= 1;
 	if (timeout > NFS4_LOCK_MAXTIMEOUT)
 		return NFS4_LOCK_MAXTIMEOUT;
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index e70df40d84f6..5b31e21c485f 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -46,7 +46,11 @@ extern int freeze_kernel_threads(void);
 extern void thaw_processes(void);
 extern void thaw_kernel_threads(void);
 
-static inline bool try_to_freeze(void)
+/*
+ * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION
+ * If try_to_freeze causes a lockdep warning it means the caller may deadlock
+ */
+static inline bool try_to_freeze_unsafe(void)
 {
 	might_sleep();
 	if (likely(!freezing(current)))
@@ -54,6 +58,11 @@ static inline bool try_to_freeze(void)
 	return __refrigerator(false);
 }
 
+static inline bool try_to_freeze(void)
+{
+	return try_to_freeze_unsafe();
+}
+
 extern bool freeze_task(struct task_struct *p);
 extern bool set_freezable(void);
 
@@ -115,6 +124,14 @@ static inline void freezer_count(void)
 	try_to_freeze();
 }
 
+/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
+static inline void freezer_count_unsafe(void)
+{
+	current->flags &= ~PF_FREEZER_SKIP;
+	smp_mb();
+	try_to_freeze_unsafe();
+}
+
 /**
  * freezer_should_skip - whether to skip a task when determining frozen
  *			 state is reached
@@ -152,6 +169,14 @@ static inline bool freezer_should_skip(struct task_struct *p)
 	freezer_count();						\
 })
 
+/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
+#define freezable_schedule_unsafe()					\
+({									\
+	freezer_do_not_count();						\
+	schedule();							\
+	freezer_count_unsafe();						\
+})
+
 /* Like schedule_timeout_killable(), but should not block the freezer. */
 #define freezable_schedule_timeout_killable(timeout)			\
 ({									\
@@ -162,6 +187,16 @@ static inline bool freezer_should_skip(struct task_struct *p)
 	__retval;							\
 })
 
+/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
+#define freezable_schedule_timeout_killable_unsafe(timeout)		\
+({									\
+	long __retval;							\
+	freezer_do_not_count();						\
+	__retval = schedule_timeout_killable(timeout);			\
+	freezer_count_unsafe();						\
+	__retval;							\
+})
+
 /*
  * Freezer-friendly wrappers around wait_event_interruptible(),
  * wait_event_killable() and wait_event_interruptible_timeout(), originally
@@ -225,9 +260,14 @@ static inline void set_freezable(void) {}
 
 #define freezable_schedule()  schedule()
 
+#define freezable_schedule_unsafe()  schedule()
+
 #define freezable_schedule_timeout_killable(timeout)			\
 	schedule_timeout_killable(timeout)
 
+#define freezable_schedule_timeout_killable_unsafe(timeout)		\
+	schedule_timeout_killable(timeout)
+
 #define wait_event_freezable(wq, condition)				\
 		wait_event_interruptible(wq, condition)
 
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index f8529fc8e542..8dcfadcef5d3 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -254,7 +254,7 @@ static int rpc_wait_bit_killable(void *word)
 {
 	if (fatal_signal_pending(current))
 		return -ERESTARTSYS;
-	freezable_schedule();
+	freezable_schedule_unsafe();
 	return 0;
 }
 
-- 
cgit v1.3-7-g2ca7


From 5853cc2a89f726e21d51ca0fd75757a03126a84b Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Tue, 7 May 2013 17:52:05 +0000
Subject: freezer: add unsafe versions of freezable helpers for CIFS

CIFS calls wait_event_freezekillable_unsafe with a VFS lock held,
which is unsafe and will cause lockdep warnings when 6aa9707
"lockdep: check that no locks held at freeze time" is reapplied
(it was reverted in dbf520a).  CIFS shouldn't be doing this, but
it has long-running syscalls that must hold a lock but also
shouldn't block suspend.  Until CIFS freeze handling is rewritten
to use a signal to exit out of the critical section, add a new
wait_event_freezekillable_unsafe helper that will not run the
lockdep test when 6aa9707 is reapplied, and call it from CIFS.

In practice the likley result of holding the lock while freezing
is that a second task blocked on the lock will never freeze,
aborting suspend, but it is possible to manufacture a case using
the cgroup freezer, the lock, and the suspend freezer to create
a deadlock.  Silencing the lockdep warning here will allow
problems to be found in other drivers that may have a more
serious deadlock risk, and prevent new problems from being added.

Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 fs/cifs/transport.c     |  2 +-
 include/linux/freezer.h | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index bfbf4700d160..b70aa7c91394 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -447,7 +447,7 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ)
 {
 	int error;
 
-	error = wait_event_freezekillable(server->response_q,
+	error = wait_event_freezekillable_unsafe(server->response_q,
 				    midQ->mid_state != MID_REQUEST_SUBMITTED);
 	if (error < 0)
 		return -ERESTARTSYS;
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 5b31e21c485f..d3c038ec9a88 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -212,6 +212,16 @@ static inline bool freezer_should_skip(struct task_struct *p)
 	__retval;							\
 })
 
+/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
+#define wait_event_freezekillable_unsafe(wq, condition)			\
+({									\
+	int __retval;							\
+	freezer_do_not_count();						\
+	__retval = wait_event_killable(wq, (condition));		\
+	freezer_count_unsafe();						\
+	__retval;							\
+})
+
 #define wait_event_freezable(wq, condition)				\
 ({									\
 	int __retval;							\
@@ -277,6 +287,9 @@ static inline void set_freezable(void) {}
 #define wait_event_freezekillable(wq, condition)		\
 		wait_event_killable(wq, condition)
 
+#define wait_event_freezekillable_unsafe(wq, condition)			\
+		wait_event_killable(wq, condition)
+
 #endif /* !CONFIG_FREEZER */
 
 #endif	/* FREEZER_H_INCLUDED */
-- 
cgit v1.3-7-g2ca7


From 1b1d2fb4444231f25ddabc598aa2b5a9c0833fba Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Mon, 6 May 2013 23:50:08 +0000
Subject: lockdep: remove task argument from debug_check_no_locks_held

The only existing caller to debug_check_no_locks_held calls it
with 'current' as the task, and the freezer needs to call
debug_check_no_locks_held but doesn't already have a current
task pointer, so remove the argument.  It is already assuming
that the current task is relevant by dumping the current stack
trace as part of the warning.

This was originally part of 6aa9707099c (lockdep: check that
no locks held at freeze time) which was reverted in
dbf520a9d7d4.

Original-author: Mandeep Singh Baines <msb@chromium.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/debug_locks.h |  4 ++--
 kernel/exit.c               |  2 +-
 kernel/lockdep.c            | 17 ++++++++---------
 3 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 21ca773f77bf..822c1354f3a6 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -51,7 +51,7 @@ struct task_struct;
 extern void debug_show_all_locks(void);
 extern void debug_show_held_locks(struct task_struct *task);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
-extern void debug_check_no_locks_held(struct task_struct *task);
+extern void debug_check_no_locks_held(void);
 #else
 static inline void debug_show_all_locks(void)
 {
@@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len)
 }
 
 static inline void
-debug_check_no_locks_held(struct task_struct *task)
+debug_check_no_locks_held(void)
 {
 }
 #endif
diff --git a/kernel/exit.c b/kernel/exit.c
index af2eb3cbd499..e59756275000 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -835,7 +835,7 @@ void do_exit(long code)
 	/*
 	 * Make sure we are holding no locks:
 	 */
-	debug_check_no_locks_held(tsk);
+	debug_check_no_locks_held();
 	/*
 	 * We can do this unlocked here. The futex code uses this flag
 	 * just to verify whether the pi state cleanup has been done
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 1f3186b37fd5..e16c45b9ee77 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4090,7 +4090,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
 }
 EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
 
-static void print_held_locks_bug(struct task_struct *curr)
+static void print_held_locks_bug(void)
 {
 	if (!debug_locks_off())
 		return;
@@ -4099,22 +4099,21 @@ static void print_held_locks_bug(struct task_struct *curr)
 
 	printk("\n");
 	printk("=====================================\n");
-	printk("[ BUG: lock held at task exit time! ]\n");
+	printk("[ BUG: %s/%d still has locks held! ]\n",
+	       current->comm, task_pid_nr(current));
 	print_kernel_ident();
 	printk("-------------------------------------\n");
-	printk("%s/%d is exiting with locks still held!\n",
-		curr->comm, task_pid_nr(curr));
-	lockdep_print_held_locks(curr);
-
+	lockdep_print_held_locks(current);
 	printk("\nstack backtrace:\n");
 	dump_stack();
 }
 
-void debug_check_no_locks_held(struct task_struct *task)
+void debug_check_no_locks_held(void)
 {
-	if (unlikely(task->lockdep_depth > 0))
-		print_held_locks_bug(task);
+	if (unlikely(current->lockdep_depth > 0))
+		print_held_locks_bug();
 }
+EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
 
 void debug_show_all_locks(void)
 {
-- 
cgit v1.3-7-g2ca7


From 0f9548ca10916dec166eaf74c816bded7d8e611d Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@chromium.org>
Date: Mon, 6 May 2013 23:50:09 +0000
Subject: lockdep: check that no locks held at freeze time

We shouldn't try_to_freeze if locks are held.  Holding a lock can cause a
deadlock if the lock is later acquired in the suspend or hibernate path
(e.g.  by dpm).  Holding a lock can also cause a deadlock in the case of
cgroup_freezer if a lock is held inside a frozen cgroup that is later
acquired by a process outside that group.

History:
This patch was originally applied as 6aa9707099c and reverted in
dbf520a9d7d4 because NFS was freezing with locks held.  It was
deemed better to keep the bad freeze point in NFS to allow laptops
to suspend consistently.  The previous patch in this series converts
NFS to call _unsafe versions of the freezable helpers so that
lockdep doesn't complain about them until a more correct fix
can be applied.

[akpm@linux-foundation.org: export debug_check_no_locks_held]
Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/freezer.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index d3c038ec9a88..bcf9e651cc85 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -3,6 +3,7 @@
 #ifndef FREEZER_H_INCLUDED
 #define FREEZER_H_INCLUDED
 
+#include <linux/debug_locks.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/atomic.h>
@@ -60,6 +61,8 @@ static inline bool try_to_freeze_unsafe(void)
 
 static inline bool try_to_freeze(void)
 {
+	if (!(current->flags & PF_NOFREEZE))
+		debug_check_no_locks_held();
 	return try_to_freeze_unsafe();
 }
 
-- 
cgit v1.3-7-g2ca7


From b01235861b84c0f6107d3f9da189c9898fc3caaf Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Mon, 6 May 2013 23:50:12 +0000
Subject: freezer: convert freezable helpers to freezer_do_not_count()

Freezing tasks will wake up almost every userspace task from
where it is blocking and force it to run until it hits a
call to try_to_sleep(), generally on the exit path from the syscall
it is blocking in.  On resume each task will run again, usually
restarting the syscall and running until it hits the same
blocking call as it was originally blocked in.

Convert the existing wait_event_freezable* wrappers to use
freezer_do_not_count().  Combined with a previous patch,
these tasks will not run during suspend or resume unless they wake
up for another reason, in which case they will run until they hit
the try_to_freeze() in freezer_count(), and then continue processing
the wakeup after tasks are thawed.

This results in a small change in behavior, previously a race
between freezing and a normal wakeup would be won by the wakeup,
now the task will freeze and then handle the wakeup after thawing.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/freezer.h | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index bcf9e651cc85..c71337af9bbd 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -228,27 +228,19 @@ static inline bool freezer_should_skip(struct task_struct *p)
 #define wait_event_freezable(wq, condition)				\
 ({									\
 	int __retval;							\
-	for (;;) {							\
-		__retval = wait_event_interruptible(wq, 		\
-				(condition) || freezing(current));	\
-		if (__retval || (condition))				\
-			break;						\
-		try_to_freeze();					\
-	}								\
+	freezer_do_not_count();						\
+	__retval = wait_event_interruptible(wq, (condition));		\
+	freezer_count();						\
 	__retval;							\
 })
 
 #define wait_event_freezable_timeout(wq, condition, timeout)		\
 ({									\
 	long __retval = timeout;					\
-	for (;;) {							\
-		__retval = wait_event_interruptible_timeout(wq,		\
-				(condition) || freezing(current),	\
-				__retval); 				\
-		if (__retval <= 0 || (condition))			\
-			break;						\
-		try_to_freeze();					\
-	}								\
+	freezer_do_not_count();						\
+	__retval = wait_event_interruptible_timeout(wq,	(condition),	\
+				__retval);				\
+	freezer_count();						\
 	__retval;							\
 })
 
-- 
cgit v1.3-7-g2ca7


From 8ee492d6595573a0d4be168ebda1c7ceb4ec509d Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Mon, 6 May 2013 23:50:13 +0000
Subject: freezer: convert freezable helpers to static inline where possible

Some of the freezable helpers have to be macros because their
condition argument needs to get evaluated every time through
the wait loop.  Convert the others to static inline to make
future changes easier.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/freezer.h | 58 ++++++++++++++++++++++++-------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index c71337af9bbd..8430d4c51ece 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -159,46 +159,46 @@ static inline bool freezer_should_skip(struct task_struct *p)
 }
 
 /*
- * These macros are intended to be used whenever you want allow a sleeping
+ * These functions are intended to be used whenever you want allow a sleeping
  * task to be frozen. Note that neither return any clear indication of
  * whether a freeze event happened while in this function.
  */
 
 /* Like schedule(), but should not block the freezer. */
-#define freezable_schedule()						\
-({									\
-	freezer_do_not_count();						\
-	schedule();							\
-	freezer_count();						\
-})
+static inline void freezable_schedule(void)
+{
+	freezer_do_not_count();
+	schedule();
+	freezer_count();
+}
 
 /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-#define freezable_schedule_unsafe()					\
-({									\
-	freezer_do_not_count();						\
-	schedule();							\
-	freezer_count_unsafe();						\
-})
+static inline void freezable_schedule_unsafe(void)
+{
+	freezer_do_not_count();
+	schedule();
+	freezer_count_unsafe();
+}
 
 /* Like schedule_timeout_killable(), but should not block the freezer. */
-#define freezable_schedule_timeout_killable(timeout)			\
-({									\
-	long __retval;							\
-	freezer_do_not_count();						\
-	__retval = schedule_timeout_killable(timeout);			\
-	freezer_count();						\
-	__retval;							\
-})
+static inline long freezable_schedule_timeout_killable(long timeout)
+{
+	long __retval;
+	freezer_do_not_count();
+	__retval = schedule_timeout_killable(timeout);
+	freezer_count();
+	return __retval;
+}
 
 /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-#define freezable_schedule_timeout_killable_unsafe(timeout)		\
-({									\
-	long __retval;							\
-	freezer_do_not_count();						\
-	__retval = schedule_timeout_killable(timeout);			\
-	freezer_count_unsafe();						\
-	__retval;							\
-})
+static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
+{
+	long __retval;
+	freezer_do_not_count();
+	__retval = schedule_timeout_killable(timeout);
+	freezer_count_unsafe();
+	return __retval;
+}
 
 /*
  * Freezer-friendly wrappers around wait_event_interruptible(),
-- 
cgit v1.3-7-g2ca7


From dd5ec0f4e72bed3d0e589e21fdf46eedafc106b7 Mon Sep 17 00:00:00 2001
From: Colin Cross <ccross@android.com>
Date: Mon, 6 May 2013 23:50:14 +0000
Subject: freezer: add new freezable helpers using freezer_do_not_count()

Freezing tasks will wake up almost every userspace task from
where it is blocking and force it to run until it hits a
call to try_to_sleep(), generally on the exit path from the syscall
it is blocking in.  On resume each task will run again, usually
restarting the syscall and running until it hits the same
blocking call as it was originally blocked in.

To allow tasks to avoid running on every suspend/resume cycle,
this patch adds additional freezable wrappers around blocking calls
that call freezer_do_not_count().  Combined with the previous patch,
these tasks will not run during suspend or resume unless they wake
up for another reason, in which case they will run until they hit
the try_to_freeze() in freezer_count(), and then continue processing
the wakeup after tasks are thawed.

Additional patches will convert the most common locations that
userspace blocks in to use freezable helpers.

Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Colin Cross <ccross@android.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/freezer.h | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 8430d4c51ece..7fd81b8c4897 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -180,6 +180,32 @@ static inline void freezable_schedule_unsafe(void)
 	freezer_count_unsafe();
 }
 
+/*
+ * Like freezable_schedule_timeout(), but should not block the freezer.  Do not
+ * call this with locks held.
+ */
+static inline long freezable_schedule_timeout(long timeout)
+{
+	long __retval;
+	freezer_do_not_count();
+	__retval = schedule_timeout(timeout);
+	freezer_count();
+	return __retval;
+}
+
+/*
+ * Like schedule_timeout_interruptible(), but should not block the freezer.  Do not
+ * call this with locks held.
+ */
+static inline long freezable_schedule_timeout_interruptible(long timeout)
+{
+	long __retval;
+	freezer_do_not_count();
+	__retval = schedule_timeout_interruptible(timeout);
+	freezer_count();
+	return __retval;
+}
+
 /* Like schedule_timeout_killable(), but should not block the freezer. */
 static inline long freezable_schedule_timeout_killable(long timeout)
 {
@@ -200,6 +226,20 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
 	return __retval;
 }
 
+/*
+ * Like schedule_hrtimeout_range(), but should not block the freezer.  Do not
+ * call this with locks held.
+ */
+static inline int freezable_schedule_hrtimeout_range(ktime_t *expires,
+		unsigned long delta, const enum hrtimer_mode mode)
+{
+	int __retval;
+	freezer_do_not_count();
+	__retval = schedule_hrtimeout_range(expires, delta, mode);
+	freezer_count();
+	return __retval;
+}
+
 /*
  * Freezer-friendly wrappers around wait_event_interruptible(),
  * wait_event_killable() and wait_event_interruptible_timeout(), originally
@@ -244,6 +284,16 @@ static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
 	__retval;							\
 })
 
+#define wait_event_freezable_exclusive(wq, condition)			\
+({									\
+	int __retval;							\
+	freezer_do_not_count();						\
+	__retval = wait_event_interruptible_exclusive(wq, condition);	\
+	freezer_count();						\
+	__retval;							\
+})
+
+
 #else /* !CONFIG_FREEZER */
 static inline bool frozen(struct task_struct *p) { return false; }
 static inline bool freezing(struct task_struct *p) { return false; }
@@ -267,18 +317,29 @@ static inline void set_freezable(void) {}
 
 #define freezable_schedule_unsafe()  schedule()
 
+#define freezable_schedule_timeout(timeout)  schedule_timeout(timeout)
+
+#define freezable_schedule_timeout_interruptible(timeout)		\
+	schedule_timeout_interruptible(timeout)
+
 #define freezable_schedule_timeout_killable(timeout)			\
 	schedule_timeout_killable(timeout)
 
 #define freezable_schedule_timeout_killable_unsafe(timeout)		\
 	schedule_timeout_killable(timeout)
 
+#define freezable_schedule_hrtimeout_range(expires, delta, mode)	\
+	schedule_hrtimeout_range(expires, delta, mode)
+
 #define wait_event_freezable(wq, condition)				\
 		wait_event_interruptible(wq, condition)
 
 #define wait_event_freezable_timeout(wq, condition, timeout)		\
 		wait_event_interruptible_timeout(wq, condition, timeout)
 
+#define wait_event_freezable_exclusive(wq, condition)			\
+		wait_event_interruptible_exclusive(wq, condition)
+
 #define wait_event_freezekillable(wq, condition)		\
 		wait_event_killable(wq, condition)
 
-- 
cgit v1.3-7-g2ca7


From 697e85bc6a9aa44ecd73392586fe9cfd7e0467ba Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 8 May 2013 13:55:22 +0100
Subject: regmap: Add support for discarding parts of the register cache

Allow drivers to discard parts of the register cache, for example if part
of the hardware has been reset.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/base/regmap/internal.h |  1 +
 drivers/base/regmap/regcache.c | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h         |  9 +++++++++
 include/trace/events/regmap.h  | 23 +++++++++++++++++++++++
 4 files changed, 70 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index c130536e0ab0..b33a4ff67adf 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -148,6 +148,7 @@ struct regcache_ops {
 	int (*read)(struct regmap *map, unsigned int reg, unsigned int *value);
 	int (*write)(struct regmap *map, unsigned int reg, unsigned int value);
 	int (*sync)(struct regmap *map, unsigned int min, unsigned int max);
+	int (*drop)(struct regmap *map, unsigned int min, unsigned int max);
 };
 
 bool regmap_writeable(struct regmap *map, unsigned int reg);
diff --git a/drivers/base/regmap/regcache.c b/drivers/base/regmap/regcache.c
index 75923f2396bd..8a0ab5fa75f5 100644
--- a/drivers/base/regmap/regcache.c
+++ b/drivers/base/regmap/regcache.c
@@ -358,6 +358,43 @@ out:
 }
 EXPORT_SYMBOL_GPL(regcache_sync_region);
 
+/**
+ * regcache_drop_region: Discard part of the register cache
+ *
+ * @map: map to operate on
+ * @min: first register to discard
+ * @max: last register to discard
+ *
+ * Discard part of the register cache.
+ *
+ * Return a negative value on failure, 0 on success.
+ */
+int regcache_drop_region(struct regmap *map, unsigned int min,
+			 unsigned int max)
+{
+	unsigned int reg;
+	int ret = 0;
+
+	if (!map->cache_present && !(map->cache_ops && map->cache_ops->drop))
+		return -EINVAL;
+
+	map->lock(map);
+
+	trace_regcache_drop_region(map->dev, min, max);
+
+	if (map->cache_present)
+		for (reg = min; reg < max + 1; reg++)
+			clear_bit(reg, map->cache_present);
+
+	if (map->cache_ops && map->cache_ops->drop)
+		ret = map->cache_ops->drop(map, min, max);
+
+	map->unlock(map);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regcache_drop_region);
+
 /**
  * regcache_cache_only: Put a register map into cache only mode
  *
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 02d84e24b7c2..5067ee94eb92 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -394,6 +394,8 @@ bool regmap_can_raw_write(struct regmap *map);
 int regcache_sync(struct regmap *map);
 int regcache_sync_region(struct regmap *map, unsigned int min,
 			 unsigned int max);
+int regcache_drop_region(struct regmap *map, unsigned int min,
+			 unsigned int max);
 void regcache_cache_only(struct regmap *map, bool enable);
 void regcache_cache_bypass(struct regmap *map, bool enable);
 void regcache_mark_dirty(struct regmap *map);
@@ -562,6 +564,13 @@ static inline int regcache_sync_region(struct regmap *map, unsigned int min,
 	return -EINVAL;
 }
 
+static inline int regcache_drop_region(struct regmap *map, unsigned int min,
+				       unsigned int max)
+{
+	WARN_ONCE(1, "regmap API is disabled");
+	return -EINVAL;
+}
+
 static inline void regcache_cache_only(struct regmap *map, bool enable)
 {
 	WARN_ONCE(1, "regmap API is disabled");
diff --git a/include/trace/events/regmap.h b/include/trace/events/regmap.h
index a43a2f67bd8e..23d561512f64 100644
--- a/include/trace/events/regmap.h
+++ b/include/trace/events/regmap.h
@@ -223,6 +223,29 @@ DEFINE_EVENT(regmap_async, regmap_async_complete_done,
 
 );
 
+TRACE_EVENT(regcache_drop_region,
+
+	TP_PROTO(struct device *dev, unsigned int from,
+		 unsigned int to),
+
+	TP_ARGS(dev, from, to),
+
+	TP_STRUCT__entry(
+		__string(       name,           dev_name(dev)   )
+		__field(	unsigned int,	from		)
+		__field(	unsigned int,	to		)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, dev_name(dev));
+		__entry->from = from;
+		__entry->to = to;
+	),
+
+	TP_printk("%s %u-%u", __get_str(name), (unsigned int)__entry->from,
+		  (unsigned int)__entry->to)
+);
+
 #endif /* _TRACE_REGMAP_H */
 
 /* This part must be outside protection */
-- 
cgit v1.3-7-g2ca7


From 154881e59b8dbf84121e3e78c4e613e840752aa9 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 8 May 2013 13:55:23 +0100
Subject: regmap: Make regmap_check_range_table() a public API

Allow drivers to use an access table as part of their implementation.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/base/regmap/regmap.c | 14 +++++++-------
 include/linux/regmap.h       |  3 +++
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index a941dcfe7590..307f5a1c1fe8 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -65,9 +65,8 @@ bool regmap_reg_in_ranges(unsigned int reg,
 }
 EXPORT_SYMBOL_GPL(regmap_reg_in_ranges);
 
-static bool _regmap_check_range_table(struct regmap *map,
-				      unsigned int reg,
-				      const struct regmap_access_table *table)
+bool regmap_check_range_table(struct regmap *map, unsigned int reg,
+			      const struct regmap_access_table *table)
 {
 	/* Check "no ranges" first */
 	if (regmap_reg_in_ranges(reg, table->no_ranges, table->n_no_ranges))
@@ -80,6 +79,7 @@ static bool _regmap_check_range_table(struct regmap *map,
 	return regmap_reg_in_ranges(reg, table->yes_ranges,
 				    table->n_yes_ranges);
 }
+EXPORT_SYMBOL_GPL(regmap_check_range_table);
 
 bool regmap_writeable(struct regmap *map, unsigned int reg)
 {
@@ -90,7 +90,7 @@ bool regmap_writeable(struct regmap *map, unsigned int reg)
 		return map->writeable_reg(map->dev, reg);
 
 	if (map->wr_table)
-		return _regmap_check_range_table(map, reg, map->wr_table);
+		return regmap_check_range_table(map, reg, map->wr_table);
 
 	return true;
 }
@@ -107,7 +107,7 @@ bool regmap_readable(struct regmap *map, unsigned int reg)
 		return map->readable_reg(map->dev, reg);
 
 	if (map->rd_table)
-		return _regmap_check_range_table(map, reg, map->rd_table);
+		return regmap_check_range_table(map, reg, map->rd_table);
 
 	return true;
 }
@@ -121,7 +121,7 @@ bool regmap_volatile(struct regmap *map, unsigned int reg)
 		return map->volatile_reg(map->dev, reg);
 
 	if (map->volatile_table)
-		return _regmap_check_range_table(map, reg, map->volatile_table);
+		return regmap_check_range_table(map, reg, map->volatile_table);
 
 	return true;
 }
@@ -135,7 +135,7 @@ bool regmap_precious(struct regmap *map, unsigned int reg)
 		return map->precious_reg(map->dev, reg);
 
 	if (map->precious_table)
-		return _regmap_check_range_table(map, reg, map->precious_table);
+		return regmap_check_range_table(map, reg, map->precious_table);
 
 	return false;
 }
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 5067ee94eb92..d6f3221e29d4 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -400,6 +400,9 @@ void regcache_cache_only(struct regmap *map, bool enable);
 void regcache_cache_bypass(struct regmap *map, bool enable);
 void regcache_mark_dirty(struct regmap *map);
 
+bool regmap_check_range_table(struct regmap *map, unsigned int reg,
+			      const struct regmap_access_table *table);
+
 int regmap_register_patch(struct regmap *map, const struct reg_default *regs,
 			  int num_regs);
 
-- 
cgit v1.3-7-g2ca7


From cee22a15052faa817e3ec8985a28154d3fabc7aa Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 8 Apr 2013 16:45:40 +0530
Subject: workqueues: Introduce new flag WQ_POWER_EFFICIENT for power oriented
 workqueues

Workqueues can be performance or power-oriented. Currently, most workqueues are
bound to the CPU they were created on. This gives good performance (due to cache
effects) at the cost of potentially waking up otherwise idle cores (Idle from
scheduler's perspective. Which may or may not be physically idle) just to
process some work. To save power, we can allow the work to be rescheduled on a
core that is already awake.

Workqueues created with the WQ_UNBOUND flag will allow some power savings.
However, we don't change the default behaviour of the system.  To enable
power-saving behaviour, a new config option CONFIG_WQ_POWER_EFFICIENT needs to
be turned on. This option can also be overridden by the
workqueue.power_efficient boot parameter.

tj: Updated config description and comments.  Renamed
    CONFIG_WQ_POWER_EFFICIENT to CONFIG_WQ_POWER_EFFICIENT_DEFAULT.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/kernel-parameters.txt | 15 +++++++++++++++
 include/linux/workqueue.h           | 27 +++++++++++++++++++++++++++
 kernel/power/Kconfig                | 20 ++++++++++++++++++++
 kernel/workqueue.c                  | 13 +++++++++++++
 4 files changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c3bfacb92910..37dfd720de79 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3320,6 +3320,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			that this also can be controlled per-workqueue for
 			workqueues visible under /sys/bus/workqueue/.
 
+	workqueue.power_efficient
+			Per-cpu workqueues are generally preferred because
+			they show better performance thanks to cache
+			locality; unfortunately, per-cpu workqueues tend to
+			be more power hungry than unbound workqueues.
+
+			Enabling this makes the per-cpu workqueues which
+			were observed to contribute significantly to power
+			consumption unbound, leading to measurably lower
+			power usage at the cost of small performance
+			overhead.
+
+			The default value of this parameter is determined by
+			the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
+
 	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
 			default x2apic cluster mode on platforms
 			supporting x2apic.
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 623488fdc1f5..fc0136b604f2 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -303,6 +303,33 @@ enum {
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu instensive workqueue */
 	WQ_SYSFS		= 1 << 6, /* visible in sysfs, see wq_sysfs_register() */
 
+	/*
+	 * Per-cpu workqueues are generally preferred because they tend to
+	 * show better performance thanks to cache locality.  Per-cpu
+	 * workqueues exclude the scheduler from choosing the CPU to
+	 * execute the worker threads, which has an unfortunate side effect
+	 * of increasing power consumption.
+	 *
+	 * The scheduler considers a CPU idle if it doesn't have any task
+	 * to execute and tries to keep idle cores idle to conserve power;
+	 * however, for example, a per-cpu work item scheduled from an
+	 * interrupt handler on an idle CPU will force the scheduler to
+	 * excute the work item on that CPU breaking the idleness, which in
+	 * turn may lead to more scheduling choices which are sub-optimal
+	 * in terms of power consumption.
+	 *
+	 * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
+	 * but become unbound if workqueue.power_efficient kernel param is
+	 * specified.  Per-cpu workqueues which are identified to
+	 * contribute significantly to power-consumption are identified and
+	 * marked with this flag and enabling the power_efficient mode
+	 * leads to noticeable power saving at the cost of small
+	 * performance disadvantage.
+	 *
+	 * http://thread.gmane.org/gmane.linux.kernel/1480396
+	 */
+	WQ_POWER_EFFICIENT	= 1 << 7,
+
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
 
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180b..46455961a88f 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -263,6 +263,26 @@ config PM_GENERIC_DOMAINS
 	bool
 	depends on PM
 
+config WQ_POWER_EFFICIENT_DEFAULT
+	bool "Enable workqueue power-efficient mode by default"
+	depends on PM
+	default n
+	help
+	  Per-cpu workqueues are generally preferred because they show
+	  better performance thanks to cache locality; unfortunately,
+	  per-cpu workqueues tend to be more power hungry than unbound
+	  workqueues.
+
+	  Enabling workqueue.power_efficient kernel parameter makes the
+	  per-cpu workqueues which were observed to contribute
+	  significantly to power consumption unbound, leading to measurably
+	  lower power usage at the cost of small performance overhead.
+
+	  This config option determines whether workqueue.power_efficient
+	  is enabled by default.
+
+	  If in doubt, say N.
+
 config PM_GENERIC_DOMAINS_SLEEP
 	def_bool y
 	depends on PM_SLEEP && PM_GENERIC_DOMAINS
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4aa9f5bc6b2d..8068d97ce141 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
 static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
+/* see the comment above the definition of WQ_POWER_EFFICIENT */
+#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
+static bool wq_power_efficient = true;
+#else
+static bool wq_power_efficient;
+#endif
+
+module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+
 static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */
 
 /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -4085,6 +4094,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 	struct workqueue_struct *wq;
 	struct pool_workqueue *pwq;
 
+	/* see the comment above the definition of WQ_POWER_EFFICIENT */
+	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
+		flags |= WQ_UNBOUND;
+
 	/* allocate wq and format name */
 	if (flags & WQ_UNBOUND)
 		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
-- 
cgit v1.3-7-g2ca7


From 0668106ca3865ba945e155097fb042bf66d364d3 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 24 Apr 2013 17:12:54 +0530
Subject: workqueue: Add system wide power_efficient workqueues

This patch adds system wide workqueues aligned towards power saving. This is
done by allocating them with WQ_UNBOUND flag if 'wq_power_efficient' is set to
'true'.

tj: updated comments a bit.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/workqueue.h |  8 ++++++++
 kernel/workqueue.c        | 13 ++++++++++++-
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index fc0136b604f2..a9f4119c7e2e 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -360,11 +360,19 @@ enum {
  *
  * system_freezable_wq is equivalent to system_wq except that it's
  * freezable.
+ *
+ * *_power_efficient_wq are inclined towards saving power and converted
+ * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
+ * they are same as their non-power-efficient counterparts - e.g.
+ * system_power_efficient_wq is identical to system_wq if
+ * 'wq_power_efficient' is disabled.  See WQ_POWER_EFFICIENT for more info.
  */
 extern struct workqueue_struct *system_wq;
 extern struct workqueue_struct *system_long_wq;
 extern struct workqueue_struct *system_unbound_wq;
 extern struct workqueue_struct *system_freezable_wq;
+extern struct workqueue_struct *system_power_efficient_wq;
+extern struct workqueue_struct *system_freezable_power_efficient_wq;
 
 static inline struct workqueue_struct * __deprecated __system_nrt_wq(void)
 {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8068d97ce141..16ca2d3dd29f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -314,6 +314,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
 struct workqueue_struct *system_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
+struct workqueue_struct *system_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_power_efficient_wq);
+struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
 
 static int worker_thread(void *__worker);
 static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -4987,8 +4991,15 @@ static int __init init_workqueues(void)
 					    WQ_UNBOUND_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
 					      WQ_FREEZABLE, 0);
+	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+					      WQ_POWER_EFFICIENT, 0);
+	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+					      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+					      0);
 	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
-	       !system_unbound_wq || !system_freezable_wq);
+	       !system_unbound_wq || !system_freezable_wq ||
+	       !system_power_efficient_wq ||
+	       !system_freezable_power_efficient_wq);
 	return 0;
 }
 early_initcall(init_workqueues);
-- 
cgit v1.3-7-g2ca7


From 857a2beb09ab83e9a8185821ae16db7dfbe8b837 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 14 Apr 2013 20:50:08 -0700
Subject: cgroup: implement task_cgroup_path_from_hierarchy()

kdbus folks want a sane way to determine the cgroup path that a given
task belongs to on a given hierarchy, which is a reasonble thing to
expect from cgroup core.

Implement task_cgroup_path_from_hierarchy().

v2: Dropped unnecessary NULL check on the return value of
    task_cgroup_from_root() as suggested by Li Zefan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Greg Kroah-Hartman <greg@kroah.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Kay Sievers <kay@vrfy.org>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <daniel@zonque.org>
---
 include/linux/cgroup.h |  2 ++
 kernel/cgroup.c        | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5047355b9a0f..383c630f36f9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -542,6 +542,8 @@ int cgroup_is_removed(const struct cgroup *cgrp);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
+int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
+				    char *buf, size_t buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dcb417c6c242..6b2b1d945df2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1827,6 +1827,38 @@ out:
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/**
+ * task_cgroup_path_from_hierarchy - cgroup path of a task on a hierarchy
+ * @task: target task
+ * @hierarchy_id: the hierarchy to look up @task's cgroup from
+ * @buf: the buffer to write the path into
+ * @buflen: the length of the buffer
+ *
+ * Determine @task's cgroup on the hierarchy specified by @hierarchy_id and
+ * copy its path into @buf.  This function grabs cgroup_mutex and shouldn't
+ * be used inside locks used by cgroup controller callbacks.
+ */
+int task_cgroup_path_from_hierarchy(struct task_struct *task, int hierarchy_id,
+				    char *buf, size_t buflen)
+{
+	struct cgroupfs_root *root;
+	struct cgroup *cgrp = NULL;
+	int ret = -ENOENT;
+
+	mutex_lock(&cgroup_mutex);
+
+	root = idr_find(&cgroup_hierarchy_idr, hierarchy_id);
+	if (root) {
+		cgrp = task_cgroup_from_root(task, root);
+		ret = cgroup_path(cgrp, buf, buflen);
+	}
+
+	mutex_unlock(&cgroup_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy);
+
 /*
  * Control Group taskset
  */
-- 
cgit v1.3-7-g2ca7


From e628dc999e43a9dd51fb6bd810772c277f934484 Mon Sep 17 00:00:00 2001
From: David Milburn <dmilburn@redhat.com>
Date: Tue, 14 May 2013 13:48:40 -0500
Subject: libata: export ata_port port_no attribute via /sys

While registering host controller track port number based upon number
of ports available on the controller, export port_no attribute through
/sys. This patch is needed by udev for composing persistent links in
/dev/disk/by-path.

/sys/devices/pci0000:00/0000:00:1f.2/ata8/ata_port/ata8
total 0
lrwxrwxrwx. 1 root root    0 Mar  6 12:43 device -> ../../../ata8
-r--r--r--. 1 root root 4096 Mar  6 12:43 idle_irq
-r--r--r--. 1 root root 4096 Mar  6 12:43 nr_pmp_links
-r--r--r--. 1 root root 4096 Mar  6 12:43 port_no
drwxr-xr-x. 2 root root    0 Mar  6 12:42 power
lrwxrwxrwx. 1 root root    0 Mar  6 12:41 subsystem -> ../../../../../../class/ata_port
-rw-r--r--. 1 root root 4096 Mar  6 12:40 uevent
1

Signed-off-by: David Milburn <dmilburn@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libata-core.c      | 6 ++++--
 drivers/ata/libata-transport.c | 4 +++-
 include/linux/libata.h         | 1 +
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 63c743baf920..5f7d5f9ee820 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5636,6 +5636,7 @@ struct ata_port *ata_port_alloc(struct ata_host *host)
 	ap->pflags |= ATA_PFLAG_INITIALIZING | ATA_PFLAG_FROZEN;
 	ap->lock = &host->lock;
 	ap->print_id = -1;
+	ap->local_port_no = -1;
 	ap->host = host;
 	ap->dev = host->dev;
 
@@ -6126,9 +6127,10 @@ int ata_host_register(struct ata_host *host, struct scsi_host_template *sht)
 		kfree(host->ports[i]);
 
 	/* give ports names and add SCSI hosts */
-	for (i = 0; i < host->n_ports; i++)
+	for (i = 0; i < host->n_ports; i++) {
 		host->ports[i]->print_id = atomic_inc_return(&ata_print_id);
-
+		host->ports[i]->local_port_no = i + 1;
+	}
 
 	/* Create associated sysfs transport objects  */
 	for (i = 0; i < host->n_ports; i++) {
diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c
index c04d393d20c1..077a856f5fd0 100644
--- a/drivers/ata/libata-transport.c
+++ b/drivers/ata/libata-transport.c
@@ -37,7 +37,7 @@
 #include "libata.h"
 #include "libata-transport.h"
 
-#define ATA_PORT_ATTRS		2
+#define ATA_PORT_ATTRS		3
 #define ATA_LINK_ATTRS		3
 #define ATA_DEV_ATTRS		9
 
@@ -216,6 +216,7 @@ static DEVICE_ATTR(name, S_IRUGO, show_ata_port_##name, NULL)
 
 ata_port_simple_attr(nr_pmp_links, nr_pmp_links, "%d\n", int);
 ata_port_simple_attr(stats.idle_irq, idle_irq, "%ld\n", unsigned long);
+ata_port_simple_attr(local_port_no, port_no, "%u\n", unsigned int);
 
 static DECLARE_TRANSPORT_CLASS(ata_port_class,
 			       "ata_port", NULL, NULL, NULL);
@@ -709,6 +710,7 @@ struct scsi_transport_template *ata_attach_transport(void)
 	count = 0;
 	SETUP_PORT_ATTRIBUTE(nr_pmp_links);
 	SETUP_PORT_ATTRIBUTE(idle_irq);
+	SETUP_PORT_ATTRIBUTE(port_no);
 	BUG_ON(count > ATA_PORT_ATTRS);
 	i->port_attrs[count] = NULL;
 
diff --git a/include/linux/libata.h b/include/linux/libata.h
index eae7a053dc51..47e029236f6e 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -746,6 +746,7 @@ struct ata_port {
 	/* Flags that change dynamically, protected by ap->lock */
 	unsigned int		pflags; /* ATA_PFLAG_xxx */
 	unsigned int		print_id; /* user visible unique port ID */
+	unsigned int            local_port_no; /* host local port num */
 	unsigned int		port_no; /* 0 based port no. inside the host */
 
 #ifdef CONFIG_ATA_SFF
-- 
cgit v1.3-7-g2ca7


From 23958e729e7029678e746bf8f4094c8863a79c3d Mon Sep 17 00:00:00 2001
From: Greg KH <gregkh@linuxfoundation.org>
Date: Fri, 3 May 2013 16:26:59 -0700
Subject: cgroup.h: remove some functions that are now gone

cgroup_lock() and cgroup_unlock() are now no longer exported, so fix
cgroup.h to not declare them if CONFIG_CGROUPS is not enabled.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 383c630f36f9..4f6f5138c340 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -840,8 +840,6 @@ static inline void cgroup_fork(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
-static inline void cgroup_lock(void) {}
-static inline void cgroup_unlock(void) {}
 static inline int cgroupstats_build(struct cgroupstats *stats,
 					struct dentry *dentry)
 {
-- 
cgit v1.3-7-g2ca7


From 0061d53daf26ff713ab43ab84ae5c44b5edbefa9 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 9 May 2013 20:21:41 -0300
Subject: KVM: x86: limit difference between kvmclock updates

kvmclock updates which are isolated to a given vcpu, such as vcpu->cpu
migration, should not allow system_timestamp from the rest of the vcpus
to remain static. Otherwise ntp frequency correction applies to one
vcpu's system_timestamp but not the others.

So in those cases, request a kvmclock update for all vcpus. The worst
case for a remote vcpu to update its kvmclock is then bounded by maximum
nohz sleep latency.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 arch/x86/kvm/x86.c       | 30 ++++++++++++++++++++++++++++--
 include/linux/kvm_host.h |  1 +
 2 files changed, 29 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 094b5d96ab14..8d28810a5f88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1588,6 +1588,30 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	return 0;
 }
 
+/*
+ * kvmclock updates which are isolated to a given vcpu, such as
+ * vcpu->cpu migration, should not allow system_timestamp from
+ * the rest of the vcpus to remain static. Otherwise ntp frequency
+ * correction applies to one vcpu's system_timestamp but not
+ * the others.
+ *
+ * So in those cases, request a kvmclock update for all vcpus.
+ * The worst case for a remote vcpu to update its kvmclock
+ * is then bounded by maximum nohz sleep latency.
+ */
+
+static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
+{
+	int i;
+	struct kvm *kvm = v->kvm;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+		kvm_vcpu_kick(vcpu);
+	}
+}
+
 static bool msr_mtrr_valid(unsigned msr)
 {
 	switch (msr) {
@@ -1985,7 +2009,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		kvmclock_reset(vcpu);
 
 		vcpu->arch.time = data;
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 
 		/* we verify if the enable bit is set... */
 		if (!(data & 1))
@@ -2702,7 +2726,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * kvmclock on vcpu->cpu migration
 		 */
 		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
-			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
 		if (vcpu->cpu != cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
@@ -5703,6 +5727,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			__kvm_migrate_timers(vcpu);
 		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
 			kvm_gen_update_masterclock(vcpu->kvm);
+		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
+			kvm_gen_kvmclock_update(vcpu);
 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
 			r = kvm_guest_time_update(vcpu);
 			if (unlikely(r))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f0eea07d2c2b..d9a3c30eab2e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -124,6 +124,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_MCLOCK_INPROGRESS 19
 #define KVM_REQ_EPR_EXIT          20
 #define KVM_REQ_SCAN_IOAPIC       21
+#define KVM_REQ_GLOBAL_CLOCK_UPDATE 22
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
-- 
cgit v1.3-7-g2ca7


From b3087e48ce20be784fae1dbabc2e42e2ad0f21bc Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 20 May 2013 12:15:44 +0930
Subject: virtio: remove virtqueue_add_buf().

All users changed to virtqueue_add_sg() or virtqueue_add_outbuf/inbuf.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/virtio/virtio_ring.c | 37 +++----------------------------------
 include/linux/virtio.h       |  7 -------
 2 files changed, 3 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5217baf5528c..c70207478e57 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -295,37 +295,6 @@ add_head:
 	return 0;
 }
 
-/**
- * virtqueue_add_buf - expose buffer to other end
- * @vq: the struct virtqueue we're talking about.
- * @sg: the description of the buffer(s).
- * @out_num: the number of sg readable by other side
- * @in_num: the number of sg which are writable (after readable ones)
- * @data: the token identifying the buffer.
- * @gfp: how to do memory allocations (if necessary).
- *
- * Caller must ensure we don't call this with other virtqueue operations
- * at the same time (except where noted).
- *
- * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
- */
-int virtqueue_add_buf(struct virtqueue *_vq,
-		      struct scatterlist sg[],
-		      unsigned int out,
-		      unsigned int in,
-		      void *data,
-		      gfp_t gfp)
-{
-	struct scatterlist *sgs[2];
-
-	sgs[0] = sg;
-	sgs[1] = sg + out;
-
-	return virtqueue_add(_vq, sgs, sg_next_arr,
-			     out, in, out ? 1 : 0, in ? 1 : 0, data, gfp);
-}
-EXPORT_SYMBOL_GPL(virtqueue_add_buf);
-
 /**
  * virtqueue_add_sgs - expose buffers to other end
  * @vq: the struct virtqueue we're talking about.
@@ -473,7 +442,7 @@ EXPORT_SYMBOL_GPL(virtqueue_notify);
  * virtqueue_kick - update after add_buf
  * @vq: the struct virtqueue
  *
- * After one or more virtqueue_add_buf calls, invoke this to kick
+ * After one or more virtqueue_add_* calls, invoke this to kick
  * the other side.
  *
  * Caller must ensure we don't call this with other virtqueue
@@ -530,7 +499,7 @@ static inline bool more_used(const struct vring_virtqueue *vq)
  * operations at the same time (except where noted).
  *
  * Returns NULL if there are no used buffers, or the "data" token
- * handed to virtqueue_add_buf().
+ * handed to virtqueue_add_*().
  */
 void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 {
@@ -685,7 +654,7 @@ EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
  * virtqueue_detach_unused_buf - detach first unused buffer
  * @vq: the struct virtqueue we're talking about.
  *
- * Returns NULL or the "data" token handed to virtqueue_add_buf().
+ * Returns NULL or the "data" token handed to virtqueue_add_*().
  * This is not valid on an active queue; it is useful only for device
  * shutdown.
  */
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 9ff8645b7e0b..e94c75ded111 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -34,13 +34,6 @@ struct virtqueue {
 	void *priv;
 };
 
-int virtqueue_add_buf(struct virtqueue *vq,
-		      struct scatterlist sg[],
-		      unsigned int out_num,
-		      unsigned int in_num,
-		      void *data,
-		      gfp_t gfp);
-
 int virtqueue_add_outbuf(struct virtqueue *vq,
 			 struct scatterlist sg[], unsigned int num,
 			 void *data,
-- 
cgit v1.3-7-g2ca7


From 5927467d0ca274bc3b8eed9fd5db964bbde56e1c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Tue, 23 Apr 2013 19:44:16 +0100
Subject: mfd: arizona: Support use of external DCVDD

When the device is used with an external DCVDD supply instead of the
internal LDO1 then an extra step is required when suspending and resuming
the device.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/mfd/arizona-core.c       | 43 ++++++++++++++++++++++++++++++++++++++++
 include/linux/mfd/arizona/core.h |  2 ++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c
index d8d30c0a488d..437f199fe5f2 100644
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -22,6 +22,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/regmap.h>
 #include <linux/regulator/consumer.h>
+#include <linux/regulator/machine.h>
 #include <linux/slab.h>
 
 #include <linux/mfd/arizona/core.h>
@@ -347,6 +348,17 @@ static int arizona_runtime_resume(struct device *dev)
 
 	switch (arizona->type) {
 	case WM5102:
+		if (arizona->external_dcvdd) {
+			ret = regmap_update_bits(arizona->regmap,
+						 ARIZONA_ISOLATION_CONTROL,
+						 ARIZONA_ISOLATE_DCVDD1, 0);
+			if (ret != 0) {
+				dev_err(arizona->dev,
+					"Failed to connect DCVDD: %d\n", ret);
+				goto err;
+			}
+		}
+
 		ret = wm5102_patch(arizona);
 		if (ret != 0) {
 			dev_err(arizona->dev, "Failed to apply patch: %d\n",
@@ -368,6 +380,16 @@ static int arizona_runtime_resume(struct device *dev)
 			goto err;
 		}
 
+		if (arizona->external_dcvdd) {
+			ret = regmap_update_bits(arizona->regmap,
+						 ARIZONA_ISOLATION_CONTROL,
+						 ARIZONA_ISOLATE_DCVDD1, 0);
+			if (ret != 0) {
+				dev_err(arizona->dev,
+					"Failed to connect DCVDD: %d\n", ret);
+				goto err;
+			}
+		}
 		break;
 	}
 
@@ -400,9 +422,22 @@ err:
 static int arizona_runtime_suspend(struct device *dev)
 {
 	struct arizona *arizona = dev_get_drvdata(dev);
+	int ret;
 
 	dev_dbg(arizona->dev, "Entering AoD mode\n");
 
+	if (arizona->external_dcvdd) {
+		ret = regmap_update_bits(arizona->regmap,
+					 ARIZONA_ISOLATION_CONTROL,
+					 ARIZONA_ISOLATE_DCVDD1,
+					 ARIZONA_ISOLATE_DCVDD1);
+		if (ret != 0) {
+			dev_err(arizona->dev, "Failed to isolate DCVDD: %d\n",
+				ret);
+			return ret;
+		}
+	}
+
 	regulator_disable(arizona->dcvdd);
 	regcache_cache_only(arizona->regmap, true);
 	regcache_mark_dirty(arizona->regmap);
@@ -771,6 +806,14 @@ int arizona_dev_init(struct arizona *arizona)
 			     arizona->pdata.gpio_defaults[i]);
 	}
 
+	/*
+	 * LDO1 can only be used to supply DCVDD so if it has no
+	 * consumers then DCVDD is supplied externally.
+	 */
+	if (arizona->pdata.ldo1 &&
+	    arizona->pdata.ldo1->num_consumer_supplies == 0)
+		arizona->external_dcvdd = true;
+
 	pm_runtime_set_autosuspend_delay(arizona->dev, 100);
 	pm_runtime_use_autosuspend(arizona->dev);
 	pm_runtime_enable(arizona->dev);
diff --git a/include/linux/mfd/arizona/core.h b/include/linux/mfd/arizona/core.h
index cc281368dc55..f797bb9b8b56 100644
--- a/include/linux/mfd/arizona/core.h
+++ b/include/linux/mfd/arizona/core.h
@@ -95,6 +95,8 @@ struct arizona {
 
 	struct arizona_pdata pdata;
 
+	unsigned int external_dcvdd:1;
+
 	int irq;
 	struct irq_domain *virq;
 	struct regmap_irq_chip_data *aod_irq_chip;
-- 
cgit v1.3-7-g2ca7


From 435705e89265dec3c641fe75deb748f05e232e59 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Mon, 20 May 2013 11:16:10 -0500
Subject: ASoC: wm8994: Handle LRCLK inversion for WM8958 and WM1811A

On WM8958 and WM1811A separate control of the LRCLK inversion bit is
available for the DAC and ADC LRCLKs which for compatibility reasons is
done in a new register bit.

Since writes to each scheme have no effect on parts using the other just
always write to both for simplicity.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Acked-by: Vinod Koul <vinod.koul@intel.com>
Tested-by: Samreen Nilofer <samreen.nilofer@intel.com>
---
 include/linux/mfd/wm8994/registers.h |  8 ++++++++
 sound/soc/codecs/wm8994.c            | 14 ++++++++++++++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/wm8994/registers.h b/include/linux/mfd/wm8994/registers.h
index 053548961c15..db8cef3d5321 100644
--- a/include/linux/mfd/wm8994/registers.h
+++ b/include/linux/mfd/wm8994/registers.h
@@ -2668,6 +2668,10 @@
 /*
  * R772 (0x304) - AIF1ADC LRCLK
  */
+#define WM8958_AIF1_LRCLK_INV                   0x1000  /* AIF1_LRCLK_INV */
+#define WM8958_AIF1_LRCLK_INV_MASK              0x1000  /* AIF1_LRCLK_INV */
+#define WM8958_AIF1_LRCLK_INV_SHIFT                 12  /* AIF1_LRCLK_INV */
+#define WM8958_AIF1_LRCLK_INV_WIDTH                  1  /* AIF1_LRCLK_INV */
 #define WM8994_AIF1ADC_LRCLK_DIR                0x0800  /* AIF1ADC_LRCLK_DIR */
 #define WM8994_AIF1ADC_LRCLK_DIR_MASK           0x0800  /* AIF1ADC_LRCLK_DIR */
 #define WM8994_AIF1ADC_LRCLK_DIR_SHIFT              11  /* AIF1ADC_LRCLK_DIR */
@@ -2679,6 +2683,10 @@
 /*
  * R773 (0x305) - AIF1DAC LRCLK
  */
+#define WM8958_AIF1_LRCLK_INV                   0x1000  /* AIF1_LRCLK_INV */
+#define WM8958_AIF1_LRCLK_INV_MASK              0x1000  /* AIF1_LRCLK_INV */
+#define WM8958_AIF1_LRCLK_INV_SHIFT                 12  /* AIF1_LRCLK_INV */
+#define WM8958_AIF1_LRCLK_INV_WIDTH                  1  /* AIF1_LRCLK_INV */
 #define WM8994_AIF1DAC_LRCLK_DIR                0x0800  /* AIF1DAC_LRCLK_DIR */
 #define WM8994_AIF1DAC_LRCLK_DIR_MASK           0x0800  /* AIF1DAC_LRCLK_DIR */
 #define WM8994_AIF1DAC_LRCLK_DIR_SHIFT              11  /* AIF1DAC_LRCLK_DIR */
diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c
index 303d755b9342..f1c54af45dcf 100644
--- a/sound/soc/codecs/wm8994.c
+++ b/sound/soc/codecs/wm8994.c
@@ -2574,17 +2574,24 @@ static int wm8994_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt)
 	struct wm8994 *control = wm8994->wm8994;
 	int ms_reg;
 	int aif1_reg;
+	int dac_reg;
+	int adc_reg;
 	int ms = 0;
 	int aif1 = 0;
+	int lrclk = 0;
 
 	switch (dai->id) {
 	case 1:
 		ms_reg = WM8994_AIF1_MASTER_SLAVE;
 		aif1_reg = WM8994_AIF1_CONTROL_1;
+		dac_reg = WM8994_AIF1DAC_LRCLK;
+		adc_reg = WM8994_AIF1ADC_LRCLK;
 		break;
 	case 2:
 		ms_reg = WM8994_AIF2_MASTER_SLAVE;
 		aif1_reg = WM8994_AIF2_CONTROL_1;
+		dac_reg = WM8994_AIF1DAC_LRCLK;
+		adc_reg = WM8994_AIF1ADC_LRCLK;
 		break;
 	default:
 		return -EINVAL;
@@ -2603,6 +2610,7 @@ static int wm8994_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt)
 	switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
 	case SND_SOC_DAIFMT_DSP_B:
 		aif1 |= WM8994_AIF1_LRCLK_INV;
+		lrclk |= WM8958_AIF1_LRCLK_INV;
 	case SND_SOC_DAIFMT_DSP_A:
 		aif1 |= 0x18;
 		break;
@@ -2641,12 +2649,14 @@ static int wm8994_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt)
 			break;
 		case SND_SOC_DAIFMT_IB_IF:
 			aif1 |= WM8994_AIF1_BCLK_INV | WM8994_AIF1_LRCLK_INV;
+			lrclk |= WM8958_AIF1_LRCLK_INV;
 			break;
 		case SND_SOC_DAIFMT_IB_NF:
 			aif1 |= WM8994_AIF1_BCLK_INV;
 			break;
 		case SND_SOC_DAIFMT_NB_IF:
 			aif1 |= WM8994_AIF1_LRCLK_INV;
+			lrclk |= WM8958_AIF1_LRCLK_INV;
 			break;
 		default:
 			return -EINVAL;
@@ -2677,6 +2687,10 @@ static int wm8994_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt)
 			    aif1);
 	snd_soc_update_bits(codec, ms_reg, WM8994_AIF1_MSTR,
 			    ms);
+	snd_soc_update_bits(codec, dac_reg,
+			    WM8958_AIF1_LRCLK_INV, lrclk);
+	snd_soc_update_bits(codec, adc_reg,
+			    WM8958_AIF1_LRCLK_INV, lrclk);
 
 	return 0;
 }
-- 
cgit v1.3-7-g2ca7


From 1c16c9636cad24f0e654f19e8f73ede0c7bd5540 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 21 May 2013 10:40:47 -0400
Subject: tpm: move TPM_DIGEST_SIZE defintion

IMA requires access to TPM_DIGEST_SIZE definition.  This patch
moves the definition to <linux/tpm.h>.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: Kent Yoder <key@linux.vnet.ibm.com>
---
 drivers/char/tpm/tpm.h | 1 -
 include/linux/tpm.h    | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 0770d1d79366..433423288709 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -272,7 +272,6 @@ typedef union {
 	struct	tpm_output_header out;
 } tpm_cmd_header;
 
-#define TPM_DIGEST_SIZE 20
 struct tpm_pcrread_out {
 	u8	pcr_result[TPM_DIGEST_SIZE];
 } __packed;
diff --git a/include/linux/tpm.h b/include/linux/tpm.h
index fcb627ff8d3e..9a9051bb1a03 100644
--- a/include/linux/tpm.h
+++ b/include/linux/tpm.h
@@ -22,6 +22,8 @@
 #ifndef __LINUX_TPM_H__
 #define __LINUX_TPM_H__
 
+#define TPM_DIGEST_SIZE 20	/* Max TPM v1.2 PCR size */
+
 /*
  * Chip num is this value or a valid tpm idx
  */
-- 
cgit v1.3-7-g2ca7


From f773fc6dca4619bdf8da767eaba101a83b766059 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Date: Tue, 21 May 2013 14:56:58 +0100
Subject: mfd: arizona: Change fast_start pdata name to better reflect
 functionality

The bit in the register enables MICBIAS fast startup when clear not when
set. This patch changes the name of this pdata option to soft_start to
better match the functionality. We rename rather than invert the
handling to keep the same default functionality, which is fast start
active.

Signed-off-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/mfd/arizona-core.c        | 2 +-
 include/linux/mfd/arizona/pdata.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/arizona-core.c b/drivers/mfd/arizona-core.c
index 437f199fe5f2..74b4481754fd 100644
--- a/drivers/mfd/arizona-core.c
+++ b/drivers/mfd/arizona-core.c
@@ -860,7 +860,7 @@ int arizona_dev_init(struct arizona *arizona)
 		if (arizona->pdata.micbias[i].discharge)
 			val |= ARIZONA_MICB1_DISCH;
 
-		if (arizona->pdata.micbias[i].fast_start)
+		if (arizona->pdata.micbias[i].soft_start)
 			val |= ARIZONA_MICB1_RATE;
 
 		if (arizona->pdata.micbias[i].bypass)
diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h
index 80dead1f7100..12a5c135c746 100644
--- a/include/linux/mfd/arizona/pdata.h
+++ b/include/linux/mfd/arizona/pdata.h
@@ -77,7 +77,7 @@ struct arizona_micbias {
 	int mV;                    /** Regulated voltage */
 	unsigned int ext_cap:1;    /** External capacitor fitted */
 	unsigned int discharge:1;  /** Actively discharge */
-	unsigned int fast_start:1; /** Enable aggressive startup ramp rate */
+	unsigned int soft_start:1; /** Disable aggressive startup ramp rate */
 	unsigned int bypass:1;     /** Use bypass mode */
 };
 
-- 
cgit v1.3-7-g2ca7


From 966fbe193f47c68e70a80ec9991098e88e7959cb Mon Sep 17 00:00:00 2001
From: Vincent Pelletier <plr.vincent@gmail.com>
Date: Tue, 21 May 2013 22:30:58 +0200
Subject: libata: Add atapi_dmadir force flag

Some device require DMADIR to be enabled, but are not detected as such
by atapi_id_dmadir.  One such example is "Asus Serillel 2"
SATA-host-to-PATA-device bridge: the bridge itself requires DMADIR,
even if the bridged device does not.

As atapi_dmadir module parameter can cause problems with some devices
(as per Tejun Heo's memory), enabling it globally may not be possible
depending on the hardware.

This patch adds atapi_dmadir in the form of a "force" horkage value,
allowing global, per-bus and per-device control.

Signed-off-by: Vincent Pelletier <plr.vincent@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/kernel-parameters.txt | 2 ++
 drivers/ata/libata-core.c           | 3 ++-
 include/linux/libata.h              | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c3bfacb92910..489815e3e484 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1456,6 +1456,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
 			* dump_id: dump IDENTIFY data.
 
+			* atapi_dmadir: Enable ATAPI DMADIR bridge support
+
 			If there are multiple matching configurations changing
 			the same attribute, the last one is used.
 
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 5f7d5f9ee820..c97a244c099a 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -2395,7 +2395,7 @@ int ata_dev_configure(struct ata_device *dev)
 			cdb_intr_string = ", CDB intr";
 		}
 
-		if (atapi_dmadir || atapi_id_dmadir(dev->id)) {
+		if (atapi_dmadir || (dev->horkage & ATA_HORKAGE_ATAPI_DMADIR) || atapi_id_dmadir(dev->id)) {
 			dev->flags |= ATA_DFLAG_DMADIR;
 			dma_dir_string = ", DMADIR";
 		}
@@ -6496,6 +6496,7 @@ static int __init ata_parse_force_one(char **cur,
 		{ "nosrst",	.lflags		= ATA_LFLAG_NO_SRST },
 		{ "norst",	.lflags		= ATA_LFLAG_NO_HRST | ATA_LFLAG_NO_SRST },
 		{ "rstonce",	.lflags		= ATA_LFLAG_RST_ONCE },
+		{ "atapi_dmadir", .horkage_on	= ATA_HORKAGE_ATAPI_DMADIR },
 	};
 	char *start = *cur, *p = *cur;
 	char *id, *val, *endp;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 47e029236f6e..c886dc87aa81 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -399,6 +399,7 @@ enum {
 	ATA_HORKAGE_BROKEN_FPDMA_AA	= (1 << 15),	/* skip AA */
 	ATA_HORKAGE_DUMP_ID	= (1 << 16),	/* dump IDENTIFY data */
 	ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17),	/* Set max sects to 65535 */
+	ATA_HORKAGE_ATAPI_DMADIR = (1 << 18),	/* device requires dmadir */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
 	    renumber */
-- 
cgit v1.3-7-g2ca7


From 2922a8de996956893bb98e4aa91be9774c958336 Mon Sep 17 00:00:00 2001
From: Stephen Warren <swarren@wwwdotorg.org>
Date: Tue, 21 May 2013 20:36:34 -0600
Subject: spi: introduce macros to set bits_per_word_mask

Introduce two macros to make setting up spi_master.bits_per_word_mask
easier, and avoid mistakes like writing BIT(n) instead of BIT(n - 1).

SPI_BPW_MASK is for a single supported value of bits_per_word_mask.

SPI_BPW_RANGE_MASK represents a contiguous set of bit lengths.

Signed-off-by: Stephen Warren <swarren@wwwdotorg.org>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 include/linux/spi/spi.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 6ff26c8db7b9..173725622252 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -308,6 +308,8 @@ struct spi_master {
 
 	/* bitmask of supported bits_per_word for transfers */
 	u32			bits_per_word_mask;
+#define SPI_BPW_MASK(bits) BIT((bits) - 1)
+#define SPI_BPW_RANGE_MASK(min, max) ((BIT(max) - 1) - (BIT(min) - 1))
 
 	/* other constraints relevant to this driver */
 	u16			flags;
-- 
cgit v1.3-7-g2ca7


From e5657054ff508c9e547a8ceb931de07838ee3ba0 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Wed, 22 May 2013 18:55:25 -0500
Subject: mfd: wm5110: Make DSPn_STATUS_3 readable

These registers have been documented since the driver was originally
submitted so expose them.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 drivers/mfd/wm5110-tables.c           | 8 ++++++++
 include/linux/mfd/arizona/registers.h | 3 +++
 2 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/wm5110-tables.c b/drivers/mfd/wm5110-tables.c
index c41599815299..2a7972349159 100644
--- a/drivers/mfd/wm5110-tables.c
+++ b/drivers/mfd/wm5110-tables.c
@@ -2273,18 +2273,22 @@ static bool wm5110_readable_register(struct device *dev, unsigned int reg)
 	case ARIZONA_DSP1_CLOCKING_1:
 	case ARIZONA_DSP1_STATUS_1:
 	case ARIZONA_DSP1_STATUS_2:
+	case ARIZONA_DSP1_STATUS_3:
 	case ARIZONA_DSP2_CONTROL_1:
 	case ARIZONA_DSP2_CLOCKING_1:
 	case ARIZONA_DSP2_STATUS_1:
 	case ARIZONA_DSP2_STATUS_2:
+	case ARIZONA_DSP2_STATUS_3:
 	case ARIZONA_DSP3_CONTROL_1:
 	case ARIZONA_DSP3_CLOCKING_1:
 	case ARIZONA_DSP3_STATUS_1:
 	case ARIZONA_DSP3_STATUS_2:
+	case ARIZONA_DSP3_STATUS_3:
 	case ARIZONA_DSP4_CONTROL_1:
 	case ARIZONA_DSP4_CLOCKING_1:
 	case ARIZONA_DSP4_STATUS_1:
 	case ARIZONA_DSP4_STATUS_2:
+	case ARIZONA_DSP4_STATUS_3:
 		return true;
 	default:
 		return false;
@@ -2334,12 +2338,16 @@ static bool wm5110_volatile_register(struct device *dev, unsigned int reg)
 	case ARIZONA_DSP1_CLOCKING_1:
 	case ARIZONA_DSP1_STATUS_1:
 	case ARIZONA_DSP1_STATUS_2:
+	case ARIZONA_DSP1_STATUS_3:
 	case ARIZONA_DSP2_STATUS_1:
 	case ARIZONA_DSP2_STATUS_2:
+	case ARIZONA_DSP2_STATUS_3:
 	case ARIZONA_DSP3_STATUS_1:
 	case ARIZONA_DSP3_STATUS_2:
+	case ARIZONA_DSP3_STATUS_3:
 	case ARIZONA_DSP4_STATUS_1:
 	case ARIZONA_DSP4_STATUS_2:
+	case ARIZONA_DSP4_STATUS_3:
 		return true;
 	default:
 		return false;
diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h
index 715b6ba3d52a..4730b5c576e2 100644
--- a/include/linux/mfd/arizona/registers.h
+++ b/include/linux/mfd/arizona/registers.h
@@ -1002,6 +1002,7 @@
 #define ARIZONA_DSP2_CLOCKING_1                  0x1201
 #define ARIZONA_DSP2_STATUS_1                    0x1204
 #define ARIZONA_DSP2_STATUS_2                    0x1205
+#define ARIZONA_DSP2_STATUS_3                    0x1206
 #define ARIZONA_DSP2_SCRATCH_0                   0x1240
 #define ARIZONA_DSP2_SCRATCH_1                   0x1241
 #define ARIZONA_DSP2_SCRATCH_2                   0x1242
@@ -1010,6 +1011,7 @@
 #define ARIZONA_DSP3_CLOCKING_1                  0x1301
 #define ARIZONA_DSP3_STATUS_1                    0x1304
 #define ARIZONA_DSP3_STATUS_2                    0x1305
+#define ARIZONA_DSP3_STATUS_3                    0x1306
 #define ARIZONA_DSP3_SCRATCH_0                   0x1340
 #define ARIZONA_DSP3_SCRATCH_1                   0x1341
 #define ARIZONA_DSP3_SCRATCH_2                   0x1342
@@ -1018,6 +1020,7 @@
 #define ARIZONA_DSP4_CLOCKING_1                  0x1401
 #define ARIZONA_DSP4_STATUS_1                    0x1404
 #define ARIZONA_DSP4_STATUS_2                    0x1405
+#define ARIZONA_DSP4_STATUS_3                    0x1406
 #define ARIZONA_DSP4_SCRATCH_0                   0x1440
 #define ARIZONA_DSP4_SCRATCH_1                   0x1441
 #define ARIZONA_DSP4_SCRATCH_2                   0x1442
-- 
cgit v1.3-7-g2ca7


From bdc7119f1bdd0632d42f435941dc290216a436e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:55:38 +0900
Subject: cgroup: make cgroup_is_removed() static

cgroup_is_removed() no longer has external users and it shouldn't grow
any - controllers should deal with cgroup_subsys_state on/offline
state instead of cgroup removal state.  Make it static.

While at it, make it return bool.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 1 -
 kernel/cgroup.c        | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 1df5f699be61..8d9f3c911fca 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -538,7 +538,6 @@ static inline const char *cgroup_name(const struct cgroup *cgrp)
 int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 
-int cgroup_is_removed(const struct cgroup *cgrp);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a19419f4af1a..501974823b33 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -226,7 +226,7 @@ static int css_refcnt(struct cgroup_subsys_state *css)
 }
 
 /* convenient tests for these bits */
-inline int cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_removed(const struct cgroup *cgrp)
 {
 	return test_bit(CGRP_REMOVED, &cgrp->flags);
 }
-- 
cgit v1.3-7-g2ca7


From 53fa5261747a90746531e8a1c81eeb78fedc2f71 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:55:38 +0900
Subject: cgroup: add cgroup->serial_nr and implement cgroup_next_sibling()

Currently, there's no easy way to find out the next sibling cgroup
unless it's known that the current cgroup is accessed from the
parent's children list in a single RCU critical section.  This in turn
forces all iterators to require whole iteration to be enclosed in a
single RCU critical section, which sometimes is too restrictive.  This
patch implements cgroup_next_sibling() which can reliably determine
the next sibling regardless of the state of the current cgroup as long
as it's accessible.

It currently is impossible to determine the next sibling after
dropping RCU read lock because the cgroup being iterated could be
removed anytime and if RCU read lock is dropped, nothing guarantess
its ->sibling.next pointer is accessible.  A removed cgroup would
continue to point to its next sibling for RCU accesses but stop
receiving updates from the sibling.  IOW, the next sibling could be
removed and then complete its grace period while RCU read lock is
dropped, making it unsafe to dereference ->sibling.next after dropping
and re-acquiring RCU read lock.

This can be solved by adding a way to traverse to the next sibling
without dereferencing ->sibling.next.  This patch adds a monotonically
increasing cgroup serial number, cgroup->serial_nr, which guarantees
that all cgroup->children lists are kept in increasing serial_nr
order.  A new function, cgroup_next_sibling(), is implemented, which,
if CGRP_REMOVED is not set on the current cgroup, follows
->sibling.next; otherwise, traverses the parent's ->children list
until it sees a sibling with higher ->serial_nr.

This allows the function to always return the next sibling regardless
of the state of the current cgroup without adding overhead in the fast
path.

Further patches will update the iterators to use cgroup_next_sibling()
so that they allow dropping RCU read lock and blocking while iteration
is in progress which in turn will be used to simplify controllers.

v2: Typo fix as per Serge.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
---
 include/linux/cgroup.h | 10 ++++++++
 kernel/cgroup.c        | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8d9f3c911fca..ee041a01a67e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -188,6 +188,14 @@ struct cgroup {
 	struct cgroup *parent;		/* my parent */
 	struct dentry *dentry;		/* cgroup fs entry, RCU protected */
 
+	/*
+	 * Monotonically increasing unique serial number which defines a
+	 * uniform order among all cgroups.  It's guaranteed that all
+	 * ->children lists are in the ascending order of ->serial_nr.
+	 * It's used to allow interrupting and resuming iterations.
+	 */
+	u64 serial_nr;
+
 	/*
 	 * This is a copy of dentry->d_name, and it's needed because
 	 * we can't use dentry->d_name in cgroup_path().
@@ -675,6 +683,8 @@ static inline struct cgroup* task_cgroup(struct task_struct *task,
 	return task_subsys_state(task, subsys_id)->cgroup;
 }
 
+struct cgroup *cgroup_next_sibling(struct cgroup *pos);
+
 /**
  * cgroup_for_each_child - iterate through children of a cgroup
  * @pos: the cgroup * to use as the loop cursor
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 501974823b33..b87c7a5a5497 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2975,6 +2975,55 @@ static void cgroup_enable_task_cg_lists(void)
 	write_unlock(&css_set_lock);
 }
 
+/**
+ * cgroup_next_sibling - find the next sibling of a given cgroup
+ * @pos: the current cgroup
+ *
+ * This function returns the next sibling of @pos and should be called
+ * under RCU read lock.  The only requirement is that @pos is accessible.
+ * The next sibling is guaranteed to be returned regardless of @pos's
+ * state.
+ */
+struct cgroup *cgroup_next_sibling(struct cgroup *pos)
+{
+	struct cgroup *next;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/*
+	 * @pos could already have been removed.  Once a cgroup is removed,
+	 * its ->sibling.next is no longer updated when its next sibling
+	 * changes.  As CGRP_REMOVED is set on removal which is fully
+	 * serialized, if we see it unasserted, it's guaranteed that the
+	 * next sibling hasn't finished its grace period even if it's
+	 * already removed, and thus safe to dereference from this RCU
+	 * critical section.  If ->sibling.next is inaccessible,
+	 * cgroup_is_removed() is guaranteed to be visible as %true here.
+	 */
+	if (likely(!cgroup_is_removed(pos))) {
+		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
+		if (&next->sibling != &pos->parent->children)
+			return next;
+		return NULL;
+	}
+
+	/*
+	 * Can't dereference the next pointer.  Each cgroup is given a
+	 * monotonically increasing unique serial number and always
+	 * appended to the sibling list, so the next one can be found by
+	 * walking the parent's children until we see a cgroup with higher
+	 * serial number than @pos's.
+	 *
+	 * While this path can be slow, it's taken only when either the
+	 * current cgroup is removed or iteration and removal race.
+	 */
+	list_for_each_entry_rcu(next, &pos->parent->children, sibling)
+		if (next->serial_nr > pos->serial_nr)
+			return next;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cgroup_next_sibling);
+
 /**
  * cgroup_next_descendant_pre - find the next descendant for pre-order walk
  * @pos: the current position (%NULL to initiate traversal)
@@ -4137,6 +4186,7 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			     umode_t mode)
 {
+	static atomic64_t serial_nr_cursor = ATOMIC64_INIT(0);
 	struct cgroup *cgrp;
 	struct cgroup_name *name;
 	struct cgroupfs_root *root = parent->root;
@@ -4217,6 +4267,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		goto err_free_all;
 	lockdep_assert_held(&dentry->d_inode->i_mutex);
 
+	/*
+	 * Assign a monotonically increasing serial number.  With the list
+	 * appending below, it guarantees that sibling cgroups are always
+	 * sorted in the ascending serial number order on the parent's
+	 * ->children.
+	 */
+	cgrp->serial_nr = atomic64_inc_return(&serial_nr_cursor);
+
 	/* allocation complete, commit to creation */
 	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
@@ -4304,6 +4362,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * removed.  This makes future css_tryget() and child creation
 	 * attempts fail thus maintaining the removal conditions verified
 	 * above.
+	 *
+	 * Note that CGRP_REMVOED clearing is depended upon by
+	 * cgroup_next_sibling() to resume iteration after dropping RCU
+	 * read lock.  See cgroup_next_sibling() for details.
 	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-- 
cgit v1.3-7-g2ca7


From 75501a6d59e989e5c286716e5b3b66ace4660e83 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 24 May 2013 10:55:38 +0900
Subject: cgroup: update iterators to use cgroup_next_sibling()

This patch converts cgroup_for_each_child(),
cgroup_next_descendant_pre/post() and thus
cgroup_for_each_descendant_pre/post() to use cgroup_next_sibling()
instead of manually dereferencing ->sibling.next.

The only reason the iterators couldn't allow dropping RCU read lock
while iteration is in progress was because they couldn't determine the
next sibling safely once RCU read lock is dropped.  Using
cgroup_next_sibling() removes that problem and enables all iterators
to allow dropping RCU read lock in the middle.  Comments are updated
accordingly.

This makes the iterators easier to use and will simplify controllers.

Note that @cgroup argument is renamed to @cgrp in
cgroup_for_each_child() because it conflicts with "struct cgroup" used
in the new macro body.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
---
 include/linux/cgroup.h | 18 ++++++++++++++----
 kernel/cgroup.c        | 25 +++++++++++++++++++------
 2 files changed, 33 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ee041a01a67e..d0ad3794b947 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -688,9 +688,9 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos);
 /**
  * cgroup_for_each_child - iterate through children of a cgroup
  * @pos: the cgroup * to use as the loop cursor
- * @cgroup: cgroup whose children to walk
+ * @cgrp: cgroup whose children to walk
  *
- * Walk @cgroup's children.  Must be called under rcu_read_lock().  A child
+ * Walk @cgrp's children.  Must be called under rcu_read_lock().  A child
  * cgroup which hasn't finished ->css_online() or already has finished
  * ->css_offline() may show up during traversal and it's each subsystem's
  * responsibility to verify that each @pos is alive.
@@ -698,9 +698,15 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos);
  * If a subsystem synchronizes against the parent in its ->css_online() and
  * before starting iterating, a cgroup which finished ->css_online() is
  * guaranteed to be visible in the future iterations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
-#define cgroup_for_each_child(pos, cgroup)				\
-	list_for_each_entry_rcu(pos, &(cgroup)->children, sibling)
+#define cgroup_for_each_child(pos, cgrp)				\
+	for ((pos) = list_first_or_null_rcu(&(cgrp)->children,		\
+					    struct cgroup, sibling);	\
+	     (pos); (pos) = cgroup_next_sibling((pos)))
 
 struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 					  struct cgroup *cgroup);
@@ -759,6 +765,10 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos);
  * Alternatively, a subsystem may choose to use a single global lock to
  * synchronize ->css_online() and ->css_offline() against tree-walking
  * operations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
  */
 #define cgroup_for_each_descendant_pre(pos, cgroup)			\
 	for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos);	\
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b87c7a5a5497..fefc41c1a147 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3031,6 +3031,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_sibling);
  *
  * To be used by cgroup_for_each_descendant_pre().  Find the next
  * descendant to visit for pre-order traversal of @cgroup's descendants.
+ *
+ * While this function requires RCU read locking, it doesn't require the
+ * whole traversal to be contained in a single RCU critical section.  This
+ * function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
  */
 struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 					  struct cgroup *cgroup)
@@ -3050,11 +3055,9 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
 
 	/* no child, visit my or the closest ancestor's next sibling */
 	while (pos != cgroup) {
-		next = list_entry_rcu(pos->sibling.next, struct cgroup,
-				      sibling);
-		if (&next->sibling != &pos->parent->children)
+		next = cgroup_next_sibling(pos);
+		if (next)
 			return next;
-
 		pos = pos->parent;
 	}
 
@@ -3069,6 +3072,11 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
  * Return the rightmost descendant of @pos.  If there's no descendant,
  * @pos is returned.  This can be used during pre-order traversal to skip
  * subtree of @pos.
+ *
+ * While this function requires RCU read locking, it doesn't require the
+ * whole traversal to be contained in a single RCU critical section.  This
+ * function will return the correct rightmost descendant as long as @pos is
+ * accessible.
  */
 struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
 {
@@ -3108,6 +3116,11 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
  *
  * To be used by cgroup_for_each_descendant_post().  Find the next
  * descendant to visit for post-order traversal of @cgroup's descendants.
+ *
+ * While this function requires RCU read locking, it doesn't require the
+ * whole traversal to be contained in a single RCU critical section.  This
+ * function will return the correct next descendant as long as both @pos
+ * and @cgroup are accessible and @pos is a descendant of @cgroup.
  */
 struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
 					   struct cgroup *cgroup)
@@ -3123,8 +3136,8 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
 	}
 
 	/* if there's an unvisited sibling, visit its leftmost descendant */
-	next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
-	if (&next->sibling != &pos->parent->children)
+	next = cgroup_next_sibling(pos);
+	if (next)
 		return cgroup_leftmost_descendant(next);
 
 	/* no sibling left, visit parent */
-- 
cgit v1.3-7-g2ca7


From b6b5e76bb8bb22ecff90a7840dc4845d63328289 Mon Sep 17 00:00:00 2001
From: Lars-Peter Clausen <lars@metafoo.de>
Date: Thu, 23 May 2013 20:14:50 +0200
Subject: ASoC: Add ssm2518 support

This patch adds a ASoC CODEC driver for the SSM2516. The SSM2516 is a stereo
Class-D audio amplifier with an I2S interface for audio in and a built-in
dynamic range control processor.

Signed-off-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 .../devicetree/bindings/sound/ssm2518.txt          |  20 +
 include/linux/platform_data/ssm2518.h              |  22 +
 sound/soc/codecs/Kconfig                           |   4 +
 sound/soc/codecs/Makefile                          |   2 +
 sound/soc/codecs/ssm2518.c                         | 856 +++++++++++++++++++++
 sound/soc/codecs/ssm2518.h                         |  20 +
 6 files changed, 924 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/sound/ssm2518.txt
 create mode 100644 include/linux/platform_data/ssm2518.h
 create mode 100644 sound/soc/codecs/ssm2518.c
 create mode 100644 sound/soc/codecs/ssm2518.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/sound/ssm2518.txt b/Documentation/devicetree/bindings/sound/ssm2518.txt
new file mode 100644
index 000000000000..59381a778c79
--- /dev/null
+++ b/Documentation/devicetree/bindings/sound/ssm2518.txt
@@ -0,0 +1,20 @@
+SSM2518 audio amplifier
+
+This device supports I2C only.
+
+Required properties:
+  - compatible : Must be "adi,ssm2518"
+  - reg : the I2C address of the device. This will either be 0x34 (ADDR pin low)
+	or 0x35 (ADDR pin high)
+
+Optional properties:
+  - gpios : GPIO connected to the nSD pin. If the property is not present it is
+	        assumed that the nSD pin is hardwired to always on.
+
+Example:
+
+	ssm2518: ssm2518@34 {
+		compatible = "adi,ssm2518";
+		reg = <0x34>;
+		gpios = <&gpio 5 0>;
+	};
diff --git a/include/linux/platform_data/ssm2518.h b/include/linux/platform_data/ssm2518.h
new file mode 100644
index 000000000000..9a8e3ea287e3
--- /dev/null
+++ b/include/linux/platform_data/ssm2518.h
@@ -0,0 +1,22 @@
+/*
+ * SSM2518 amplifier audio driver
+ *
+ * Copyright 2013 Analog Devices Inc.
+ *  Author: Lars-Peter Clausen <lars@metafoo.de>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#ifndef __LINUX_PLATFORM_DATA_SSM2518_H__
+#define __LINUX_PLATFORM_DATA_SSM2518_H__
+
+/**
+ * struct ssm2518_platform_data - Platform data for the ssm2518 driver
+ * @enable_gpio: GPIO connected to the nSD pin. Set to -1 if the nSD pin is
+ *            hardwired.
+ */
+struct ssm2518_platform_data {
+	int enable_gpio;
+};
+
+#endif
diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig
index 2f45f00e31b0..d76609adb85b 100644
--- a/sound/soc/codecs/Kconfig
+++ b/sound/soc/codecs/Kconfig
@@ -60,6 +60,7 @@ config SND_SOC_ALL_CODECS
 	select SND_SOC_SI476X if MFD_SI476X_CORE
 	select SND_SOC_SN95031 if INTEL_SCU_IPC
 	select SND_SOC_SPDIF
+	select SND_SOC_SSM2518 if I2C
 	select SND_SOC_SSM2602 if SND_SOC_I2C_AND_SPI
 	select SND_SOC_STA32X if I2C
 	select SND_SOC_STA529 if I2C
@@ -313,6 +314,9 @@ config SND_SOC_SN95031
 config SND_SOC_SPDIF
 	tristate
 
+config SND_SOC_SSM2518
+	tristate
+
 config SND_SOC_SSM2602
 	tristate
 
diff --git a/sound/soc/codecs/Makefile b/sound/soc/codecs/Makefile
index b9e41c9a1f4c..d85be48a6c07 100644
--- a/sound/soc/codecs/Makefile
+++ b/sound/soc/codecs/Makefile
@@ -52,6 +52,7 @@ snd-soc-si476x-objs := si476x.o
 snd-soc-sn95031-objs := sn95031.o
 snd-soc-spdif-tx-objs := spdif_transciever.o
 snd-soc-spdif-rx-objs := spdif_receiver.o
+snd-soc-ssm2518-objs := ssm2518.o
 snd-soc-ssm2602-objs := ssm2602.o
 snd-soc-sta32x-objs := sta32x.o
 snd-soc-sta529-objs := sta529.o
@@ -176,6 +177,7 @@ obj-$(CONFIG_SND_SOC_SIGMADSP)	+= snd-soc-sigmadsp.o
 obj-$(CONFIG_SND_SOC_SI476X)	+= snd-soc-si476x.o
 obj-$(CONFIG_SND_SOC_SN95031)	+=snd-soc-sn95031.o
 obj-$(CONFIG_SND_SOC_SPDIF)	+= snd-soc-spdif-rx.o snd-soc-spdif-tx.o
+obj-$(CONFIG_SND_SOC_SSM2518)	+= snd-soc-ssm2518.o
 obj-$(CONFIG_SND_SOC_SSM2602)	+= snd-soc-ssm2602.o
 obj-$(CONFIG_SND_SOC_STA32X)   += snd-soc-sta32x.o
 obj-$(CONFIG_SND_SOC_STA529)   += snd-soc-sta529.o
diff --git a/sound/soc/codecs/ssm2518.c b/sound/soc/codecs/ssm2518.c
new file mode 100644
index 000000000000..3139a1bde295
--- /dev/null
+++ b/sound/soc/codecs/ssm2518.c
@@ -0,0 +1,856 @@
+/*
+ * SSM2518 amplifier audio driver
+ *
+ * Copyright 2013 Analog Devices Inc.
+ *  Author: Lars-Peter Clausen <lars@metafoo.de>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/i2c.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/gpio.h>
+#include <linux/of_gpio.h>
+#include <linux/platform_data/ssm2518.h>
+#include <sound/core.h>
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include <sound/soc.h>
+#include <sound/initval.h>
+#include <sound/tlv.h>
+
+#include "ssm2518.h"
+
+#define SSM2518_REG_POWER1		0x00
+#define SSM2518_REG_CLOCK		0x01
+#define SSM2518_REG_SAI_CTRL1		0x02
+#define SSM2518_REG_SAI_CTRL2		0x03
+#define SSM2518_REG_CHAN_MAP		0x04
+#define SSM2518_REG_LEFT_VOL		0x05
+#define SSM2518_REG_RIGHT_VOL		0x06
+#define SSM2518_REG_MUTE_CTRL		0x07
+#define SSM2518_REG_FAULT_CTRL		0x08
+#define SSM2518_REG_POWER2		0x09
+#define SSM2518_REG_DRC_1		0x0a
+#define SSM2518_REG_DRC_2		0x0b
+#define SSM2518_REG_DRC_3		0x0c
+#define SSM2518_REG_DRC_4		0x0d
+#define SSM2518_REG_DRC_5		0x0e
+#define SSM2518_REG_DRC_6		0x0f
+#define SSM2518_REG_DRC_7		0x10
+#define SSM2518_REG_DRC_8		0x11
+#define SSM2518_REG_DRC_9		0x12
+
+#define SSM2518_POWER1_RESET			BIT(7)
+#define SSM2518_POWER1_NO_BCLK			BIT(5)
+#define SSM2518_POWER1_MCS_MASK			(0xf << 1)
+#define SSM2518_POWER1_MCS_64FS			(0x0 << 1)
+#define SSM2518_POWER1_MCS_128FS		(0x1 << 1)
+#define SSM2518_POWER1_MCS_256FS		(0x2 << 1)
+#define SSM2518_POWER1_MCS_384FS		(0x3 << 1)
+#define SSM2518_POWER1_MCS_512FS		(0x4 << 1)
+#define SSM2518_POWER1_MCS_768FS		(0x5 << 1)
+#define SSM2518_POWER1_MCS_100FS		(0x6 << 1)
+#define SSM2518_POWER1_MCS_200FS		(0x7 << 1)
+#define SSM2518_POWER1_MCS_400FS		(0x8 << 1)
+#define SSM2518_POWER1_SPWDN			BIT(0)
+
+#define SSM2518_CLOCK_ASR			BIT(0)
+
+#define SSM2518_SAI_CTRL1_FMT_MASK		(0x3 << 5)
+#define SSM2518_SAI_CTRL1_FMT_I2S		(0x0 << 5)
+#define SSM2518_SAI_CTRL1_FMT_LJ		(0x1 << 5)
+#define SSM2518_SAI_CTRL1_FMT_RJ_24BIT		(0x2 << 5)
+#define SSM2518_SAI_CTRL1_FMT_RJ_16BIT		(0x3 << 5)
+
+#define SSM2518_SAI_CTRL1_SAI_MASK		(0x7 << 2)
+#define SSM2518_SAI_CTRL1_SAI_I2S		(0x0 << 2)
+#define SSM2518_SAI_CTRL1_SAI_TDM_2		(0x1 << 2)
+#define SSM2518_SAI_CTRL1_SAI_TDM_4		(0x2 << 2)
+#define SSM2518_SAI_CTRL1_SAI_TDM_8		(0x3 << 2)
+#define SSM2518_SAI_CTRL1_SAI_TDM_16		(0x4 << 2)
+#define SSM2518_SAI_CTRL1_SAI_MONO		(0x5 << 2)
+
+#define SSM2518_SAI_CTRL1_FS_MASK		(0x3)
+#define SSM2518_SAI_CTRL1_FS_8000_12000		(0x0)
+#define SSM2518_SAI_CTRL1_FS_16000_24000	(0x1)
+#define SSM2518_SAI_CTRL1_FS_32000_48000	(0x2)
+#define SSM2518_SAI_CTRL1_FS_64000_96000	(0x3)
+
+#define SSM2518_SAI_CTRL2_BCLK_INTERAL		BIT(7)
+#define SSM2518_SAI_CTRL2_LRCLK_PULSE		BIT(6)
+#define SSM2518_SAI_CTRL2_LRCLK_INVERT		BIT(5)
+#define SSM2518_SAI_CTRL2_MSB			BIT(4)
+#define SSM2518_SAI_CTRL2_SLOT_WIDTH_MASK	(0x3 << 2)
+#define SSM2518_SAI_CTRL2_SLOT_WIDTH_32		(0x0 << 2)
+#define SSM2518_SAI_CTRL2_SLOT_WIDTH_24		(0x1 << 2)
+#define SSM2518_SAI_CTRL2_SLOT_WIDTH_16		(0x2 << 2)
+#define SSM2518_SAI_CTRL2_BCLK_INVERT		BIT(1)
+
+#define SSM2518_CHAN_MAP_RIGHT_SLOT_OFFSET	4
+#define SSM2518_CHAN_MAP_RIGHT_SLOT_MASK	0xf0
+#define SSM2518_CHAN_MAP_LEFT_SLOT_OFFSET	0
+#define SSM2518_CHAN_MAP_LEFT_SLOT_MASK		0x0f
+
+#define SSM2518_MUTE_CTRL_ANA_GAIN		BIT(5)
+#define SSM2518_MUTE_CTRL_MUTE_MASTER		BIT(0)
+
+#define SSM2518_POWER2_APWDN			BIT(0)
+
+#define SSM2518_DAC_MUTE			BIT(6)
+#define SSM2518_DAC_FS_MASK			0x07
+#define SSM2518_DAC_FS_8000			0x00
+#define SSM2518_DAC_FS_16000			0x01
+#define SSM2518_DAC_FS_32000			0x02
+#define SSM2518_DAC_FS_64000			0x03
+#define SSM2518_DAC_FS_128000			0x04
+
+struct ssm2518 {
+	struct regmap *regmap;
+	bool right_j;
+
+	unsigned int sysclk;
+	const struct snd_pcm_hw_constraint_list *constraints;
+
+	int enable_gpio;
+};
+
+static const struct reg_default ssm2518_reg_defaults[] = {
+	{ 0x00, 0x05 },
+	{ 0x01, 0x00 },
+	{ 0x02, 0x02 },
+	{ 0x03, 0x00 },
+	{ 0x04, 0x10 },
+	{ 0x05, 0x40 },
+	{ 0x06, 0x40 },
+	{ 0x07, 0x81 },
+	{ 0x08, 0x0c },
+	{ 0x09, 0x99 },
+	{ 0x0a, 0x7c },
+	{ 0x0b, 0x5b },
+	{ 0x0c, 0x57 },
+	{ 0x0d, 0x89 },
+	{ 0x0e, 0x8c },
+	{ 0x0f, 0x77 },
+	{ 0x10, 0x26 },
+	{ 0x11, 0x1c },
+	{ 0x12, 0x97 },
+};
+
+static const DECLARE_TLV_DB_MINMAX_MUTE(ssm2518_vol_tlv, -7125, 2400);
+static const DECLARE_TLV_DB_SCALE(ssm2518_compressor_tlv, -3400, 200, 0);
+static const DECLARE_TLV_DB_SCALE(ssm2518_expander_tlv, -8100, 300, 0);
+static const DECLARE_TLV_DB_SCALE(ssm2518_noise_gate_tlv, -9600, 300, 0);
+static const DECLARE_TLV_DB_SCALE(ssm2518_post_drc_tlv, -2400, 300, 0);
+
+static const DECLARE_TLV_DB_RANGE(ssm2518_limiter_tlv,
+	0, 7, TLV_DB_SCALE_ITEM(-2200, 200, 0),
+	7, 15, TLV_DB_SCALE_ITEM(-800, 100, 0),
+);
+
+static const char * const ssm2518_drc_peak_detector_attack_time_text[] = {
+	"0 ms", "0.1 ms", "0.19 ms", "0.37 ms", "0.75 ms", "1.5 ms", "3 ms",
+	"6 ms", "12 ms", "24 ms", "48 ms", "96 ms", "192 ms", "384 ms",
+	"768 ms", "1536 ms",
+};
+
+static const char * const ssm2518_drc_peak_detector_release_time_text[] = {
+	"0 ms", "1.5 ms", "3 ms", "6 ms", "12 ms", "24 ms", "48 ms", "96 ms",
+	"192 ms", "384 ms", "768 ms", "1536 ms", "3072 ms", "6144 ms",
+	"12288 ms", "24576 ms"
+};
+
+static const char * const ssm2518_drc_hold_time_text[] = {
+	"0 ms", "0.67 ms", "1.33 ms", "2.67 ms", "5.33 ms", "10.66 ms",
+	"21.32 ms", "42.64 ms", "85.28 ms", "170.56 ms", "341.12 ms",
+	"682.24 ms", "1364 ms",
+};
+
+static const SOC_ENUM_SINGLE_DECL(ssm2518_drc_peak_detector_attack_time_enum,
+	SSM2518_REG_DRC_2, 4, ssm2518_drc_peak_detector_attack_time_text);
+static const SOC_ENUM_SINGLE_DECL(ssm2518_drc_peak_detector_release_time_enum,
+	SSM2518_REG_DRC_2, 0, ssm2518_drc_peak_detector_release_time_text);
+static const SOC_ENUM_SINGLE_DECL(ssm2518_drc_attack_time_enum,
+	SSM2518_REG_DRC_6, 4, ssm2518_drc_peak_detector_attack_time_text);
+static const SOC_ENUM_SINGLE_DECL(ssm2518_drc_decay_time_enum,
+	SSM2518_REG_DRC_6, 0, ssm2518_drc_peak_detector_release_time_text);
+static const SOC_ENUM_SINGLE_DECL(ssm2518_drc_hold_time_enum,
+	SSM2518_REG_DRC_7, 4, ssm2518_drc_hold_time_text);
+static const SOC_ENUM_SINGLE_DECL(ssm2518_drc_noise_gate_hold_time_enum,
+	SSM2518_REG_DRC_7, 0, ssm2518_drc_hold_time_text);
+static const SOC_ENUM_SINGLE_DECL(ssm2518_drc_rms_averaging_time_enum,
+	SSM2518_REG_DRC_9, 0, ssm2518_drc_peak_detector_release_time_text);
+
+static const struct snd_kcontrol_new ssm2518_snd_controls[] = {
+	SOC_SINGLE("Playback De-emphasis Switch", SSM2518_REG_MUTE_CTRL,
+			4, 1, 0),
+	SOC_DOUBLE_R_TLV("Master Playback Volume", SSM2518_REG_LEFT_VOL,
+			SSM2518_REG_RIGHT_VOL, 0, 0xff, 1, ssm2518_vol_tlv),
+	SOC_DOUBLE("Master Playback Switch", SSM2518_REG_MUTE_CTRL, 2, 1, 1, 1),
+
+	SOC_SINGLE("Amp Low Power Mode Switch", SSM2518_REG_POWER2, 4, 1, 0),
+	SOC_SINGLE("DAC Low Power Mode Switch", SSM2518_REG_POWER2, 3, 1, 0),
+
+	SOC_SINGLE("DRC Limiter Switch", SSM2518_REG_DRC_1, 5, 1, 0),
+	SOC_SINGLE("DRC Compressor Switch", SSM2518_REG_DRC_1, 4, 1, 0),
+	SOC_SINGLE("DRC Expander Switch", SSM2518_REG_DRC_1, 3, 1, 0),
+	SOC_SINGLE("DRC Noise Gate Switch", SSM2518_REG_DRC_1, 2, 1, 0),
+	SOC_DOUBLE("DRC Switch", SSM2518_REG_DRC_1, 0, 1, 1, 0),
+
+	SOC_SINGLE_TLV("DRC Limiter Threshold Volume",
+			SSM2518_REG_DRC_3, 4, 15, 1, ssm2518_limiter_tlv),
+	SOC_SINGLE_TLV("DRC Compressor Lower Threshold Volume",
+			SSM2518_REG_DRC_3, 0, 15, 1, ssm2518_compressor_tlv),
+	SOC_SINGLE_TLV("DRC Expander Upper Threshold Volume", SSM2518_REG_DRC_4,
+			4, 15, 1, ssm2518_expander_tlv),
+	SOC_SINGLE_TLV("DRC Noise Gate Threshold Volume",
+			SSM2518_REG_DRC_4, 0, 15, 1, ssm2518_noise_gate_tlv),
+	SOC_SINGLE_TLV("DRC Upper Output Threshold Volume",
+			SSM2518_REG_DRC_5, 4, 15, 1, ssm2518_limiter_tlv),
+	SOC_SINGLE_TLV("DRC Lower Output Threshold Volume",
+			SSM2518_REG_DRC_5, 0, 15, 1, ssm2518_noise_gate_tlv),
+	SOC_SINGLE_TLV("DRC Post Volume", SSM2518_REG_DRC_8,
+			2, 15, 1, ssm2518_post_drc_tlv),
+
+	SOC_ENUM("DRC Peak Detector Attack Time",
+		ssm2518_drc_peak_detector_attack_time_enum),
+	SOC_ENUM("DRC Peak Detector Release Time",
+		ssm2518_drc_peak_detector_release_time_enum),
+	SOC_ENUM("DRC Attack Time", ssm2518_drc_attack_time_enum),
+	SOC_ENUM("DRC Decay Time", ssm2518_drc_decay_time_enum),
+	SOC_ENUM("DRC Hold Time", ssm2518_drc_hold_time_enum),
+	SOC_ENUM("DRC Noise Gate Hold Time",
+		ssm2518_drc_noise_gate_hold_time_enum),
+	SOC_ENUM("DRC RMS Averaging Time", ssm2518_drc_rms_averaging_time_enum),
+};
+
+static const struct snd_soc_dapm_widget ssm2518_dapm_widgets[] = {
+	SND_SOC_DAPM_DAC("DACL", "HiFi Playback", SSM2518_REG_POWER2, 1, 1),
+	SND_SOC_DAPM_DAC("DACR", "HiFi Playback", SSM2518_REG_POWER2, 2, 1),
+
+	SND_SOC_DAPM_OUTPUT("OUTL"),
+	SND_SOC_DAPM_OUTPUT("OUTR"),
+};
+
+static const struct snd_soc_dapm_route ssm2518_routes[] = {
+	{ "OUTL", NULL, "DACL" },
+	{ "OUTR", NULL, "DACR" },
+};
+
+struct ssm2518_mcs_lut {
+	unsigned int rate;
+	const unsigned int *sysclks;
+};
+
+static const unsigned int ssm2518_sysclks_2048000[] = {
+	2048000, 4096000, 8192000, 12288000, 16384000, 24576000,
+	3200000, 6400000, 12800000, 0
+};
+
+static const unsigned int ssm2518_sysclks_2822000[] = {
+	2822000, 5644800, 11289600, 16934400, 22579200, 33868800,
+	4410000, 8820000, 17640000, 0
+};
+
+static const unsigned int ssm2518_sysclks_3072000[] = {
+	3072000, 6144000, 12288000, 16384000, 24576000, 38864000,
+	4800000, 9600000, 19200000, 0
+};
+
+static const struct ssm2518_mcs_lut ssm2518_mcs_lut[] = {
+	{ 8000,  ssm2518_sysclks_2048000, },
+	{ 11025, ssm2518_sysclks_2822000, },
+	{ 12000, ssm2518_sysclks_3072000, },
+	{ 16000, ssm2518_sysclks_2048000, },
+	{ 24000, ssm2518_sysclks_3072000, },
+	{ 22050, ssm2518_sysclks_2822000, },
+	{ 32000, ssm2518_sysclks_2048000, },
+	{ 44100, ssm2518_sysclks_2822000, },
+	{ 48000, ssm2518_sysclks_3072000, },
+	{ 96000, ssm2518_sysclks_3072000, },
+};
+
+static const unsigned int ssm2518_rates_2048000[] = {
+	8000, 16000, 32000,
+};
+
+static const struct snd_pcm_hw_constraint_list ssm2518_constraints_2048000 = {
+	.list = ssm2518_rates_2048000,
+	.count = ARRAY_SIZE(ssm2518_rates_2048000),
+};
+
+static const unsigned int ssm2518_rates_2822000[] = {
+	11025, 22050, 44100,
+};
+
+static const struct snd_pcm_hw_constraint_list ssm2518_constraints_2822000 = {
+	.list = ssm2518_rates_2822000,
+	.count = ARRAY_SIZE(ssm2518_rates_2822000),
+};
+
+static const unsigned int ssm2518_rates_3072000[] = {
+	12000, 24000, 48000, 96000,
+};
+
+static const struct snd_pcm_hw_constraint_list ssm2518_constraints_3072000 = {
+	.list = ssm2518_rates_3072000,
+	.count = ARRAY_SIZE(ssm2518_rates_3072000),
+};
+
+static const unsigned int ssm2518_rates_12288000[] = {
+	8000, 12000, 16000, 24000, 32000, 48000, 96000,
+};
+
+static const struct snd_pcm_hw_constraint_list ssm2518_constraints_12288000 = {
+	.list = ssm2518_rates_12288000,
+	.count = ARRAY_SIZE(ssm2518_rates_12288000),
+};
+
+static unsigned int ssm2518_lookup_mcs(struct ssm2518 *ssm2518,
+	unsigned int rate)
+{
+	const unsigned int *sysclks = NULL;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ssm2518_mcs_lut); i++) {
+		if (ssm2518_mcs_lut[i].rate == rate) {
+			sysclks = ssm2518_mcs_lut[i].sysclks;
+			break;
+		}
+	}
+
+	if (!sysclks)
+		return -EINVAL;
+
+	for (i = 0; sysclks[i]; i++) {
+		if (sysclks[i] == ssm2518->sysclk)
+			return i;
+	}
+
+	return -EINVAL;
+}
+
+static int ssm2518_hw_params(struct snd_pcm_substream *substream,
+	struct snd_pcm_hw_params *params, struct snd_soc_dai *dai)
+{
+	struct snd_soc_codec *codec = dai->codec;
+	struct ssm2518 *ssm2518 = snd_soc_codec_get_drvdata(codec);
+	unsigned int rate = params_rate(params);
+	unsigned int ctrl1, ctrl1_mask;
+	int mcs;
+	int ret;
+
+	mcs = ssm2518_lookup_mcs(ssm2518, rate);
+	if (mcs < 0)
+		return mcs;
+
+	ctrl1_mask = SSM2518_SAI_CTRL1_FS_MASK;
+
+	if (rate >= 8000 && rate <= 12000)
+		ctrl1 = SSM2518_SAI_CTRL1_FS_8000_12000;
+	else if (rate >= 16000 && rate <= 24000)
+		ctrl1 = SSM2518_SAI_CTRL1_FS_16000_24000;
+	else if (rate >= 32000 && rate <= 48000)
+		ctrl1 = SSM2518_SAI_CTRL1_FS_32000_48000;
+	else if (rate >= 64000 && rate <= 96000)
+		ctrl1 = SSM2518_SAI_CTRL1_FS_64000_96000;
+	else
+		return -EINVAL;
+
+	if (ssm2518->right_j) {
+		switch (params_format(params)) {
+		case SNDRV_PCM_FORMAT_S16_LE:
+			ctrl1 |= SSM2518_SAI_CTRL1_FMT_RJ_16BIT;
+			break;
+		case SNDRV_PCM_FORMAT_S24_LE:
+			ctrl1 |= SSM2518_SAI_CTRL1_FMT_RJ_24BIT;
+			break;
+		default:
+			return -EINVAL;
+		}
+		ctrl1_mask |= SSM2518_SAI_CTRL1_FMT_MASK;
+	}
+
+	/* Disable auto samplerate detection */
+	ret = regmap_update_bits(ssm2518->regmap, SSM2518_REG_CLOCK,
+				SSM2518_CLOCK_ASR, SSM2518_CLOCK_ASR);
+	if (ret < 0)
+		return ret;
+
+	ret = regmap_update_bits(ssm2518->regmap, SSM2518_REG_SAI_CTRL1,
+				ctrl1_mask, ctrl1);
+	if (ret < 0)
+		return ret;
+
+	return regmap_update_bits(ssm2518->regmap, SSM2518_REG_POWER1,
+				SSM2518_POWER1_MCS_MASK, mcs << 1);
+}
+
+static int ssm2518_mute(struct snd_soc_dai *dai, int mute)
+{
+	struct ssm2518 *ssm2518 = snd_soc_codec_get_drvdata(dai->codec);
+	unsigned int val;
+
+	if (mute)
+		val = SSM2518_MUTE_CTRL_MUTE_MASTER;
+	else
+		val = 0;
+
+	return regmap_update_bits(ssm2518->regmap, SSM2518_REG_MUTE_CTRL,
+			SSM2518_MUTE_CTRL_MUTE_MASTER, val);
+}
+
+static int ssm2518_set_dai_fmt(struct snd_soc_dai *dai, unsigned int fmt)
+{
+	struct ssm2518 *ssm2518 = snd_soc_codec_get_drvdata(dai->codec);
+	unsigned int ctrl1 = 0, ctrl2 = 0;
+	bool invert_fclk;
+	int ret;
+
+	switch (fmt & SND_SOC_DAIFMT_MASTER_MASK) {
+	case SND_SOC_DAIFMT_CBS_CFS:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	switch (fmt & SND_SOC_DAIFMT_INV_MASK) {
+	case SND_SOC_DAIFMT_NB_NF:
+		invert_fclk = false;
+		break;
+	case SND_SOC_DAIFMT_IB_NF:
+		ctrl2 |= SSM2518_SAI_CTRL2_BCLK_INVERT;
+		invert_fclk = false;
+		break;
+	case SND_SOC_DAIFMT_NB_IF:
+		invert_fclk = true;
+		break;
+	case SND_SOC_DAIFMT_IB_IF:
+		ctrl2 |= SSM2518_SAI_CTRL2_BCLK_INVERT;
+		invert_fclk = true;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ssm2518->right_j = false;
+	switch (fmt & SND_SOC_DAIFMT_FORMAT_MASK) {
+	case SND_SOC_DAIFMT_I2S:
+		ctrl1 |= SSM2518_SAI_CTRL1_FMT_I2S;
+		break;
+	case SND_SOC_DAIFMT_LEFT_J:
+		ctrl1 |= SSM2518_SAI_CTRL1_FMT_LJ;
+		invert_fclk = !invert_fclk;
+		break;
+	case SND_SOC_DAIFMT_RIGHT_J:
+		ctrl1 |= SSM2518_SAI_CTRL1_FMT_RJ_24BIT;
+		ssm2518->right_j = true;
+		invert_fclk = !invert_fclk;
+		break;
+	case SND_SOC_DAIFMT_DSP_A:
+		ctrl2 |= SSM2518_SAI_CTRL2_LRCLK_PULSE;
+		ctrl1 |= SSM2518_SAI_CTRL1_FMT_I2S;
+		invert_fclk = false;
+		break;
+	case SND_SOC_DAIFMT_DSP_B:
+		ctrl2 |= SSM2518_SAI_CTRL2_LRCLK_PULSE;
+		ctrl1 |= SSM2518_SAI_CTRL1_FMT_LJ;
+		invert_fclk = false;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (invert_fclk)
+		ctrl2 |= SSM2518_SAI_CTRL2_LRCLK_INVERT;
+
+	ret = regmap_write(ssm2518->regmap, SSM2518_REG_SAI_CTRL1, ctrl1);
+	if (ret)
+		return ret;
+
+	return regmap_write(ssm2518->regmap, SSM2518_REG_SAI_CTRL2, ctrl2);
+}
+
+static int ssm2518_set_power(struct ssm2518 *ssm2518, bool enable)
+{
+	int ret = 0;
+
+	if (!enable) {
+		ret = regmap_update_bits(ssm2518->regmap, SSM2518_REG_POWER1,
+			SSM2518_POWER1_SPWDN, SSM2518_POWER1_SPWDN);
+		regcache_mark_dirty(ssm2518->regmap);
+	}
+
+	if (gpio_is_valid(ssm2518->enable_gpio))
+		gpio_set_value(ssm2518->enable_gpio, enable);
+
+	regcache_cache_only(ssm2518->regmap, !enable);
+
+	if (enable) {
+		ret = regmap_update_bits(ssm2518->regmap, SSM2518_REG_POWER1,
+			SSM2518_POWER1_SPWDN | SSM2518_POWER1_RESET, 0x00);
+		regcache_sync(ssm2518->regmap);
+	}
+
+	return ret;
+}
+
+static int ssm2518_set_bias_level(struct snd_soc_codec *codec,
+	enum snd_soc_bias_level level)
+{
+	struct ssm2518 *ssm2518 = snd_soc_codec_get_drvdata(codec);
+	int ret = 0;
+
+	switch (level) {
+	case SND_SOC_BIAS_ON:
+		break;
+	case SND_SOC_BIAS_PREPARE:
+		break;
+	case SND_SOC_BIAS_STANDBY:
+		if (codec->dapm.bias_level == SND_SOC_BIAS_OFF)
+			ret = ssm2518_set_power(ssm2518, true);
+		break;
+	case SND_SOC_BIAS_OFF:
+		ret = ssm2518_set_power(ssm2518, false);
+		break;
+	}
+
+	if (ret)
+		return ret;
+
+	codec->dapm.bias_level = level;
+
+	return 0;
+}
+
+static int ssm2518_set_tdm_slot(struct snd_soc_dai *dai, unsigned int tx_mask,
+	unsigned int rx_mask, int slots, int width)
+{
+	struct ssm2518 *ssm2518 = snd_soc_codec_get_drvdata(dai->codec);
+	unsigned int ctrl1, ctrl2;
+	int left_slot, right_slot;
+	int ret;
+
+	if (slots == 0)
+		return regmap_update_bits(ssm2518->regmap,
+			SSM2518_REG_SAI_CTRL1, SSM2518_SAI_CTRL1_SAI_MASK,
+			SSM2518_SAI_CTRL1_SAI_I2S);
+
+	if (tx_mask == 0 || tx_mask != 0)
+		return -EINVAL;
+
+	if (slots == 1) {
+		if (tx_mask != 1)
+			return -EINVAL;
+		left_slot = 0;
+		right_slot = 0;
+	} else {
+		/* We assume the left channel < right channel */
+		left_slot = ffs(tx_mask);
+		tx_mask &= ~(1 << tx_mask);
+		if (tx_mask == 0) {
+			right_slot = left_slot;
+		} else {
+			right_slot = ffs(tx_mask);
+			tx_mask &= ~(1 << tx_mask);
+		}
+	}
+
+	if (tx_mask != 0 || left_slot >= slots || right_slot >= slots)
+		return -EINVAL;
+
+	switch (width) {
+	case 16:
+		ctrl2 = SSM2518_SAI_CTRL2_SLOT_WIDTH_16;
+		break;
+	case 24:
+		ctrl2 = SSM2518_SAI_CTRL2_SLOT_WIDTH_24;
+		break;
+	case 32:
+		ctrl2 = SSM2518_SAI_CTRL2_SLOT_WIDTH_32;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	switch (slots) {
+	case 1:
+		ctrl1 = SSM2518_SAI_CTRL1_SAI_MONO;
+		break;
+	case 2:
+		ctrl1 = SSM2518_SAI_CTRL1_SAI_TDM_2;
+		break;
+	case 4:
+		ctrl1 = SSM2518_SAI_CTRL1_SAI_TDM_4;
+		break;
+	case 8:
+		ctrl1 = SSM2518_SAI_CTRL1_SAI_TDM_8;
+		break;
+	case 16:
+		ctrl1 = SSM2518_SAI_CTRL1_SAI_TDM_16;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ret = regmap_write(ssm2518->regmap, SSM2518_REG_CHAN_MAP,
+		(left_slot << SSM2518_CHAN_MAP_LEFT_SLOT_OFFSET) |
+		(right_slot << SSM2518_CHAN_MAP_RIGHT_SLOT_OFFSET));
+	if (ret)
+		return ret;
+
+	ret = regmap_update_bits(ssm2518->regmap, SSM2518_REG_SAI_CTRL1,
+		SSM2518_SAI_CTRL1_SAI_MASK, ctrl1);
+	if (ret)
+		return ret;
+
+	return regmap_update_bits(ssm2518->regmap, SSM2518_REG_SAI_CTRL2,
+		SSM2518_SAI_CTRL2_SLOT_WIDTH_MASK, ctrl2);
+}
+
+static int ssm2518_startup(struct snd_pcm_substream *substream,
+	struct snd_soc_dai *dai)
+{
+	struct ssm2518 *ssm2518 = snd_soc_codec_get_drvdata(dai->codec);
+
+	if (ssm2518->constraints)
+		snd_pcm_hw_constraint_list(substream->runtime, 0,
+				SNDRV_PCM_HW_PARAM_RATE, ssm2518->constraints);
+
+	return 0;
+}
+
+#define SSM2518_FORMATS (SNDRV_PCM_FMTBIT_S8 | SNDRV_PCM_FMTBIT_S16_LE | \
+			SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_S32)
+
+static const struct snd_soc_dai_ops ssm2518_dai_ops = {
+	.startup = ssm2518_startup,
+	.hw_params	= ssm2518_hw_params,
+	.digital_mute	= ssm2518_mute,
+	.set_fmt	= ssm2518_set_dai_fmt,
+	.set_tdm_slot	= ssm2518_set_tdm_slot,
+};
+
+static struct snd_soc_dai_driver ssm2518_dai = {
+	.name = "ssm2518-hifi",
+	.playback = {
+		.stream_name = "Playback",
+		.channels_min = 2,
+		.channels_max = 2,
+		.rates = SNDRV_PCM_RATE_8000_96000,
+		.formats = SSM2518_FORMATS,
+	},
+	.ops = &ssm2518_dai_ops,
+};
+
+static int ssm2518_probe(struct snd_soc_codec *codec)
+{
+	struct ssm2518 *ssm2518 = snd_soc_codec_get_drvdata(codec);
+	int ret;
+
+	codec->control_data = ssm2518->regmap;
+	ret = snd_soc_codec_set_cache_io(codec, 0, 0, SND_SOC_REGMAP);
+	if (ret < 0) {
+		dev_err(codec->dev, "Failed to set cache I/O: %d\n", ret);
+		return ret;
+	}
+
+	return ssm2518_set_bias_level(codec, SND_SOC_BIAS_OFF);
+}
+
+static int ssm2518_remove(struct snd_soc_codec *codec)
+{
+	ssm2518_set_bias_level(codec, SND_SOC_BIAS_OFF);
+	return 0;
+}
+
+static int ssm2518_set_sysclk(struct snd_soc_codec *codec, int clk_id,
+	int source, unsigned int freq, int dir)
+{
+	struct ssm2518 *ssm2518 = snd_soc_codec_get_drvdata(codec);
+	unsigned int val;
+
+	if (clk_id != SSM2518_SYSCLK)
+		return -EINVAL;
+
+	switch (source) {
+	case SSM2518_SYSCLK_SRC_MCLK:
+		val = 0;
+		break;
+	case SSM2518_SYSCLK_SRC_BCLK:
+		/* In this case the bitclock is used as the system clock, and
+		 * the bitclock signal needs to be connected to the MCLK pin and
+		 * the BCLK pin is left unconnected */
+		val = SSM2518_POWER1_NO_BCLK;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	switch (freq) {
+	case 0:
+		ssm2518->constraints = NULL;
+		break;
+	case 2048000:
+	case 4096000:
+	case 8192000:
+	case 3200000:
+	case 6400000:
+	case 12800000:
+		ssm2518->constraints = &ssm2518_constraints_2048000;
+		break;
+	case 2822000:
+	case 5644800:
+	case 11289600:
+	case 16934400:
+	case 22579200:
+	case 33868800:
+	case 4410000:
+	case 8820000:
+	case 17640000:
+		ssm2518->constraints = &ssm2518_constraints_2822000;
+		break;
+	case 3072000:
+	case 6144000:
+	case 38864000:
+	case 4800000:
+	case 9600000:
+	case 19200000:
+		ssm2518->constraints = &ssm2518_constraints_3072000;
+		break;
+	case 12288000:
+	case 16384000:
+	case 24576000:
+		ssm2518->constraints = &ssm2518_constraints_12288000;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ssm2518->sysclk = freq;
+
+	return regmap_update_bits(ssm2518->regmap, SSM2518_REG_POWER1,
+			SSM2518_POWER1_NO_BCLK, val);
+}
+
+static struct snd_soc_codec_driver ssm2518_codec_driver = {
+	.probe = ssm2518_probe,
+	.remove = ssm2518_remove,
+	.set_bias_level = ssm2518_set_bias_level,
+	.set_sysclk = ssm2518_set_sysclk,
+	.idle_bias_off = true,
+
+	.controls = ssm2518_snd_controls,
+	.num_controls = ARRAY_SIZE(ssm2518_snd_controls),
+	.dapm_widgets = ssm2518_dapm_widgets,
+	.num_dapm_widgets = ARRAY_SIZE(ssm2518_dapm_widgets),
+	.dapm_routes = ssm2518_routes,
+	.num_dapm_routes = ARRAY_SIZE(ssm2518_routes),
+};
+
+static bool ssm2518_register_volatile(struct device *dev, unsigned int reg)
+{
+	return false;
+}
+
+static const struct regmap_config ssm2518_regmap_config = {
+	.val_bits = 8,
+	.reg_bits = 8,
+
+	.max_register = SSM2518_REG_DRC_9,
+	.volatile_reg = ssm2518_register_volatile,
+
+	.cache_type = REGCACHE_RBTREE,
+	.reg_defaults = ssm2518_reg_defaults,
+	.num_reg_defaults = ARRAY_SIZE(ssm2518_reg_defaults),
+};
+
+static int ssm2518_i2c_probe(struct i2c_client *i2c,
+	const struct i2c_device_id *id)
+{
+	struct ssm2518_platform_data *pdata = i2c->dev.platform_data;
+	struct ssm2518 *ssm2518;
+	int ret;
+
+	ssm2518 = devm_kzalloc(&i2c->dev, sizeof(*ssm2518), GFP_KERNEL);
+	if (ssm2518 == NULL)
+		return -ENOMEM;
+
+	if (pdata) {
+		ssm2518->enable_gpio = pdata->enable_gpio;
+	} else if (i2c->dev.of_node) {
+		ssm2518->enable_gpio = of_get_gpio(i2c->dev.of_node, 0);
+		if (ssm2518->enable_gpio < 0 && ssm2518->enable_gpio != -ENOENT)
+			return ssm2518->enable_gpio;
+	} else {
+		ssm2518->enable_gpio = -1;
+	}
+
+	if (gpio_is_valid(ssm2518->enable_gpio)) {
+		ret = devm_gpio_request_one(&i2c->dev, ssm2518->enable_gpio,
+				GPIOF_OUT_INIT_HIGH, "SSM2518 nSD");
+		if (ret)
+			return ret;
+	}
+
+	i2c_set_clientdata(i2c, ssm2518);
+
+	ssm2518->regmap = devm_regmap_init_i2c(i2c, &ssm2518_regmap_config);
+	if (IS_ERR(ssm2518->regmap))
+		return PTR_ERR(ssm2518->regmap);
+
+	/*
+	 * The reset bit is obviously volatile, but we need to be able to cache
+	 * the other bits in the register, so we can't just mark the whole
+	 * register as volatile. Since this is the only place where we'll ever
+	 * touch the reset bit just bypass the cache for this operation.
+	 */
+	regcache_cache_bypass(ssm2518->regmap, true);
+	ret = regmap_write(ssm2518->regmap, SSM2518_REG_POWER1,
+			SSM2518_POWER1_RESET);
+	regcache_cache_bypass(ssm2518->regmap, false);
+	if (ret)
+		return ret;
+
+	ret = regmap_update_bits(ssm2518->regmap, SSM2518_REG_POWER2,
+				SSM2518_POWER2_APWDN, 0x00);
+	if (ret)
+		return ret;
+
+	ret = ssm2518_set_power(ssm2518, false);
+	if (ret)
+		return ret;
+
+	return snd_soc_register_codec(&i2c->dev, &ssm2518_codec_driver,
+			&ssm2518_dai, 1);
+}
+
+static int ssm2518_i2c_remove(struct i2c_client *client)
+{
+	snd_soc_unregister_codec(&client->dev);
+	return 0;
+}
+
+static const struct i2c_device_id ssm2518_i2c_ids[] = {
+	{ "ssm2518", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, ssm2518_i2c_ids);
+
+static struct i2c_driver ssm2518_driver = {
+	.driver = {
+		.name = "ssm2518",
+		.owner = THIS_MODULE,
+	},
+	.probe = ssm2518_i2c_probe,
+	.remove = ssm2518_i2c_remove,
+	.id_table = ssm2518_i2c_ids,
+};
+module_i2c_driver(ssm2518_driver);
+
+MODULE_DESCRIPTION("ASoC SSM2518 driver");
+MODULE_AUTHOR("Lars-Peter Clausen <lars@metafoo.de>");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/codecs/ssm2518.h b/sound/soc/codecs/ssm2518.h
new file mode 100644
index 000000000000..62511d80518e
--- /dev/null
+++ b/sound/soc/codecs/ssm2518.h
@@ -0,0 +1,20 @@
+/*
+ * SSM2518 amplifier audio driver
+ *
+ * Copyright 2013 Analog Devices Inc.
+ *  Author: Lars-Peter Clausen <lars@metafoo.de>
+ *
+ * Licensed under the GPL-2.
+ */
+
+#ifndef __SND_SOC_CODECS_SSM2518_H__
+#define __SND_SOC_CODECS_SSM2518_H__
+
+#define SSM2518_SYSCLK 0
+
+enum ssm2518_sysclk_src {
+	SSM2518_SYSCLK_SRC_MCLK = 0,
+	SSM2518_SYSCLK_SRC_BCLK = 1,
+};
+
+#endif
-- 
cgit v1.3-7-g2ca7


From 944e9a0316e60bc5bc122e46c1fde36e5f6e9f56 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 16 May 2013 05:09:57 +0000
Subject: cpufreq: governors: Move get_governor_parent_kobj() to cpufreq.c

get_governor_parent_kobj() can be used by any governor, generic
cpufreq governors or platform specific ones and so must be present in
cpufreq.c instead of cpufreq_governor.c.

This patch moves it to cpufreq.c. This also adds
EXPORT_SYMBOL_GPL(get_governor_parent_kobj) so that modules can use
this function too.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c          | 9 +++++++++
 drivers/cpufreq/cpufreq_governor.c | 8 --------
 include/linux/cpufreq.h            | 1 +
 3 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 2f254691ed1f..3faf62bdbad0 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -134,6 +134,15 @@ bool have_governor_per_policy(void)
 }
 EXPORT_SYMBOL_GPL(have_governor_per_policy);
 
+struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
+{
+	if (have_governor_per_policy())
+		return &policy->kobj;
+	else
+		return cpufreq_global_kobject;
+}
+EXPORT_SYMBOL_GPL(get_governor_parent_kobj);
+
 static struct cpufreq_policy *__cpufreq_cpu_get(unsigned int cpu, bool sysfs)
 {
 	struct cpufreq_policy *data;
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 5af40ad82d23..d1421b498c76 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -29,14 +29,6 @@
 
 #include "cpufreq_governor.h"
 
-static struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
-{
-	if (have_governor_per_policy())
-		return &policy->kobj;
-	else
-		return cpufreq_global_kobject;
-}
-
 static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data)
 {
 	if (have_governor_per_policy())
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 037d36ae63e5..dd1a5d41357b 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -340,6 +340,7 @@ const char *cpufreq_get_current_driver(void);
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
 int cpufreq_update_policy(unsigned int cpu);
 bool have_governor_per_policy(void);
+struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 
 #ifdef CONFIG_CPU_FREQ
 /* query the current CPU frequency (in kHz). If zero, cpufreq couldn't detect it */
-- 
cgit v1.3-7-g2ca7


From 72a4ce340a7ebf39e1c6fdc8f5feb4f974d6c635 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 17 May 2013 11:26:32 +0000
Subject: cpufreq: Move get_cpu_idle_time() to cpufreq.c

Governors other than ondemand and conservative can also use
get_cpu_idle_time() and they aren't required to compile
cpufreq_governor.c. So, move these independent routines to
cpufreq.c instead.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c          | 38 ++++++++++++++++++++++++++++++++++++++
 drivers/cpufreq/cpufreq_governor.c | 36 ------------------------------------
 drivers/cpufreq/cpufreq_governor.h |  1 -
 include/linux/cpufreq.h            |  1 +
 4 files changed, 39 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 3faf62bdbad0..c6ab21880c07 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -17,7 +17,9 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
+#include <asm/cputime.h>
 #include <linux/kernel.h>
+#include <linux/kernel_stat.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/notifier.h>
@@ -25,6 +27,7 @@
 #include <linux/delay.h>
 #include <linux/interrupt.h>
 #include <linux/spinlock.h>
+#include <linux/tick.h>
 #include <linux/device.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
@@ -143,6 +146,41 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy)
 }
 EXPORT_SYMBOL_GPL(get_governor_parent_kobj);
 
+static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
+{
+	u64 idle_time;
+	u64 cur_wall_time;
+	u64 busy_time;
+
+	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
+
+	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
+	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
+
+	idle_time = cur_wall_time - busy_time;
+	if (wall)
+		*wall = cputime_to_usecs(cur_wall_time);
+
+	return cputime_to_usecs(idle_time);
+}
+
+u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy)
+{
+	u64 idle_time = get_cpu_idle_time_us(cpu, io_busy ? wall : NULL);
+
+	if (idle_time == -1ULL)
+		return get_cpu_idle_time_jiffy(cpu, wall);
+	else if (!io_busy)
+		idle_time += get_cpu_iowait_time_us(cpu, wall);
+
+	return idle_time;
+}
+EXPORT_SYMBOL_GPL(get_cpu_idle_time);
+
 static struct cpufreq_policy *__cpufreq_cpu_get(unsigned int cpu, bool sysfs)
 {
 	struct cpufreq_policy *data;
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index d1421b498c76..b6cfd55d8266 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -23,7 +23,6 @@
 #include <linux/kernel_stat.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/tick.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
 
@@ -37,41 +36,6 @@ static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data)
 		return dbs_data->cdata->attr_group_gov_sys;
 }
 
-static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
-{
-	u64 idle_time;
-	u64 cur_wall_time;
-	u64 busy_time;
-
-	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
-
-	busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
-	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
-	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
-	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
-	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
-	busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
-
-	idle_time = cur_wall_time - busy_time;
-	if (wall)
-		*wall = cputime_to_usecs(cur_wall_time);
-
-	return cputime_to_usecs(idle_time);
-}
-
-u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy)
-{
-	u64 idle_time = get_cpu_idle_time_us(cpu, io_busy ? wall : NULL);
-
-	if (idle_time == -1ULL)
-		return get_cpu_idle_time_jiffy(cpu, wall);
-	else if (!io_busy)
-		idle_time += get_cpu_iowait_time_us(cpu, wall);
-
-	return idle_time;
-}
-EXPORT_SYMBOL_GPL(get_cpu_idle_time);
-
 void dbs_check_cpu(struct dbs_data *dbs_data, int cpu)
 {
 	struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index e16a96130cb3..e7bbf767380d 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -256,7 +256,6 @@ static ssize_t show_sampling_rate_min_gov_pol				\
 	return sprintf(buf, "%u\n", dbs_data->min_sampling_rate);	\
 }
 
-u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
 void dbs_check_cpu(struct dbs_data *dbs_data, int cpu);
 bool need_load_eval(struct cpu_dbs_common_info *cdbs,
 		unsigned int sampling_rate);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index dd1a5d41357b..fbf392aaa02e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -337,6 +337,7 @@ const char *cpufreq_get_current_driver(void);
 /*********************************************************************
  *                        CPUFREQ 2.6. INTERFACE                     *
  *********************************************************************/
+u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
 int cpufreq_update_policy(unsigned int cpu);
 bool have_governor_per_policy(void);
-- 
cgit v1.3-7-g2ca7


From 2361be23666232dbb4851a527f466c4cbf5340fc Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 17 May 2013 16:09:09 +0530
Subject: cpufreq: Don't create empty /sys/devices/system/cpu/cpufreq directory

When we don't have any file in cpu/cpufreq directory we shouldn't
create it. Specially with the introduction of per-policy governor
instance patchset, even governors are moved to
cpu/cpu*/cpufreq/governor-name directory and so this directory is
just not required.

Lets have it only when required.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/acpi-cpufreq.c     |  4 ++--
 drivers/cpufreq/cpufreq.c          | 48 ++++++++++++++++++++++++++++++++++----
 drivers/cpufreq/cpufreq_governor.c |  6 +++++
 include/linux/cpufreq.h            |  4 ++++
 4 files changed, 56 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 11b8b4b54ceb..8c02622b35a4 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -947,7 +947,7 @@ static void __init acpi_cpufreq_boost_init(void)
 	/* We create the boost file in any case, though for systems without
 	 * hardware support it will be read-only and hardwired to return 0.
 	 */
-	if (sysfs_create_file(cpufreq_global_kobject, &(global_boost.attr)))
+	if (cpufreq_sysfs_create_file(&(global_boost.attr)))
 		pr_warn(PFX "could not register global boost sysfs file\n");
 	else
 		pr_debug("registered global boost sysfs file\n");
@@ -955,7 +955,7 @@ static void __init acpi_cpufreq_boost_init(void)
 
 static void __exit acpi_cpufreq_boost_exit(void)
 {
-	sysfs_remove_file(cpufreq_global_kobject, &(global_boost.attr));
+	cpufreq_sysfs_remove_file(&(global_boost.attr));
 
 	if (msrs) {
 		unregister_cpu_notifier(&boost_nb);
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index c6ab21880c07..ce9273a7b4e3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -678,9 +678,6 @@ static struct attribute *default_attrs[] = {
 	NULL
 };
 
-struct kobject *cpufreq_global_kobject;
-EXPORT_SYMBOL(cpufreq_global_kobject);
-
 #define to_policy(k) container_of(k, struct cpufreq_policy, kobj)
 #define to_attr(a) container_of(a, struct freq_attr, attr)
 
@@ -751,6 +748,49 @@ static struct kobj_type ktype_cpufreq = {
 	.release	= cpufreq_sysfs_release,
 };
 
+struct kobject *cpufreq_global_kobject;
+EXPORT_SYMBOL(cpufreq_global_kobject);
+
+static int cpufreq_global_kobject_usage;
+
+int cpufreq_get_global_kobject(void)
+{
+	if (!cpufreq_global_kobject_usage++)
+		return kobject_add(cpufreq_global_kobject,
+				&cpu_subsys.dev_root->kobj, "%s", "cpufreq");
+
+	return 0;
+}
+EXPORT_SYMBOL(cpufreq_get_global_kobject);
+
+void cpufreq_put_global_kobject(void)
+{
+	if (!--cpufreq_global_kobject_usage)
+		kobject_del(cpufreq_global_kobject);
+}
+EXPORT_SYMBOL(cpufreq_put_global_kobject);
+
+int cpufreq_sysfs_create_file(const struct attribute *attr)
+{
+	int ret = cpufreq_get_global_kobject();
+
+	if (!ret) {
+		ret = sysfs_create_file(cpufreq_global_kobject, attr);
+		if (ret)
+			cpufreq_put_global_kobject();
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(cpufreq_sysfs_create_file);
+
+void cpufreq_sysfs_remove_file(const struct attribute *attr)
+{
+	sysfs_remove_file(cpufreq_global_kobject, attr);
+	cpufreq_put_global_kobject();
+}
+EXPORT_SYMBOL(cpufreq_sysfs_remove_file);
+
 /* symlink affected CPUs */
 static int cpufreq_add_dev_symlink(unsigned int cpu,
 				   struct cpufreq_policy *policy)
@@ -2020,7 +2060,7 @@ static int __init cpufreq_core_init(void)
 		init_rwsem(&per_cpu(cpu_policy_rwsem, cpu));
 	}
 
-	cpufreq_global_kobject = kobject_create_and_add("cpufreq", &cpu_subsys.dev_root->kobj);
+	cpufreq_global_kobject = kobject_create();
 	BUG_ON(!cpufreq_global_kobject);
 	register_syscore_ops(&cpufreq_syscore_ops);
 
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index b6cfd55d8266..7532570c42b4 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -231,6 +231,9 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 			return rc;
 		}
 
+		if (!have_governor_per_policy())
+			WARN_ON(cpufreq_get_global_kobject());
+
 		rc = sysfs_create_group(get_governor_parent_kobj(policy),
 				get_sysfs_attr(dbs_data));
 		if (rc) {
@@ -269,6 +272,9 @@ int cpufreq_governor_dbs(struct cpufreq_policy *policy,
 			sysfs_remove_group(get_governor_parent_kobj(policy),
 					get_sysfs_attr(dbs_data));
 
+			if (!have_governor_per_policy())
+				cpufreq_put_global_kobject();
+
 			if ((dbs_data->cdata->governor == GOV_CONSERVATIVE) &&
 				(policy->governor->initialized == 1)) {
 				struct cs_ops *cs_ops = dbs_data->cdata->gov_ops;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index fbf392aaa02e..1b5b5efa3e3a 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -71,6 +71,10 @@ struct cpufreq_governor;
 
 /* /sys/devices/system/cpu/cpufreq: entry point for global variables */
 extern struct kobject *cpufreq_global_kobject;
+int cpufreq_get_global_kobject(void);
+void cpufreq_put_global_kobject(void);
+int cpufreq_sysfs_create_file(const struct attribute *attr);
+void cpufreq_sysfs_remove_file(const struct attribute *attr);
 
 #define CPUFREQ_ETERNAL			(-1)
 struct cpufreq_cpuinfo {
-- 
cgit v1.3-7-g2ca7


From fe830ef62ac6d8814e27b7e2f632848694b0e5c7 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Sat, 25 May 2013 21:48:29 +0800
Subject: PCI: Introduce pci_bus_{get|put}() to manage PCI bus reference count

Introduce helper functions pci_bus_{get|put}() to manage PCI bus
reference count.

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/bus.c   | 15 +++++++++++++++
 include/linux/pci.h |  2 ++
 2 files changed, 17 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 32e66a6f12d9..b1ff02ab4f13 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -283,6 +283,21 @@ void pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void *),
 }
 EXPORT_SYMBOL_GPL(pci_walk_bus);
 
+struct pci_bus *pci_bus_get(struct pci_bus *bus)
+{
+	if (bus)
+		get_device(&bus->dev);
+	return bus;
+}
+EXPORT_SYMBOL(pci_bus_get);
+
+void pci_bus_put(struct pci_bus *bus)
+{
+	if (bus)
+		put_device(&bus->dev);
+}
+EXPORT_SYMBOL(pci_bus_put);
+
 EXPORT_SYMBOL(pci_bus_alloc_resource);
 EXPORT_SYMBOL_GPL(pci_bus_add_device);
 EXPORT_SYMBOL(pci_bus_add_devices);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3a24e4ff3248..7556c590ddfd 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1018,6 +1018,8 @@ int pci_request_selected_regions_exclusive(struct pci_dev *, int, const char *);
 void pci_release_selected_regions(struct pci_dev *, int);
 
 /* drivers/pci/bus.c */
+struct pci_bus *pci_bus_get(struct pci_bus *bus);
+void pci_bus_put(struct pci_bus *bus);
 void pci_add_resource(struct list_head *resources, struct resource *res);
 void pci_add_resource_offset(struct list_head *resources, struct resource *res,
 			     resource_size_t offset);
-- 
cgit v1.3-7-g2ca7


From 3c6e6ae770f338ef3e54c5823c21063204f53537 Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Sat, 25 May 2013 21:48:30 +0800
Subject: PCI: Introduce pci_alloc_dev(struct pci_bus*) to replace
 alloc_pci_dev()

Here we introduce a new interface to replace alloc_pci_dev():

    struct pci_dev *pci_alloc_dev(struct pci_bus *bus)

It takes a "struct pci_bus *" argument, so we can alloc a PCI device
on a target PCI bus, and it acquires a reference on the pci_bus.
We use pci_alloc_dev(NULL) to simplify the old alloc_pci_dev(),
and keep it for a while but mark it as __deprecated.

Holding a reference to the pci_bus ensures that referencing
pci_dev->bus is valid as long as the pci_dev is valid.

[bhelgaas: keep existing "return error early" structure in pci_alloc_dev()]
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/probe.c | 9 ++++++++-
 include/linux/pci.h | 3 ++-
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 70f10fa3c1b2..d47ce1400c26 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1200,7 +1200,7 @@ static void pci_release_bus_bridge_dev(struct device *dev)
 	kfree(bridge);
 }
 
-struct pci_dev *alloc_pci_dev(void)
+struct pci_dev *pci_alloc_dev(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
 
@@ -1210,9 +1210,16 @@ struct pci_dev *alloc_pci_dev(void)
 
 	INIT_LIST_HEAD(&dev->bus_list);
 	dev->dev.type = &pci_dev_type;
+	dev->bus = pci_bus_get(bus);
 
 	return dev;
 }
+EXPORT_SYMBOL(pci_alloc_dev);
+
+struct pci_dev *alloc_pci_dev(void)
+{
+	return pci_alloc_dev(NULL);
+}
 EXPORT_SYMBOL(alloc_pci_dev);
 
 bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *l,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7556c590ddfd..b0f4a8264118 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -364,7 +364,8 @@ static inline struct pci_dev *pci_physfn(struct pci_dev *dev)
 	return dev;
 }
 
-struct pci_dev *alloc_pci_dev(void);
+struct pci_dev *pci_alloc_dev(struct pci_bus *bus);
+struct pci_dev * __deprecated alloc_pci_dev(void);
 
 #define	to_pci_dev(n) container_of(n, struct pci_dev, dev)
 #define for_each_pci_dev(d) while ((d = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, d)) != NULL)
-- 
cgit v1.3-7-g2ca7


From ab573844e3058eef2788803d373019f8bebead57 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Wed, 1 May 2013 17:25:44 +0200
Subject: perf: Fix hw breakpoints overflow period sampling

The hw breakpoint pmu 'add' function is missing the
period_left update needed for SW events.

The perf HW breakpoint events use the SW events framework
to process the overflow, so it needs to be properly initialized
in the PMU 'add' method.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Reviewed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1367421944-19082-5-git-send-email-jolsa@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h    | 2 ++
 kernel/events/core.c          | 2 +-
 kernel/events/hw_breakpoint.c | 5 +++++
 3 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f463a46424e2..fa38612d70b6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -743,6 +743,7 @@ extern unsigned int perf_output_skip(struct perf_output_handle *handle,
 				     unsigned int len);
 extern int perf_swevent_get_recursion_context(void);
 extern void perf_swevent_put_recursion_context(int rctx);
+extern u64 perf_swevent_set_period(struct perf_event *event);
 extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
 extern int __perf_event_disable(void *info);
@@ -782,6 +783,7 @@ static inline void perf_event_fork(struct task_struct *tsk)		{ }
 static inline void perf_event_init(void)				{ }
 static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
 static inline void perf_swevent_put_recursion_context(int rctx)		{ }
+static inline u64 perf_swevent_set_period(struct perf_event *event)	{ return 0; }
 static inline void perf_event_enable(struct perf_event *event)		{ }
 static inline void perf_event_disable(struct perf_event *event)		{ }
 static inline int __perf_event_disable(void *info)			{ return -1; }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..e0dcced282e4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4961,7 +4961,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
  * sign as trigger.
  */
 
-static u64 perf_swevent_set_period(struct perf_event *event)
+u64 perf_swevent_set_period(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	u64 period = hwc->last_period;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a64f8aeb5c1f..966a241e8616 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -612,6 +612,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
 	if (!(flags & PERF_EF_START))
 		bp->hw.state = PERF_HES_STOPPED;
 
+	if (is_sampling_event(bp)) {
+		bp->hw.last_period = bp->hw.sample_period;
+		perf_swevent_set_period(bp);
+	}
+
 	return arch_install_hw_breakpoint(bp);
 }
 
-- 
cgit v1.3-7-g2ca7


From 9e6302056f8029f438e853432a856b9f13de26a6 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 3 Apr 2013 14:21:33 +0200
Subject: perf: Use hrtimers for event multiplexing

The current scheme of using the timer tick was fine for per-thread
events. However, it was causing bias issues in system-wide mode
(including for uncore PMUs). Event groups would not get their fair
share of runtime on the PMU. With tickless kernels, if a core is idle
there is no timer tick, and thus no event rotation (multiplexing).
However, there are events (especially uncore events) which do count
even though cores are asleep.

This patch changes the timer source for multiplexing.  It introduces a
per-PMU per-cpu hrtimer. The advantage is that even when a core goes
idle, it will come back to service the hrtimer, thus multiplexing on
system-wide events works much better.

The per-PMU implementation (suggested by PeterZ) enables adjusting the
multiplexing interval per PMU. The preferred interval is stashed into
the struct pmu. If not set, it will be forced to the default interval
value.

In order to minimize the impact of the hrtimer, it is turned on and
off on demand. When the PMU on a CPU is overcommited, the hrtimer is
activated.  It is stopped when the PMU is not overcommitted.

In order for this to work properly, we had to change the order of
initialization in start_kernel() such that hrtimer_init() is run
before perf_event_init().

The default interval in milliseconds is set to a timer tick just like
with the old code. We will provide a sysctl to tune this in another
patch.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Link: http://lkml.kernel.org/r/1364991694-5876-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |   3 +-
 init/main.c                |   2 +-
 kernel/events/core.c       | 114 +++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 109 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fa38612d70b6..72138d75a60a 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -501,8 +501,9 @@ struct perf_cpu_context {
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
 	int				exclusive;
+	struct hrtimer			hrtimer;
+	ktime_t				hrtimer_interval;
 	struct list_head		rotation_list;
-	int				jiffies_interval;
 	struct pmu			*unique_pmu;
 	struct perf_cgroup		*cgrp;
 };
diff --git a/init/main.c b/init/main.c
index 9484f4ba88d0..ec549581d732 100644
--- a/init/main.c
+++ b/init/main.c
@@ -542,7 +542,6 @@ asmlinkage void __init start_kernel(void)
 	if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
 		local_irq_disable();
 	idr_init_cache();
-	perf_event_init();
 	rcu_init();
 	tick_nohz_init();
 	radix_tree_init();
@@ -555,6 +554,7 @@ asmlinkage void __init start_kernel(void)
 	softirq_init();
 	timekeeping_init();
 	time_init();
+	perf_event_init();
 	profile_init();
 	call_function_init();
 	WARN(!irqs_disabled(), "Interrupts were enabled early\n");
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e0dcced282e4..97bfac7e6f45 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -170,6 +170,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly =
 	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+
 int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -658,6 +660,98 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disbled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+	struct perf_cpu_context *cpuctx;
+	enum hrtimer_restart ret = HRTIMER_NORESTART;
+	int rotations = 0;
+
+	WARN_ON(!irqs_disabled());
+
+	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+	rotations = perf_rotate_context(cpuctx);
+
+	/*
+	 * arm timer if needed
+	 */
+	if (rotations) {
+		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+		ret = HRTIMER_RESTART;
+	}
+
+	return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	if (WARN_ON(cpu != smp_processor_id()))
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		if (pmu->task_ctx_nr == perf_sw_context)
+			continue;
+
+		hrtimer_cancel(&cpuctx->hrtimer);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct pmu *pmu = cpuctx->ctx.pmu;
+
+	/* no multiplexing needed for SW PMU */
+	if (pmu->task_ctx_nr == perf_sw_context)
+		return;
+
+	cpuctx->hrtimer_interval =
+		ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER);
+
+	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct pmu *pmu = cpuctx->ctx.pmu;
+
+	/* not for SW PMU */
+	if (pmu->task_ctx_nr == perf_sw_context)
+		return;
+
+	if (hrtimer_active(hr))
+		return;
+
+	if (!hrtimer_callback_running(hr))
+		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+					 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1506,6 +1600,7 @@ group_sched_in(struct perf_event *group_event,
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
+		perf_cpu_hrtimer_restart(cpuctx);
 		return -EAGAIN;
 	}
 
@@ -1552,6 +1647,8 @@ group_error:
 
 	pmu->cancel_txn(pmu);
 
+	perf_cpu_hrtimer_restart(cpuctx);
+
 	return -EAGAIN;
 }
 
@@ -1807,8 +1904,10 @@ static int __perf_event_enable(void *info)
 		 * If this event can't go on and it's part of a
 		 * group, then the whole group has to come off.
 		 */
-		if (leader != event)
+		if (leader != event) {
 			group_sched_out(leader, cpuctx, ctx);
+			perf_cpu_hrtimer_restart(cpuctx);
+		}
 		if (leader->attr.pinned) {
 			update_group_times(leader);
 			leader->state = PERF_EVENT_STATE_ERROR;
@@ -2555,7 +2654,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
  * because they're strictly cpu affine and rotate_start is called with IRQs
  * disabled, while rotate_context is called from IRQ context.
  */
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
 	struct perf_event_context *ctx = NULL;
 	int rotate = 0, remove = 1;
@@ -2594,6 +2693,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 done:
 	if (remove)
 		list_del_init(&cpuctx->rotation_list);
+
+	return rotate;
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -2625,10 +2726,6 @@ void perf_event_task_tick(void)
 		ctx = cpuctx->task_ctx;
 		if (ctx)
 			perf_adjust_freq_unthr_context(ctx, throttled);
-
-		if (cpuctx->jiffies_interval == 1 ||
-				!(jiffies % cpuctx->jiffies_interval))
-			perf_rotate_context(cpuctx);
 	}
 }
 
@@ -6001,7 +6098,9 @@ skip_type:
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.type = cpu_context;
 		cpuctx->ctx.pmu = pmu;
-		cpuctx->jiffies_interval = 1;
+
+		__perf_cpu_hrtimer_init(cpuctx, cpu);
+
 		INIT_LIST_HEAD(&cpuctx->rotation_list);
 		cpuctx->unique_pmu = pmu;
 	}
@@ -7387,7 +7486,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE:
 		perf_event_exit_cpu(cpu);
 		break;
-
 	default:
 		break;
 	}
-- 
cgit v1.3-7-g2ca7


From 62b8563979273424d6ebe9201e34d1acc133ad4f Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Wed, 3 Apr 2013 14:21:34 +0200
Subject: perf: Add sysfs entry to adjust multiplexing interval per PMU

This patch adds /sys/device/xxx/perf_event_mux_interval_ms to ajust
the multiplexing interval per PMU. The unit is milliseconds. Value has
to be >= 1.

In the 4th version, we renamed the sysfs file to be more consistent
with the other /proc/sys/kernel entries for perf_events.

In the 5th version, we handle the reprogramming of the hrtimer using
hrtimer_forward_now(). That way, we sync up to new timer value quickly
(suggested by Jiri Olsa).

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Link: http://lkml.kernel.org/r/1364991694-5876-3-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/perf_event.h |  1 +
 kernel/events/core.c       | 63 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 60 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 72138d75a60a..6fddac1b27cb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -194,6 +194,7 @@ struct pmu {
 	int * __percpu			pmu_disable_count;
 	struct perf_cpu_context * __percpu pmu_cpu_context;
 	int				task_ctx_nr;
+	int				hrtimer_interval_ms;
 
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 97bfac7e6f45..53d1b300116a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -723,13 +723,21 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 {
 	struct hrtimer *hr = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
+	int timer;
 
 	/* no multiplexing needed for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
 		return;
 
-	cpuctx->hrtimer_interval =
-		ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER);
+	/*
+	 * check default is sane, if not set then force to
+	 * default interval (1/tick)
+	 */
+	timer = pmu->hrtimer_interval_ms;
+	if (timer < 1)
+		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 
 	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
 	hr->function = perf_cpu_hrtimer_handler;
@@ -6001,9 +6009,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
 	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 }
 
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+				struct device_attribute *attr,
+				char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	int timer, cpu, ret;
+
+	ret = kstrtoint(buf, 0, &timer);
+	if (ret)
+		return ret;
+
+	if (timer < 1)
+		return -EINVAL;
+
+	/* same value, noting to do */
+	if (timer == pmu->hrtimer_interval_ms)
+		return count;
+
+	pmu->hrtimer_interval_ms = timer;
+
+	/* update all cpuctx for this PMU */
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+		if (hrtimer_active(&cpuctx->hrtimer))
+			hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+	}
+
+	return count;
+}
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
 static struct device_attribute pmu_dev_attrs[] = {
-       __ATTR_RO(type),
-       __ATTR_NULL,
+	__ATTR_RO(type),
+	__ATTR_RW(perf_event_mux_interval_ms),
+	__ATTR_NULL,
 };
 
 static int pmu_bus_running;
-- 
cgit v1.3-7-g2ca7


From 114276ac0a3beb9c391a410349bd770653e185ce Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 26 May 2013 17:32:13 +0300
Subject: mm, sched: Drop voluntary schedule from might_fault()

might_fault() is called from functions like copy_to_user()
which most callers expect to be very fast, like a couple of
instructions.

So functions like memcpy_toiovec() call them many times in a loop.

But might_fault() calls might_sleep() and with CONFIG_PREEMPT_VOLUNTARY
this results in a function call.

Let's not do this - just call __might_sleep() that produces
a diagnostic for sleep within atomic, but drop
might_preempt().

Here's a test sending traffic between the VM and the host,
host is built with CONFIG_PREEMPT_VOLUNTARY:

 before:
	incoming: 7122.77   Mb/s
	outgoing: 8480.37   Mb/s

 after:
	incoming: 8619.24   Mb/s
	outgoing: 9455.42   Mb/s

As a side effect, this fixes an issue pointed
out by Ingo: might_fault might schedule differently
depending on PROVE_LOCKING. Now there's no
preemption point in both cases, so it's consistent.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1369577426-26721-10-git-send-email-mst@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kernel.h | 2 +-
 mm/memory.c            | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e9ef6d6b51d5..24719eaa1209 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -198,7 +198,7 @@ void might_fault(void);
 #else
 static inline void might_fault(void)
 {
-	might_sleep();
+	__might_sleep(__FILE__, __LINE__, 0);
 }
 #endif
 
diff --git a/mm/memory.c b/mm/memory.c
index 6dc1882fbd72..c1f190f51f6f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4222,7 +4222,8 @@ void might_fault(void)
 	if (segment_eq(get_fs(), KERNEL_DS))
 		return;
 
-	might_sleep();
+	__might_sleep(__FILE__, __LINE__, 0);
+
 	/*
 	 * it would be nicer only to annotate paths which are not under
 	 * pagefault_disable, however that requires a larger audit and
-- 
cgit v1.3-7-g2ca7


From 662bbcb2747c2422cf98d3d97619509379eee466 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 26 May 2013 17:32:23 +0300
Subject: mm, sched: Allow uaccess in atomic with pagefault_disable()

This changes might_fault() so that it does not
trigger a false positive diagnostic for e.g. the following
sequence:

	spin_lock_irqsave()
	pagefault_disable()
	copy_to_user()
	pagefault_enable()
	spin_unlock_irqrestore()

In particular vhost wants to do this, to call
socket ops from under a lock.

There are 3 cases to consider:

 - CONFIG_PROVE_LOCKING - might_fault is non-inline
   so it's easy to move the in_atomic test to fix
   up the false positive warning.

 - CONFIG_DEBUG_ATOMIC_SLEEP - might_fault
   is currently inline, but we are calling a
   non-inline __might_sleep anyway,
   so let's use the non-line version of might_fault
   that does the right thing.

 - !CONFIG_DEBUG_ATOMIC_SLEEP && !CONFIG_PROVE_LOCKING
   __might_sleep is a nop so might_fault is a nop.

Make this explicit.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1369577426-26721-11-git-send-email-mst@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/kernel.h |  7 ++-----
 mm/memory.c            | 11 +++++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 24719eaa1209..4c7e2e508766 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -193,13 +193,10 @@ extern int _cond_resched(void);
 		(__x < 0) ? -__x : __x;		\
 	})
 
-#ifdef CONFIG_PROVE_LOCKING
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
 void might_fault(void);
 #else
-static inline void might_fault(void)
-{
-	__might_sleep(__FILE__, __LINE__, 0);
-}
+static inline void might_fault(void) { }
 #endif
 
 extern struct atomic_notifier_head panic_notifier_list;
diff --git a/mm/memory.c b/mm/memory.c
index c1f190f51f6f..d7d54a114773 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4210,7 +4210,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 	up_read(&mm->mmap_sem);
 }
 
-#ifdef CONFIG_PROVE_LOCKING
+#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
 void might_fault(void)
 {
 	/*
@@ -4222,14 +4222,17 @@ void might_fault(void)
 	if (segment_eq(get_fs(), KERNEL_DS))
 		return;
 
-	__might_sleep(__FILE__, __LINE__, 0);
-
 	/*
 	 * it would be nicer only to annotate paths which are not under
 	 * pagefault_disable, however that requires a larger audit and
 	 * providing helpers like get_user_atomic.
 	 */
-	if (!in_atomic() && current->mm)
+	if (in_atomic())
+		return;
+
+	__might_sleep(__FILE__, __LINE__, 0);
+
+	if (current->mm)
 		might_lock_read(&current->mm->mmap_sem);
 }
 EXPORT_SYMBOL(might_fault);
-- 
cgit v1.3-7-g2ca7


From 65f6ae66a6fb44f614a1226c398fcb38e94b3c59 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Mon, 13 May 2013 11:05:48 +0200
Subject: PCI: Allocate only as many MSI vectors as requested by driver

Because of the encoding of the "Multiple Message Capable" and "Multiple
Message Enable" fields, a device can only advertise that it's capable of a
power-of-two number of vectors, and the OS can only enable a power-of-two
number.

For example, a device that's limited internally to using 18 vectors would
have to advertise that it's capable of 32.  The 14 extra vectors consume
vector numbers and IRQ descriptors even though the device can't actually
use them.

This fix introduces a 'msi_desc::nvec_used' field to address this issue.
When non-zero, it is the actual number of MSIs the device will send, as
requested by the device driver.  This value should be used by architectures
to set up and tear down only as many interrupt resources as the device will
actually use.

Note, although the existing 'msi_desc::multiple' field might seem
redundant, in fact it is not.  The number of MSIs advertised need not be
the smallest power-of-two larger than the number of MSIs the device will
send.  Thus, it is not always possible to derive the former from the
latter, so we need to keep them both to handle this case.

[bhelgaas: changelog, rename to "nvec_used"]
Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/msi.c   | 10 ++++++++--
 include/linux/msi.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 2c1075213bec..aca7578b05e5 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -81,7 +81,10 @@ void default_teardown_msi_irqs(struct pci_dev *dev)
 		int i, nvec;
 		if (entry->irq == 0)
 			continue;
-		nvec = 1 << entry->msi_attrib.multiple;
+		if (entry->nvec_used)
+			nvec = entry->nvec_used;
+		else
+			nvec = 1 << entry->msi_attrib.multiple;
 		for (i = 0; i < nvec; i++)
 			arch_teardown_msi_irq(entry->irq + i);
 	}
@@ -336,7 +339,10 @@ static void free_msi_irqs(struct pci_dev *dev)
 		int i, nvec;
 		if (!entry->irq)
 			continue;
-		nvec = 1 << entry->msi_attrib.multiple;
+		if (entry->nvec_used)
+			nvec = entry->nvec_used;
+		else
+			nvec = 1 << entry->msi_attrib.multiple;
 #ifdef CONFIG_GENERIC_HARDIRQS
 		for (i = 0; i < nvec; i++)
 			BUG_ON(irq_has_action(entry->irq + i));
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 20c2d6dd5d25..ee66f3a12fb6 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -35,6 +35,7 @@ struct msi_desc {
 
 	u32 masked;			/* mask bits */
 	unsigned int irq;
+	unsigned int nvec_used;		/* number of messages */
 	struct list_head list;
 
 	union {
-- 
cgit v1.3-7-g2ca7


From 1a0483d2a4c2c5e218d415c90d1a62b3b917d34e Mon Sep 17 00:00:00 2001
From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Fri, 3 May 2013 07:33:27 +0200
Subject: clk: si5351: Allow user to define disabled state for every clock
 output

This patch adds platform data and DT bindings to allow to overwrite
the stored disabled state for each clock output.

Signed-off-by: Marek Belisko <marek.belisko@streamunlimited.com>
Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 .../devicetree/bindings/clock/silabs,si5351.txt    |  5 ++
 drivers/clk/clk-si5351.c                           | 74 +++++++++++++++++++++-
 drivers/clk/clk-si5351.h                           |  1 +
 include/linux/platform_data/si5351.h               | 18 ++++++
 4 files changed, 95 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/clock/silabs,si5351.txt b/Documentation/devicetree/bindings/clock/silabs,si5351.txt
index cc374651662c..66c75b2d6158 100644
--- a/Documentation/devicetree/bindings/clock/silabs,si5351.txt
+++ b/Documentation/devicetree/bindings/clock/silabs,si5351.txt
@@ -44,6 +44,11 @@ Optional child node properties:
 - silabs,multisynth-source: source pll A(0) or B(1) of corresponding multisynth
   divider.
 - silabs,pll-master: boolean, multisynth can change pll frequency.
+- silabs,disable-state : clock output disable state, shall be
+  0 = clock output is driven LOW when disabled
+  1 = clock output is driven HIGH when disabled
+  2 = clock output is FLOATING (HIGH-Z) when disabled
+  3 = clock output is NEVER disabled
 
 ==Example==
 
diff --git a/drivers/clk/clk-si5351.c b/drivers/clk/clk-si5351.c
index 892728412e9d..efc6d5e9268b 100644
--- a/drivers/clk/clk-si5351.c
+++ b/drivers/clk/clk-si5351.c
@@ -851,6 +851,41 @@ static int _si5351_clkout_set_drive_strength(
 	return 0;
 }
 
+static int _si5351_clkout_set_disable_state(
+	struct si5351_driver_data *drvdata, int num,
+	enum si5351_disable_state state)
+{
+	u8 reg = (num < 4) ? SI5351_CLK3_0_DISABLE_STATE :
+		SI5351_CLK7_4_DISABLE_STATE;
+	u8 shift = (num < 4) ? (2 * num) : (2 * (num-4));
+	u8 mask = SI5351_CLK_DISABLE_STATE_MASK << shift;
+	u8 val;
+
+	if (num > 8)
+		return -EINVAL;
+
+	switch (state) {
+	case SI5351_DISABLE_LOW:
+		val = SI5351_CLK_DISABLE_STATE_LOW;
+		break;
+	case SI5351_DISABLE_HIGH:
+		val = SI5351_CLK_DISABLE_STATE_HIGH;
+		break;
+	case SI5351_DISABLE_FLOATING:
+		val = SI5351_CLK_DISABLE_STATE_FLOAT;
+		break;
+	case SI5351_DISABLE_NEVER:
+		val = SI5351_CLK_DISABLE_STATE_NEVER;
+		break;
+	default:
+		return 0;
+	}
+
+	si5351_set_bits(drvdata, reg, mask, val << shift);
+
+	return 0;
+}
+
 static int si5351_clkout_prepare(struct clk_hw *hw)
 {
 	struct si5351_hw_data *hwdata =
@@ -1225,6 +1260,33 @@ static int si5351_dt_parse(struct i2c_client *client)
 			}
 		}
 
+		if (!of_property_read_u32(child, "silabs,disable-state",
+					  &val)) {
+			switch (val) {
+			case 0:
+				pdata->clkout[num].disable_state =
+					SI5351_DISABLE_LOW;
+				break;
+			case 1:
+				pdata->clkout[num].disable_state =
+					SI5351_DISABLE_HIGH;
+				break;
+			case 2:
+				pdata->clkout[num].disable_state =
+					SI5351_DISABLE_FLOATING;
+				break;
+			case 3:
+				pdata->clkout[num].disable_state =
+					SI5351_DISABLE_NEVER;
+				break;
+			default:
+				dev_err(&client->dev,
+					"invalid disable state %d for clkout %d\n",
+					val, num);
+				return -EINVAL;
+			}
+		}
+
 		if (!of_property_read_u32(child, "clock-frequency", &val))
 			pdata->clkout[num].rate = val;
 
@@ -1281,9 +1343,6 @@ static int si5351_i2c_probe(struct i2c_client *client,
 
 	/* Disable interrupts */
 	si5351_reg_write(drvdata, SI5351_INTERRUPT_MASK, 0xf0);
-	/* Set disabled output drivers to drive low */
-	si5351_reg_write(drvdata, SI5351_CLK3_0_DISABLE_STATE, 0x00);
-	si5351_reg_write(drvdata, SI5351_CLK7_4_DISABLE_STATE, 0x00);
 	/* Ensure pll select is on XTAL for Si5351A/B */
 	if (drvdata->variant != SI5351_VARIANT_C)
 		si5351_set_bits(drvdata, SI5351_PLL_INPUT_SOURCE,
@@ -1327,6 +1386,15 @@ static int si5351_i2c_probe(struct i2c_client *client,
 				n, pdata->clkout[n].drive);
 			return ret;
 		}
+
+		ret = _si5351_clkout_set_disable_state(drvdata, n,
+						pdata->clkout[n].disable_state);
+		if (ret) {
+			dev_err(&client->dev,
+				"failed set disable state of clkout%d to %d\n",
+				n, pdata->clkout[n].disable_state);
+			return ret;
+		}
 	}
 
 	/* register xtal input clock gate */
diff --git a/drivers/clk/clk-si5351.h b/drivers/clk/clk-si5351.h
index af41b5080f43..c0dbf2676872 100644
--- a/drivers/clk/clk-si5351.h
+++ b/drivers/clk/clk-si5351.h
@@ -81,6 +81,7 @@
 
 #define SI5351_CLK3_0_DISABLE_STATE		24
 #define SI5351_CLK7_4_DISABLE_STATE		25
+#define  SI5351_CLK_DISABLE_STATE_MASK		3
 #define  SI5351_CLK_DISABLE_STATE_LOW		0
 #define  SI5351_CLK_DISABLE_STATE_HIGH		1
 #define  SI5351_CLK_DISABLE_STATE_FLOAT		2
diff --git a/include/linux/platform_data/si5351.h b/include/linux/platform_data/si5351.h
index 92dabcaf6499..54334393ab92 100644
--- a/include/linux/platform_data/si5351.h
+++ b/include/linux/platform_data/si5351.h
@@ -78,6 +78,23 @@ enum si5351_drive_strength {
 	SI5351_DRIVE_8MA = 8,
 };
 
+/**
+ * enum si5351_disable_state - Si5351 clock output disable state
+ * @SI5351_DISABLE_DEFAULT: default, do not change eeprom config
+ * @SI5351_DISABLE_LOW: CLKx is set to a LOW state when disabled
+ * @SI5351_DISABLE_HIGH: CLKx is set to a HIGH state when disabled
+ * @SI5351_DISABLE_FLOATING: CLKx is set to a FLOATING state when
+ *				disabled
+ * @SI5351_DISABLE_NEVER: CLKx is NEVER disabled
+ */
+enum si5351_disable_state {
+	SI5351_DISABLE_DEFAULT = 0,
+	SI5351_DISABLE_LOW,
+	SI5351_DISABLE_HIGH,
+	SI5351_DISABLE_FLOATING,
+	SI5351_DISABLE_NEVER,
+};
+
 /**
  * struct si5351_clkout_config - Si5351 clock output configuration
  * @clkout: clkout number
@@ -91,6 +108,7 @@ struct si5351_clkout_config {
 	enum si5351_multisynth_src multisynth_src;
 	enum si5351_clkout_src clkout_src;
 	enum si5351_drive_strength drive;
+	enum si5351_disable_state disable_state;
 	bool pll_master;
 	unsigned long rate;
 };
-- 
cgit v1.3-7-g2ca7


From 0b151debc31df089ddc07a3343031d8f51f988a3 Mon Sep 17 00:00:00 2001
From: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Date: Wed, 1 May 2013 02:58:28 +0200
Subject: clk: add non CONFIG_OF routines for clk-provider

Some drivers that are shared between architectures have HAVE_CLK selected
but don't have OF. To remove compilation errors for drivers that provide
clocks on DT with of_clk_add_provider we would have to enclose these calls
within #ifdef CONFIG_OF, #endif.

This patch adds some stubs for OF related clk-provider functions that
either do nothing or return appropriate values if CONFIG_OF is not set.
So, definition of these routines will always be available.

Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 include/linux/clk-provider.h | 47 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 11860985fecb..265f384f1e01 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -423,6 +423,17 @@ struct of_device_id;
 
 typedef void (*of_clk_init_cb_t)(struct device_node *);
 
+struct clk_onecell_data {
+	struct clk **clks;
+	unsigned int clk_num;
+};
+
+#define CLK_OF_DECLARE(name, compat, fn)			\
+	static const struct of_device_id __clk_of_table_##name	\
+		__used __section(__clk_of_table)		\
+		= { .compatible = compat, .data = fn };
+
+#ifdef CONFIG_OF
 int of_clk_add_provider(struct device_node *np,
 			struct clk *(*clk_src_get)(struct of_phandle_args *args,
 						   void *data),
@@ -430,19 +441,39 @@ int of_clk_add_provider(struct device_node *np,
 void of_clk_del_provider(struct device_node *np);
 struct clk *of_clk_src_simple_get(struct of_phandle_args *clkspec,
 				  void *data);
-struct clk_onecell_data {
-	struct clk **clks;
-	unsigned int clk_num;
-};
 struct clk *of_clk_src_onecell_get(struct of_phandle_args *clkspec, void *data);
 const char *of_clk_get_parent_name(struct device_node *np, int index);
 
 void of_clk_init(const struct of_device_id *matches);
 
-#define CLK_OF_DECLARE(name, compat, fn)			\
-	static const struct of_device_id __clk_of_table_##name	\
-		__used __section(__clk_of_table)		\
-		= { .compatible = compat, .data = fn };
+#else /* !CONFIG_OF */
 
+static inline int of_clk_add_provider(struct device_node *np,
+			struct clk *(*clk_src_get)(struct of_phandle_args *args,
+						   void *data),
+			void *data)
+{
+	return 0;
+}
+#define of_clk_del_provider(np) \
+	{ while (0); }
+static inline struct clk *of_clk_src_simple_get(
+	struct of_phandle_args *clkspec, void *data)
+{
+	return ERR_PTR(-ENOENT);
+}
+static inline struct clk *of_clk_src_onecell_get(
+	struct of_phandle_args *clkspec, void *data)
+{
+	return ERR_PTR(-ENOENT);
+}
+static inline const char *of_clk_get_parent_name(struct device_node *np,
+						 int index)
+{
+	return NULL;
+}
+#define of_clk_init(matches) \
+	{ while (0); }
+#endif /* CONFIG_OF */
 #endif /* CONFIG_COMMON_CLK */
 #endif /* CLK_PROVIDER_H */
-- 
cgit v1.3-7-g2ca7


From 899f0e66fff36ebb6dd6a83af9aa631f6cb7e0dc Mon Sep 17 00:00:00 2001
From: Gerlando Falauto <gerlando.falauto@keymile.com>
Date: Mon, 6 May 2013 14:30:19 +0000
Subject: genirq: Generic chip: Add support for per chip type mask cache

Today the same interrupt mask cache (stored within struct irq_chip_generic)
is shared between all the irq_chip_type instances. As there are instances
where each irq_chip_type uses a distinct mask register (as it is the case
for Orion SoCs), sharing a single mask cache may be incorrect.
So add a distinct pointer for each irq_chip_type, which for now
points to the original mask register within irq_chip_generic.
So no functional changes here.

[ tglx: Minor cosmetic tweaks ]

Reported-by: Joey Oravec <joravec@drewtech.com>
Signed-off-by: Simon Guinot <sguinot@lacie.com>
Signed-off-by: Holger Brunck <holger.brunck@keymile.com>
Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Lennert Buytenhek <kernel@wantstofly.org>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Holger Brunck <Holger.Brunck@keymile.com>
Cc: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
Acked-by: Grant Likely <grant.likely@linaro.org>
Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: devicetree-discuss@lists.ozlabs.org
Cc: Rob Herring <rob.herring@calxeda.com>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Gregory Clement <gregory.clement@free-electrons.com>
Cc: Simon Guinot <simon@sequanux.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Cc: Jean-Francois Moine <moinejf@free.fr>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Rob Landley <rob@landley.net>
Cc: Maxime Ripard <maxime.ripard@free-electrons.com>
Link: http://lkml.kernel.org/r/20130506142539.082226607@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h       |  6 +++++-
 kernel/irq/generic-chip.c | 16 ++++++++++------
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index bc4e06611958..38709a3ab1c0 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -644,6 +644,8 @@ struct irq_chip_regs {
  * @regs:		Register offsets for this chip
  * @handler:		Flow handler associated with this chip
  * @type:		Chip can handle these flow types
+ * @mask_cache_priv:	Cached mask register private to the chip type
+ * @mask_cache:		Pointer to cached mask register
  *
  * A irq_generic_chip can have several instances of irq_chip_type when
  * it requires different functions and register offsets for different
@@ -654,6 +656,8 @@ struct irq_chip_type {
 	struct irq_chip_regs	regs;
 	irq_flow_handler_t	handler;
 	u32			type;
+	u32			mask_cache_priv;
+	u32			*mask_cache;
 };
 
 /**
@@ -662,7 +666,7 @@ struct irq_chip_type {
  * @reg_base:		Register base address (virtual)
  * @irq_base:		Interrupt base nr for this chip
  * @irq_cnt:		Number of interrupts handled by this chip
- * @mask_cache:		Cached mask register
+ * @mask_cache:		Cached mask register shared between all chip types
  * @type_cache:		Cached type register
  * @polarity_cache:	Cached polarity register
  * @wake_enabled:	Interrupt can wakeup from suspend
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 0e6ba789056c..113d9ebfe0aa 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
 
 	irq_gc_lock(gc);
 	irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
-	gc->mask_cache &= ~mask;
+	*ct->mask_cache &= ~mask;
 	irq_gc_unlock(gc);
 }
 
@@ -57,8 +57,8 @@ void irq_gc_mask_set_bit(struct irq_data *d)
 	u32 mask = 1 << (d->irq - gc->irq_base);
 
 	irq_gc_lock(gc);
-	gc->mask_cache |= mask;
-	irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask);
+	*ct->mask_cache |= mask;
+	irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
 	irq_gc_unlock(gc);
 }
 
@@ -76,8 +76,8 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
 	u32 mask = 1 << (d->irq - gc->irq_base);
 
 	irq_gc_lock(gc);
-	gc->mask_cache &= ~mask;
-	irq_reg_writel(gc->mask_cache, gc->reg_base + ct->regs.mask);
+	*ct->mask_cache &= ~mask;
+	irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
 	irq_gc_unlock(gc);
 }
 
@@ -96,7 +96,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
 
 	irq_gc_lock(gc);
 	irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
-	gc->mask_cache |= mask;
+	*ct->mask_cache |= mask;
 	irq_gc_unlock(gc);
 }
 
@@ -250,6 +250,10 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
 	if (flags & IRQ_GC_INIT_MASK_CACHE)
 		gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
 
+	/* Initialize mask cache pointer */
+	for (i = 0; i < gc->num_ct; i++)
+		ct[i].mask_cache = &gc->mask_cache;
+
 	for (i = gc->irq_base; msk; msk >>= 1, i++) {
 		if (!(msk & 0x01))
 			continue;
-- 
cgit v1.3-7-g2ca7


From af80b0fed67261dcba2ce2406db1d553d07cbe75 Mon Sep 17 00:00:00 2001
From: Gerlando Falauto <gerlando.falauto@keymile.com>
Date: Mon, 6 May 2013 14:30:21 +0000
Subject: genirq: Generic chip: Handle separate mask registers

There are cases where all irq_chip_type instances have separate mask
registers, making a shared mask register cache unsuitable for the
purpose.

Introduce a new flag IRQ_GC_MASK_CACHE_PER_TYPE. If set, point the per
chip mask pointer to the per chip private mask cache instead.

[ tglx: Simplified code, renamed flag and massaged changelog ]

Signed-off-by: Gerlando Falauto <gerlando.falauto@keymile.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Joey Oravec <joravec@drewtech.com>
Cc: Lennert Buytenhek <kernel@wantstofly.org>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Holger Brunck <Holger.Brunck@keymile.com>
Cc: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
Acked-by: Grant Likely <grant.likely@linaro.org>
Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: devicetree-discuss@lists.ozlabs.org
Cc: Rob Herring <rob.herring@calxeda.com>
Cc: Ben Dooks <ben-linux@fluff.org>
Cc: Gregory Clement <gregory.clement@free-electrons.com>
Cc: Simon Guinot <simon@sequanux.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Cc: Jean-Francois Moine <moinejf@free.fr>
Cc: Nicolas Pitre <nico@fluxnic.net>
Cc: Rob Landley <rob@landley.net>
Cc: Maxime Ripard <maxime.ripard@free-electrons.com>
Link: http://lkml.kernel.org/r/20130506142539.152569748@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h       |  2 ++
 kernel/irq/generic-chip.c | 17 ++++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 38709a3ab1c0..7f1f0157fd00 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -704,10 +704,12 @@ struct irq_chip_generic {
  * @IRQ_GC_INIT_NESTED_LOCK:	Set the lock class of the irqs to nested for
  *				irq chips which need to call irq_set_wake() on
  *				the parent irq. Usually GPIO implementations
+ * @IRQ_GC_MASK_CACHE_PER_TYPE:	Mask cache is chip type private
  */
 enum irq_gc_flags {
 	IRQ_GC_INIT_MASK_CACHE		= 1 << 0,
 	IRQ_GC_INIT_NESTED_LOCK		= 1 << 1,
+	IRQ_GC_MASK_CACHE_PER_TYPE	= 1 << 2,
 };
 
 /* Generic chip callback functions */
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 113d9ebfe0aa..da2a94191fc5 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -241,18 +241,21 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
 {
 	struct irq_chip_type *ct = gc->chip_types;
 	unsigned int i;
+	u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
 
 	raw_spin_lock(&gc_lock);
 	list_add_tail(&gc->list, &gc_list);
 	raw_spin_unlock(&gc_lock);
 
-	/* Init mask cache ? */
-	if (flags & IRQ_GC_INIT_MASK_CACHE)
-		gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
-
-	/* Initialize mask cache pointer */
-	for (i = 0; i < gc->num_ct; i++)
-		ct[i].mask_cache = &gc->mask_cache;
+	for (i = 0; i < gc->num_ct; i++) {
+		if (flags & IRQ_GC_MASK_CACHE_PER_TYPE) {
+			mskptr = &ct[i].mask_cache_priv;
+			mskreg = ct[i].regs.mask;
+		}
+		ct[i].mask_cache = mskptr;
+		if (flags & IRQ_GC_INIT_MASK_CACHE)
+			*mskptr = irq_reg_readl(gc->reg_base + mskreg);
+	}
 
 	for (i = gc->irq_base; msk; msk >>= 1, i++) {
 		if (!(msk & 0x01))
-- 
cgit v1.3-7-g2ca7


From 966dc736b819999cd2d3a6408d47d33b579f7d56 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 May 2013 14:30:22 +0000
Subject: genirq: Generic chip: Cache per irq bit mask

Cache the per irq bit mask instead of recalculating it over and over.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jean-Francois Moine <moinejf@free.fr>
Cc: devicetree-discuss@lists.ozlabs.org
Cc: Rob Herring <rob.herring@calxeda.com>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Gregory Clement <gregory.clement@free-electrons.com>
Cc: Gerlando Falauto <gerlando.falauto@keymile.com>
Cc: Rob Landley <rob@landley.net>
Acked-by: Grant Likely <grant.likely@linaro.org>
Cc: Maxime Ripard <maxime.ripard@free-electrons.com>
Cc: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Link: http://lkml.kernel.org/r/20130506142539.227119865@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h       |  4 ++++
 kernel/irq/generic-chip.c | 23 ++++++++++++++---------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7f1f0157fd00..d5fc7f5a49b8 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -119,6 +119,7 @@ struct irq_domain;
 
 /**
  * struct irq_data - per irq and irq chip data passed down to chip functions
+ * @mask:		precomputed bitmask for accessing the chip registers
  * @irq:		interrupt number
  * @hwirq:		hardware interrupt number, local to the interrupt domain
  * @node:		node index useful for balancing
@@ -138,6 +139,7 @@ struct irq_domain;
  * irq_data.
  */
 struct irq_data {
+	u32			mask;
 	unsigned int		irq;
 	unsigned long		hwirq;
 	unsigned int		node;
@@ -705,11 +707,13 @@ struct irq_chip_generic {
  *				irq chips which need to call irq_set_wake() on
  *				the parent irq. Usually GPIO implementations
  * @IRQ_GC_MASK_CACHE_PER_TYPE:	Mask cache is chip type private
+ * @IRQ_GC_NO_MASK:		Do not calculate irq_data->mask
  */
 enum irq_gc_flags {
 	IRQ_GC_INIT_MASK_CACHE		= 1 << 0,
 	IRQ_GC_INIT_NESTED_LOCK		= 1 << 1,
 	IRQ_GC_MASK_CACHE_PER_TYPE	= 1 << 2,
+	IRQ_GC_NO_MASK			= 1 << 3,
 };
 
 /* Generic chip callback functions */
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index da2a94191fc5..957155cebbac 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -35,7 +35,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = 1 << (d->irq - gc->irq_base);
+	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
 	irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
@@ -54,7 +54,7 @@ void irq_gc_mask_set_bit(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = 1 << (d->irq - gc->irq_base);
+	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
 	*ct->mask_cache |= mask;
@@ -73,7 +73,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = 1 << (d->irq - gc->irq_base);
+	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
 	*ct->mask_cache &= ~mask;
@@ -92,7 +92,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = 1 << (d->irq - gc->irq_base);
+	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
 	irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
@@ -108,7 +108,7 @@ void irq_gc_ack_set_bit(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = 1 << (d->irq - gc->irq_base);
+	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
 	irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
@@ -123,7 +123,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = ~(1 << (d->irq - gc->irq_base));
+	u32 mask = ~d->mask;
 
 	irq_gc_lock(gc);
 	irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
@@ -138,7 +138,7 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = 1 << (d->irq - gc->irq_base);
+	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
 	irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
@@ -154,7 +154,7 @@ void irq_gc_eoi(struct irq_data *d)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
 	struct irq_chip_type *ct = irq_data_get_chip_type(d);
-	u32 mask = 1 << (d->irq - gc->irq_base);
+	u32 mask = d->mask;
 
 	irq_gc_lock(gc);
 	irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
@@ -172,7 +172,7 @@ void irq_gc_eoi(struct irq_data *d)
 int irq_gc_set_wake(struct irq_data *d, unsigned int on)
 {
 	struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
-	u32 mask = 1 << (d->irq - gc->irq_base);
+	u32 mask = d->mask;
 
 	if (!(mask & gc->wake_enabled))
 		return -EINVAL;
@@ -264,6 +264,11 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
 		if (flags & IRQ_GC_INIT_NESTED_LOCK)
 			irq_set_lockdep_class(i, &irq_nested_lock_class);
 
+		if (!(flags & IRQ_GC_NO_MASK)) {
+			struct irq_data *d = irq_get_irq_data(i);
+
+			d->mask = 1 << (i - gc->irq_base);
+		}
 		irq_set_chip_and_handler(i, &ct->chip, ct->handler);
 		irq_set_chip_data(i, gc);
 		irq_modify_status(i, clr, set);
-- 
cgit v1.3-7-g2ca7


From d0051816e619f8f082582bec07ffa51bdb4f2104 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 May 2013 14:30:24 +0000
Subject: genirq: irqchip: Add a mask calculation function

Some chips have weird bit mask access patterns instead of the linear
you expect. Allow them to calculate the cached mask themself.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jean-Francois Moine <moinejf@free.fr>
Cc: devicetree-discuss@lists.ozlabs.org
Cc: Rob Herring <rob.herring@calxeda.com>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Gregory Clement <gregory.clement@free-electrons.com>
Cc: Gerlando Falauto <gerlando.falauto@keymile.com>
Cc: Rob Landley <rob@landley.net>
Acked-by: Grant Likely <grant.likely@linaro.org>
Cc: Maxime Ripard <maxime.ripard@free-electrons.com>
Cc: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Link: http://lkml.kernel.org/r/20130506142539.302898834@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h       | 3 +++
 kernel/irq/generic-chip.c | 8 ++++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index d5fc7f5a49b8..ab8169faaa65 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -296,6 +296,7 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
  * @irq_suspend:	function called from core code on suspend once per chip
  * @irq_resume:		function called from core code on resume once per chip
  * @irq_pm_shutdown:	function called from core code on shutdown once per chip
+ * @irq_calc_mask:	Optional function to set irq_data.mask for special cases
  * @irq_print_chip:	optional to print special chip info in show_interrupts
  * @flags:		chip specific flags
  */
@@ -327,6 +328,8 @@ struct irq_chip {
 	void		(*irq_resume)(struct irq_data *data);
 	void		(*irq_pm_shutdown)(struct irq_data *data);
 
+	void		(*irq_calc_mask)(struct irq_data *data);
+
 	void		(*irq_print_chip)(struct irq_data *data, struct seq_file *p);
 
 	unsigned long	flags;
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 957155cebbac..5068fe3ae1af 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -240,6 +240,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
 			    unsigned int set)
 {
 	struct irq_chip_type *ct = gc->chip_types;
+	struct irq_chip *chip = &ct->chip;
 	unsigned int i;
 	u32 *mskptr = &gc->mask_cache, mskreg = ct->regs.mask;
 
@@ -267,9 +268,12 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
 		if (!(flags & IRQ_GC_NO_MASK)) {
 			struct irq_data *d = irq_get_irq_data(i);
 
-			d->mask = 1 << (i - gc->irq_base);
+			if (chip->irq_calc_mask)
+				chip->irq_calc_mask(d);
+			else
+				d->mask = 1 << (i - gc->irq_base);
 		}
-		irq_set_chip_and_handler(i, &ct->chip, ct->handler);
+		irq_set_chip_and_handler(i, chip, ct->handler);
 		irq_set_chip_data(i, gc);
 		irq_modify_status(i, clr, set);
 	}
-- 
cgit v1.3-7-g2ca7


From 088f40b7b027dad6519712ff224a5798dd62a204 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 6 May 2013 14:30:27 +0000
Subject: genirq: Generic chip: Add linear irq domain support

Provide infrastructure for irq chip implementations which work on
linear irq domains.

- Interface to allocate multiple generic chips which are associated to
  the irq domain.

- Interface to get the generic chip pointer for a particular hardware
  interrupt in the domain.

- irq domain mapping function to install the chip for a particular
  interrupt.

Note: This lacks a removal function for now.

[ Sebastian Hesselbarth: Mask cache and pointer math fixups ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Russell King - ARM Linux <linux@arm.linux.org.uk>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Jean-Francois Moine <moinejf@free.fr>
Cc: devicetree-discuss@lists.ozlabs.org
Cc: Rob Herring <rob.herring@calxeda.com>
Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Cc: Gregory Clement <gregory.clement@free-electrons.com>
Cc: Gerlando Falauto <gerlando.falauto@keymile.com>
Cc: Rob Landley <rob@landley.net>
Acked-by: Grant Likely <grant.likely@linaro.org>
Cc: Maxime Ripard <maxime.ripard@free-electrons.com>
Cc: Ezequiel Garcia <ezequiel.garcia@free-electrons.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Link: http://lkml.kernel.org/r/20130506142539.450634298@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h       |  30 ++++++++
 include/linux/irqdomain.h |  12 +++
 kernel/irq/generic-chip.c | 187 ++++++++++++++++++++++++++++++++++++++++++++--
 kernel/irq/irqdomain.c    |   6 --
 4 files changed, 223 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index ab8169faaa65..af7052c6a45c 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -678,6 +678,8 @@ struct irq_chip_type {
  * @wake_active:	Interrupt is marked as an wakeup from suspend source
  * @num_ct:		Number of available irq_chip_type instances (usually 1)
  * @private:		Private data for non generic chip callbacks
+ * @installed:		bitfield to denote installed interrupts
+ * @domain:		irq domain pointer
  * @list:		List head for keeping track of instances
  * @chip_types:		Array of interrupt irq_chip_types
  *
@@ -699,6 +701,8 @@ struct irq_chip_generic {
 	u32			wake_active;
 	unsigned int		num_ct;
 	void			*private;
+	unsigned long		installed;
+	struct irq_domain	*domain;
 	struct list_head	list;
 	struct irq_chip_type	chip_types[0];
 };
@@ -719,6 +723,24 @@ enum irq_gc_flags {
 	IRQ_GC_NO_MASK			= 1 << 3,
 };
 
+/*
+ * struct irq_domain_chip_generic - Generic irq chip data structure for irq domains
+ * @irqs_per_chip:	Number of interrupts per chip
+ * @num_chips:		Number of chips
+ * @irq_flags_to_set:	IRQ* flags to set on irq setup
+ * @irq_flags_to_clear:	IRQ* flags to clear on irq setup
+ * @gc_flags:		Generic chip specific setup flags
+ * @gc:			Array of pointers to generic interrupt chips
+ */
+struct irq_domain_chip_generic {
+	unsigned int		irqs_per_chip;
+	unsigned int		num_chips;
+	unsigned int		irq_flags_to_clear;
+	unsigned int		irq_flags_to_set;
+	enum irq_gc_flags	gc_flags;
+	struct irq_chip_generic	*gc[0];
+};
+
 /* Generic chip callback functions */
 void irq_gc_noop(struct irq_data *d);
 void irq_gc_mask_disable_reg(struct irq_data *d);
@@ -742,6 +764,14 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type);
 void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
 			     unsigned int clr, unsigned int set);
 
+struct irq_chip_generic *irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq);
+int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+				   int num_ct, const char *name,
+				   irq_flow_handler_t handler,
+				   unsigned int clr, unsigned int set,
+				   enum irq_gc_flags flags);
+
+
 static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d)
 {
 	return container_of(d->chip, struct irq_chip_type, chip);
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 0d5b17bf5e51..ba2c708adcff 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -66,6 +66,10 @@ struct irq_domain_ops {
 		     unsigned long *out_hwirq, unsigned int *out_type);
 };
 
+extern struct irq_domain_ops irq_generic_chip_ops;
+
+struct irq_domain_chip_generic;
+
 /**
  * struct irq_domain - Hardware interrupt number translation object
  * @link: Element in global irq_domain list.
@@ -109,8 +113,16 @@ struct irq_domain {
 
 	/* Optional device node pointer */
 	struct device_node *of_node;
+	/* Optional pointer to generic interrupt chips */
+	struct irq_domain_chip_generic *gc;
 };
 
+#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
+				 * ie. legacy 8259, gets irqs 1..15 */
+#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
+#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
+#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
+
 #ifdef CONFIG_IRQ_DOMAIN
 struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
 					 unsigned int size,
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 3deb3333d53e..8743d62fded7 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -7,6 +7,7 @@
 #include <linux/irq.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/irqdomain.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/syscore_ops.h>
@@ -244,12 +245,156 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
 	}
 }
 
+/**
+ * irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain
+ * @d:			irq domain for which to allocate chips
+ * @irqs_per_chip:	Number of interrupts each chip handles
+ * @num_ct:		Number of irq_chip_type instances associated with this
+ * @name:		Name of the irq chip
+ * @handler:		Default flow handler associated with these chips
+ * @clr:		IRQ_* bits to clear in the mapping function
+ * @set:		IRQ_* bits to set in the mapping function
+ */
+int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
+				   int num_ct, const char *name,
+				   irq_flow_handler_t handler,
+				   unsigned int clr, unsigned int set,
+				   enum irq_gc_flags gcflags)
+{
+	struct irq_domain_chip_generic *dgc;
+	struct irq_chip_generic *gc;
+	int numchips, sz, i;
+	unsigned long flags;
+	void *tmp;
+
+	if (d->gc)
+		return -EBUSY;
+
+	if (d->revmap_type != IRQ_DOMAIN_MAP_LINEAR)
+		return -EINVAL;
+
+	numchips = d->revmap_data.linear.size / irqs_per_chip;
+	if (!numchips)
+		return -EINVAL;
+
+	/* Allocate a pointer, generic chip and chiptypes for each chip */
+	sz = sizeof(*dgc) + numchips * sizeof(gc);
+	sz += numchips * (sizeof(*gc) + num_ct * sizeof(struct irq_chip_type));
+
+	tmp = dgc = kzalloc(sz, GFP_KERNEL);
+	if (!dgc)
+		return -ENOMEM;
+	dgc->irqs_per_chip = irqs_per_chip;
+	dgc->num_chips = numchips;
+	dgc->irq_flags_to_set = set;
+	dgc->irq_flags_to_clear = clr;
+	dgc->gc_flags = gcflags;
+	d->gc = dgc;
+
+	/* Calc pointer to the first generic chip */
+	tmp += sizeof(*dgc) + numchips * sizeof(gc);
+	for (i = 0; i < numchips; i++) {
+		/* Store the pointer to the generic chip */
+		dgc->gc[i] = gc = tmp;
+		irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
+				      NULL, handler);
+		gc->domain = d;
+		raw_spin_lock_irqsave(&gc_lock, flags);
+		list_add_tail(&gc->list, &gc_list);
+		raw_spin_unlock_irqrestore(&gc_lock, flags);
+		/* Calc pointer to the next generic chip */
+		tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_alloc_domain_generic_chips);
+
+/**
+ * irq_get_domain_generic_chip - Get a pointer to the generic chip of a hw_irq
+ * @d:			irq domain pointer
+ * @hw_irq:		Hardware interrupt number
+ */
+struct irq_chip_generic *
+irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq)
+{
+	struct irq_domain_chip_generic *dgc = d->gc;
+	int idx;
+
+	if (!dgc)
+		return NULL;
+	idx = hw_irq / dgc->irqs_per_chip;
+	if (idx >= dgc->num_chips)
+		return NULL;
+	return dgc->gc[idx];
+}
+EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip);
+
 /*
  * Separate lockdep class for interrupt chip which can nest irq_desc
  * lock.
  */
 static struct lock_class_key irq_nested_lock_class;
 
+/**
+ * irq_map_generic_chip - Map a generic chip for an irq domain
+ */
+static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
+				irq_hw_number_t hw_irq)
+{
+	struct irq_data *data = irq_get_irq_data(virq);
+	struct irq_domain_chip_generic *dgc = d->gc;
+	struct irq_chip_generic *gc;
+	struct irq_chip_type *ct;
+	struct irq_chip *chip;
+	unsigned long flags;
+	int idx;
+
+	if (!d->gc)
+		return -ENODEV;
+
+	idx = hw_irq / dgc->irqs_per_chip;
+	if (idx >= dgc->num_chips)
+		return -EINVAL;
+	gc = dgc->gc[idx];
+
+	idx = hw_irq % dgc->irqs_per_chip;
+
+	if (test_bit(idx, &gc->installed))
+		return -EBUSY;
+
+	ct = gc->chip_types;
+	chip = &ct->chip;
+
+	/* We only init the cache for the first mapping of a generic chip */
+	if (!gc->installed) {
+		raw_spin_lock_irqsave(&gc->lock, flags);
+		irq_gc_init_mask_cache(gc, dgc->gc_flags);
+		raw_spin_unlock_irqrestore(&gc->lock, flags);
+	}
+
+	/* Mark the interrupt as installed */
+	set_bit(idx, &gc->installed);
+
+	if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK)
+		irq_set_lockdep_class(virq, &irq_nested_lock_class);
+
+	if (chip->irq_calc_mask)
+		chip->irq_calc_mask(data);
+	else
+		data->mask = 1 << idx;
+
+	irq_set_chip_and_handler(virq, chip, ct->handler);
+	irq_set_chip_data(virq, gc);
+	irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
+	return 0;
+}
+
+struct irq_domain_ops irq_generic_chip_ops = {
+	.map	= irq_map_generic_chip,
+	.xlate	= irq_domain_xlate_onetwocell,
+};
+EXPORT_SYMBOL_GPL(irq_generic_chip_ops);
+
 /**
  * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
  * @gc:		Generic irq chip holding all data
@@ -354,6 +499,24 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
 }
 EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
 
+static struct irq_data *irq_gc_get_irq_data(struct irq_chip_generic *gc)
+{
+	unsigned int virq;
+
+	if (!gc->domain)
+		return irq_get_irq_data(gc->irq_base);
+
+	/*
+	 * We don't know which of the irqs has been actually
+	 * installed. Use the first one.
+	 */
+	if (!gc->installed)
+		return NULL;
+
+	virq = irq_find_mapping(gc->domain, gc->irq_base + __ffs(gc->installed));
+	return virq ? irq_get_irq_data(virq) : NULL;
+}
+
 #ifdef CONFIG_PM
 static int irq_gc_suspend(void)
 {
@@ -362,8 +525,12 @@ static int irq_gc_suspend(void)
 	list_for_each_entry(gc, &gc_list, list) {
 		struct irq_chip_type *ct = gc->chip_types;
 
-		if (ct->chip.irq_suspend)
-			ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
+		if (ct->chip.irq_suspend) {
+			struct irq_data *data = irq_gc_get_irq_data(gc);
+
+			if (data)
+				ct->chip.irq_suspend(data);
+		}
 	}
 	return 0;
 }
@@ -375,8 +542,12 @@ static void irq_gc_resume(void)
 	list_for_each_entry(gc, &gc_list, list) {
 		struct irq_chip_type *ct = gc->chip_types;
 
-		if (ct->chip.irq_resume)
-			ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
+		if (ct->chip.irq_resume) {
+			struct irq_data *data = irq_gc_get_irq_data(gc);
+
+			if (data)
+				ct->chip.irq_resume(data);
+		}
 	}
 }
 #else
@@ -391,8 +562,12 @@ static void irq_gc_shutdown(void)
 	list_for_each_entry(gc, &gc_list, list) {
 		struct irq_chip_type *ct = gc->chip_types;
 
-		if (ct->chip.irq_pm_shutdown)
-			ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
+		if (ct->chip.irq_pm_shutdown) {
+			struct irq_data *data = irq_gc_get_irq_data(gc);
+
+			if (data)
+				ct->chip.irq_pm_shutdown(data);
+		}
 	}
 }
 
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 5a83dde8ca0c..1db9e70f5488 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -16,12 +16,6 @@
 #include <linux/smp.h>
 #include <linux/fs.h>
 
-#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
-				 * ie. legacy 8259, gets irqs 1..15 */
-#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
-#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
-#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
-
 static LIST_HEAD(irq_domain_list);
 static DEFINE_MUTEX(irq_domain_mutex);
 
-- 
cgit v1.3-7-g2ca7


From e8bd834f73714378ef110a64287db1b77033c8da Mon Sep 17 00:00:00 2001
From: Grant Likely <grant.likely@linaro.org>
Date: Wed, 29 May 2013 03:10:52 +0100
Subject: genirq: irqchip: Add mask to block out invalid irqs

Some controllers have irqs that aren't wired up and must never be used.
For the generic chip attached to an irq_domain this provides a mask that
can be used to block out particular irqs so that they never get mapped.

Signed-off-by: Grant Likely <grant.likely@linaro.org>
Link: http://lkml.kernel.org/r/1369793454-19197-2-git-send-email-grant.likely@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h       | 2 ++
 kernel/irq/generic-chip.c | 3 +++
 2 files changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index af7052c6a45c..298a9b9ce675 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -679,6 +679,7 @@ struct irq_chip_type {
  * @num_ct:		Number of available irq_chip_type instances (usually 1)
  * @private:		Private data for non generic chip callbacks
  * @installed:		bitfield to denote installed interrupts
+ * @unused:		bitfield to denote unused interrupts
  * @domain:		irq domain pointer
  * @list:		List head for keeping track of instances
  * @chip_types:		Array of interrupt irq_chip_types
@@ -702,6 +703,7 @@ struct irq_chip_generic {
 	unsigned int		num_ct;
 	void			*private;
 	unsigned long		installed;
+	unsigned long		unused;
 	struct irq_domain	*domain;
 	struct list_head	list;
 	struct irq_chip_type	chip_types[0];
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 8743d62fded7..95575d8d5392 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -359,6 +359,9 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
 
 	idx = hw_irq % dgc->irqs_per_chip;
 
+	if (test_bit(idx, &gc->unused))
+		return -ENOTSUPP;
+
 	if (test_bit(idx, &gc->installed))
 		return -EBUSY;
 
-- 
cgit v1.3-7-g2ca7


From 061cec925f212f145516e826f39962624a738ded Mon Sep 17 00:00:00 2001
From: Prashant Gaikwad <pgaikwad@nvidia.com>
Date: Mon, 27 May 2013 13:10:09 +0530
Subject: clk: tegra: Use common of_clk_init function

Use common of_clk_init() function for clocks initialization.

Signed-off-by: Prashant Gaikwad <pgaikwad@nvidia.com>
Reviewed-by: Thierry Reding <thierry.reding@gmail.com>
Acked-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 arch/arm/mach-tegra/common.c     |  4 ++--
 drivers/clk/tegra/clk-tegra114.c |  3 ++-
 drivers/clk/tegra/clk-tegra20.c  |  3 ++-
 drivers/clk/tegra/clk-tegra30.c  |  3 ++-
 drivers/clk/tegra/clk.c          | 12 ------------
 drivers/clk/tegra/clk.h          | 18 ------------------
 include/linux/clk/tegra.h        |  1 -
 7 files changed, 8 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-tegra/common.c b/arch/arm/mach-tegra/common.c
index 9f852c6fe5b9..95ce2a043d0b 100644
--- a/arch/arm/mach-tegra/common.c
+++ b/arch/arm/mach-tegra/common.c
@@ -23,7 +23,7 @@
 #include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/irqchip.h>
-#include <linux/clk/tegra.h>
+#include <linux/clk-provider.h>
 
 #include <asm/hardware/cache-l2x0.h>
 
@@ -59,7 +59,7 @@ u32 tegra_uart_config[4] = {
 #ifdef CONFIG_OF
 void __init tegra_dt_init_irq(void)
 {
-	tegra_clocks_init();
+	of_clk_init(NULL);
 	tegra_pmc_init();
 	tegra_init_irq();
 	irqchip_init();
diff --git a/drivers/clk/tegra/clk-tegra114.c b/drivers/clk/tegra/clk-tegra114.c
index 772fc2e53371..86ee05d93eff 100644
--- a/drivers/clk/tegra/clk-tegra114.c
+++ b/drivers/clk/tegra/clk-tegra114.c
@@ -2033,7 +2033,7 @@ static void __init tegra114_clock_apply_init_table(void)
 	tegra_init_from_table(init_table, clks, clk_max);
 }
 
-void __init tegra114_clock_init(struct device_node *np)
+static void __init tegra114_clock_init(struct device_node *np)
 {
 	struct device_node *node;
 	int i;
@@ -2086,3 +2086,4 @@ void __init tegra114_clock_init(struct device_node *np)
 
 	tegra_cpu_car_ops = &tegra114_cpu_car_ops;
 }
+CLK_OF_DECLARE(tegra114, "nvidia,tegra114-car", tegra114_clock_init);
diff --git a/drivers/clk/tegra/clk-tegra20.c b/drivers/clk/tegra/clk-tegra20.c
index 075db0c99edb..759ca47be753 100644
--- a/drivers/clk/tegra/clk-tegra20.c
+++ b/drivers/clk/tegra/clk-tegra20.c
@@ -1287,7 +1287,7 @@ static const struct of_device_id pmc_match[] __initconst = {
 	{},
 };
 
-void __init tegra20_clock_init(struct device_node *np)
+static void __init tegra20_clock_init(struct device_node *np)
 {
 	int i;
 	struct device_node *node;
@@ -1339,3 +1339,4 @@ void __init tegra20_clock_init(struct device_node *np)
 
 	tegra_cpu_car_ops = &tegra20_cpu_car_ops;
 }
+CLK_OF_DECLARE(tegra20, "nvidia,tegra20-car", tegra20_clock_init);
diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c
index a11a7d9633d4..b62e140b9376 100644
--- a/drivers/clk/tegra/clk-tegra30.c
+++ b/drivers/clk/tegra/clk-tegra30.c
@@ -1953,7 +1953,7 @@ static const struct of_device_id pmc_match[] __initconst = {
 	{},
 };
 
-void __init tegra30_clock_init(struct device_node *np)
+static void __init tegra30_clock_init(struct device_node *np)
 {
 	struct device_node *node;
 	int i;
@@ -2004,3 +2004,4 @@ void __init tegra30_clock_init(struct device_node *np)
 
 	tegra_cpu_car_ops = &tegra30_cpu_car_ops;
 }
+CLK_OF_DECLARE(tegra30, "nvidia,tegra30-car", tegra30_clock_init);
diff --git a/drivers/clk/tegra/clk.c b/drivers/clk/tegra/clk.c
index 923ca7ee4694..86581ac1fd69 100644
--- a/drivers/clk/tegra/clk.c
+++ b/drivers/clk/tegra/clk.c
@@ -74,18 +74,6 @@ void __init tegra_init_from_table(struct tegra_clk_init_table *tbl,
 	}
 }
 
-static const struct of_device_id tegra_dt_clk_match[] = {
-	{ .compatible = "nvidia,tegra20-car", .data = tegra20_clock_init },
-	{ .compatible = "nvidia,tegra30-car", .data = tegra30_clock_init },
-	{ .compatible = "nvidia,tegra114-car", .data = tegra114_clock_init },
-	{ }
-};
-
-void __init tegra_clocks_init(void)
-{
-	of_clk_init(tegra_dt_clk_match);
-}
-
 tegra_clk_apply_init_table_func tegra_clk_apply_init_table;
 
 void __init tegra_clocks_apply_init_table(void)
diff --git a/drivers/clk/tegra/clk.h b/drivers/clk/tegra/clk.h
index e0565620d68e..11278a80e63e 100644
--- a/drivers/clk/tegra/clk.h
+++ b/drivers/clk/tegra/clk.h
@@ -571,24 +571,6 @@ void tegra_init_from_table(struct tegra_clk_init_table *tbl,
 void tegra_init_dup_clks(struct tegra_clk_duplicate *dup_list,
 		struct clk *clks[], int clk_max);
 
-#ifdef CONFIG_ARCH_TEGRA_2x_SOC
-void tegra20_clock_init(struct device_node *np);
-#else
-static inline void tegra20_clock_init(struct device_node *np) {}
-#endif /* CONFIG_ARCH_TEGRA_2x_SOC */
-
-#ifdef CONFIG_ARCH_TEGRA_3x_SOC
-void tegra30_clock_init(struct device_node *np);
-#else
-static inline void tegra30_clock_init(struct device_node *np) {}
-#endif /* CONFIG_ARCH_TEGRA_3x_SOC */
-
-#ifdef CONFIG_ARCH_TEGRA_114_SOC
-void tegra114_clock_init(struct device_node *np);
-#else
-static inline void tegra114_clock_init(struct device_node *np) {}
-#endif /* CONFIG_ARCH_TEGRA114_SOC */
-
 typedef void (*tegra_clk_apply_init_table_func)(void);
 extern tegra_clk_apply_init_table_func tegra_clk_apply_init_table;
 
diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index 642789baec74..3670a4f5402b 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -122,7 +122,6 @@ static inline void tegra_cpu_clock_resume(void)
 
 void tegra_periph_reset_deassert(struct clk *c);
 void tegra_periph_reset_assert(struct clk *c);
-void tegra_clocks_init(void);
 void tegra_clocks_apply_init_table(void);
 
 #endif /* __LINUX_CLK_TEGRA_H_ */
-- 
cgit v1.3-7-g2ca7


From 2da1c4bf765cb32024e5db6fa75dab92916fa3b1 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@opensource.wolfsonmicro.com>
Date: Thu, 30 May 2013 13:42:29 +0100
Subject: ASoC: wm8994: Allow debounce before MICDET identification

For systems which do not have a jack detection feature allow some debounce
to be specified before we perform accessory identification, improving
robustness without impacting button detection responsiveness.

Signed-off-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
---
 include/linux/mfd/wm8994/pdata.h |  5 +++++
 sound/soc/codecs/wm8994.c        | 35 +++++++++++++++++++++++++++++++++--
 sound/soc/codecs/wm8994.h        |  2 ++
 3 files changed, 40 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/wm8994/pdata.h b/include/linux/mfd/wm8994/pdata.h
index 68e776594889..b5046f6313a9 100644
--- a/include/linux/mfd/wm8994/pdata.h
+++ b/include/linux/mfd/wm8994/pdata.h
@@ -182,6 +182,11 @@ struct wm8994_pdata {
 	 */
 	int micdet_delay;
 
+	/* Delay between microphone detect completing and reporting on
+	 * insert (specified in ms)
+	 */
+	int mic_id_delay;
+
 	/* IRQ for microphone detection if brought out directly as a
 	 * signal.
 	 */
diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c
index 5d046b1c5a7e..0f58f003dc1a 100644
--- a/sound/soc/codecs/wm8994.c
+++ b/sound/soc/codecs/wm8994.c
@@ -3660,6 +3660,8 @@ static irqreturn_t wm1811_jackdet_irq(int irq, void *data)
 
 	pm_runtime_get_sync(codec->dev);
 
+	cancel_delayed_work_sync(&wm8994->mic_complete_work);
+
 	mutex_lock(&wm8994->accdet_lock);
 
 	reg = snd_soc_read(codec, WM1811_JACKDET_CTRL);
@@ -3842,11 +3844,33 @@ int wm8958_mic_detect(struct snd_soc_codec *codec, struct snd_soc_jack *jack,
 }
 EXPORT_SYMBOL_GPL(wm8958_mic_detect);
 
+static void wm8958_mic_work(struct work_struct *work)
+{
+	struct wm8994_priv *wm8994 = container_of(work,
+						  struct wm8994_priv,
+						  mic_complete_work.work);
+	struct snd_soc_codec *codec = wm8994->hubs.codec;
+
+	dev_crit(codec->dev, "MIC WORK %x\n", wm8994->mic_status);
+
+	pm_runtime_get_sync(codec->dev);
+
+	mutex_lock(&wm8994->accdet_lock);
+
+	wm8994->mic_id_cb(wm8994->mic_id_cb_data, wm8994->mic_status);
+
+	mutex_unlock(&wm8994->accdet_lock);
+
+	pm_runtime_put(codec->dev);
+
+	dev_crit(codec->dev, "MIC WORK %x DONE\n", wm8994->mic_status);
+}
+
 static irqreturn_t wm8958_mic_irq(int irq, void *data)
 {
 	struct wm8994_priv *wm8994 = data;
 	struct snd_soc_codec *codec = wm8994->hubs.codec;
-	int reg, count, ret;
+	int reg, count, ret, id_delay;
 
 	/*
 	 * Jack detection may have detected a removal simulataneously
@@ -3856,6 +3880,7 @@ static irqreturn_t wm8958_mic_irq(int irq, void *data)
 	if (!(snd_soc_read(codec, WM8958_MIC_DETECT_1) & WM8958_MICD_ENA))
 		return IRQ_HANDLED;
 
+	cancel_delayed_work_sync(&wm8994->mic_complete_work);
 	cancel_delayed_work_sync(&wm8994->open_circuit_work);
 
 	pm_runtime_get_sync(codec->dev);
@@ -3904,8 +3929,12 @@ static irqreturn_t wm8958_mic_irq(int irq, void *data)
 		}
 	}
 
+	wm8994->mic_status = reg;
+	id_delay = wm8994->wm8994->pdata.mic_id_delay;
+
 	if (wm8994->mic_detecting)
-		wm8994->mic_id_cb(wm8994->mic_id_cb_data, reg);
+		schedule_delayed_work(&wm8994->mic_complete_work,
+				      msecs_to_jiffies(id_delay));
 	else
 		wm8958_button_det(codec, reg);
 
@@ -3971,6 +4000,8 @@ static int wm8994_codec_probe(struct snd_soc_codec *codec)
 		break;
 	}
 
+	INIT_DELAYED_WORK(&wm8994->mic_complete_work, wm8958_mic_work);
+
 	for (i = 0; i < ARRAY_SIZE(wm8994->fll_locked); i++)
 		init_completion(&wm8994->fll_locked[i]);
 
diff --git a/sound/soc/codecs/wm8994.h b/sound/soc/codecs/wm8994.h
index 9d19a9185d35..6536f8d45ac6 100644
--- a/sound/soc/codecs/wm8994.h
+++ b/sound/soc/codecs/wm8994.h
@@ -135,6 +135,8 @@ struct wm8994_priv {
 	struct wm8994_micdet micdet[2];
 	struct delayed_work mic_work;
 	struct delayed_work open_circuit_work;
+	struct delayed_work mic_complete_work;
+	u16 mic_status;
 	bool mic_detecting;
 	bool jack_mic;
 	int btn_mask;
-- 
cgit v1.3-7-g2ca7


From 4dd9572abc224019a042b662fb0eececca283cb9 Mon Sep 17 00:00:00 2001
From: Stephen Warren <swarren@nvidia.com>
Date: Thu, 30 May 2013 09:59:39 -0600
Subject: spi: fix undefined behaviour in SPI_BPW_RANGE_MASK

The parameters to SPI_BPW_RANGE_MASK() are in the range 1..32. If 32 is
used as a parameter, part of the expression is "1 << 32". Since 32 is >=
the size of the type in use, such a shift is undefined behaviour. Add
macro SPI_BIT_MASK to Implement a special case and thus avoid undefined
behaviour. Use this new macro rather than BIT() when implementing
SPI_BPW_RANGE_MASK().

This fixes build warnings such as:
drivers/spi/spi-gpio.c:446:2: warning: left shift count >= width of type [enabled by default]

SPI_BPW_MASK() already avoids this, since its parameter is also in range
1..32, yet it only shifts by up to one less than the input parameter.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 include/linux/spi/spi.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 173725622252..3e9479134d9d 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -309,7 +309,8 @@ struct spi_master {
 	/* bitmask of supported bits_per_word for transfers */
 	u32			bits_per_word_mask;
 #define SPI_BPW_MASK(bits) BIT((bits) - 1)
-#define SPI_BPW_RANGE_MASK(min, max) ((BIT(max) - 1) - (BIT(min) - 1))
+#define SPI_BIT_MASK(bits) (((bits) == 32) ? ~0UL : (BIT(bits) - 1))
+#define SPI_BPW_RANGE_MASK(min, max) (SPI_BIT_MASK(max) - SPI_BIT_MASK(min))
 
 	/* other constraints relevant to this driver */
 	u16			flags;
-- 
cgit v1.3-7-g2ca7


From eca8960a8e0f48dc62c1063bcc33a626455d766e Mon Sep 17 00:00:00 2001
From: Stephen Warren <swarren@nvidia.com>
Date: Thu, 30 May 2013 09:59:40 -0600
Subject: spi: fix incorrect handling of min param in SPI_BPW_RANGE_MASK

SPI_BPW_RANGE_MASK is intended to work by calculating two masks; one
representing support for all bits up-to-and-including the "max" supported
value, and one representing support for all bits up-to-but-not-including
the "min" supported value, and then taking the difference between the
two, resulting in a mask representing support for all bits between
(inclusive) the min and max values.

However, the second mask ended up representing all bits up-to-and-
including rather up-to-but-not-including. Fix this bug.

Signed-off-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 include/linux/spi/spi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index 3e9479134d9d..28e440be1c07 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -310,7 +310,7 @@ struct spi_master {
 	u32			bits_per_word_mask;
 #define SPI_BPW_MASK(bits) BIT((bits) - 1)
 #define SPI_BIT_MASK(bits) (((bits) == 32) ? ~0UL : (BIT(bits) - 1))
-#define SPI_BPW_RANGE_MASK(min, max) (SPI_BIT_MASK(max) - SPI_BIT_MASK(min))
+#define SPI_BPW_RANGE_MASK(min, max) (SPI_BIT_MASK(max) - SPI_BIT_MASK(min - 1))
 
 	/* other constraints relevant to this driver */
 	u16			flags;
-- 
cgit v1.3-7-g2ca7


From b2c064b25ad07169b2892a733918e6b941bf3366 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 23 May 2013 10:38:55 +0200
Subject: Driver core / memory: Simplify __memory_block_change_state()

As noted by Tang Chen, the last_online field in struct memory_block
introduced by commit 4960e05 (Driver core: Introduce offline/online
callbacks for memory blocks) is not really necessary, because
online_pages() restores the previous state if passed ONLINE_KEEP as
the last argument.  Therefore, remove that field along with the code
referring to it.

References: http://marc.info/?l=linux-kernel&m=136919777305599&w=2
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Tang Chen <tangchen@cn.fujitsu.com>
---
 drivers/base/memory.c  | 11 ++---------
 include/linux/memory.h |  1 -
 2 files changed, 2 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c8f3b63fcacd..c7092bc3c01e 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -291,13 +291,7 @@ static int __memory_block_change_state(struct memory_block *mem,
 		mem->state = MEM_GOING_OFFLINE;
 
 	ret = memory_block_action(mem->start_section_nr, to_state, online_type);
-	if (ret) {
-		mem->state = from_state_req;
-	} else {
-		mem->state = to_state;
-		if (to_state == MEM_ONLINE)
-			mem->last_online = online_type;
-	}
+	mem->state = ret ? from_state_req : to_state;
 	return ret;
 }
 
@@ -310,7 +304,7 @@ static int memory_subsys_online(struct device *dev)
 
 	ret = mem->state == MEM_ONLINE ? 0 :
 		__memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE,
-					    mem->last_online);
+					    ONLINE_KEEP);
 
 	mutex_unlock(&mem->state_mutex);
 	return ret;
@@ -618,7 +612,6 @@ static int init_memory_block(struct memory_block **memory,
 			base_memory_block_id(scn_nr) * sections_per_block;
 	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
 	mem->state = state;
-	mem->last_online = ONLINE_KEEP;
 	mem->section_count++;
 	mutex_init(&mem->state_mutex);
 	start_pfn = section_nr_to_pfn(mem->start_section_nr);
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 3d5346583022..85c31a8e2904 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -26,7 +26,6 @@ struct memory_block {
 	unsigned long start_section_nr;
 	unsigned long end_section_nr;
 	unsigned long state;
-	int last_online;
 	int section_count;
 
 	/*
-- 
cgit v1.3-7-g2ca7


From ea50be59345a2b714fd3ed43e1bba89906c177c3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 23 May 2013 10:41:50 +0200
Subject: Driver core / MM: Drop offline_memory_block()

Since offline_memory_block(mem) is functionally equivalent to
device_offline(&mem->dev), make the only caller of the former use
the latter instead and drop offline_memory_block() entirely.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Acked-by: Toshi Kani <toshi.kani@hp.com>
---
 drivers/base/memory.c          | 21 ---------------------
 include/linux/memory_hotplug.h |  1 -
 mm/memory_hotplug.c            |  2 +-
 3 files changed, 1 insertion(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c7092bc3c01e..4ebf97f99fae 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -728,27 +728,6 @@ int unregister_memory_section(struct mem_section *section)
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
-/*
- * offline one memory block. If the memory block has been offlined, do nothing.
- *
- * Call under device_hotplug_lock.
- */
-int offline_memory_block(struct memory_block *mem)
-{
-	int ret = 0;
-
-	mutex_lock(&mem->state_mutex);
-	if (mem->state != MEM_OFFLINE) {
-		ret = __memory_block_change_state_uevent(mem, MEM_OFFLINE,
-							 MEM_ONLINE, -1);
-		if (!ret)
-			mem->dev.offline = true;
-	}
-	mutex_unlock(&mem->state_mutex);
-
-	return ret;
-}
-
 /* return true if the memory block is offlined, otherwise, return false */
 bool is_memblock_offlined(struct memory_block *mem)
 {
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 2975b7b2a9d8..ae5480a00963 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -251,7 +251,6 @@ extern int mem_online_node(int nid);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
-extern int offline_memory_block(struct memory_block *mem);
 extern bool is_memblock_offlined(struct memory_block *mem);
 extern int remove_memory(int nid, u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5ea1287ee91f..a39841d240e8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1680,7 +1680,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 static int offline_memory_block_cb(struct memory_block *mem, void *arg)
 {
 	int *ret = arg;
-	int error = offline_memory_block(mem);
+	int error = device_offline(&mem->dev);
 
 	if (error != 0 && *ret == 0)
 		*ret = error;
-- 
cgit v1.3-7-g2ca7


From 242831eb15a06fa4414eaa705fdc6dd432ab98d1 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 27 May 2013 12:58:46 +0200
Subject: Memory hotplug / ACPI: Simplify memory removal

Now that the memory offlining should be taken care of by the
companion device offlining code in acpi_scan_hot_remove(), the
ACPI memory hotplug driver doesn't need to offline it in
remove_memory() any more.  Moreover, since the return value of
remove_memory() is not used, it's better to make it be a void
function and trigger a BUG() if the memory scheduled for removal is
not offline.

Change the code in accordance with the above observations.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Toshi Kani <toshi.kani@hp.com>
---
 drivers/acpi/acpi_memhotplug.c | 13 ++------
 include/linux/memory_hotplug.h |  2 +-
 mm/memory_hotplug.c            | 71 +++++-------------------------------------
 3 files changed, 12 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 5590db12028e..c711d1144044 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -271,13 +271,11 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 	return 0;
 }
 
-static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
+static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 {
 	acpi_handle handle = mem_device->device->handle;
-	int result = 0, nid;
 	struct acpi_memory_info *info, *n;
-
-	nid = acpi_get_node(handle);
+	int nid = acpi_get_node(handle);
 
 	list_for_each_entry_safe(info, n, &mem_device->res_list, list) {
 		if (!info->enabled)
@@ -287,15 +285,10 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 			nid = memory_add_physaddr_to_nid(info->start_addr);
 
 		acpi_unbind_memory_blocks(info, handle);
-		result = remove_memory(nid, info->start_addr, info->length);
-		if (result)
-			return result;
-
+		remove_memory(nid, info->start_addr, info->length);
 		list_del(&info->list);
 		kfree(info);
 	}
-
-	return result;
 }
 
 static void acpi_memory_device_free(struct acpi_memory_device *mem_device)
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ae5480a00963..00569fb4ed6a 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -252,7 +252,7 @@ extern int add_memory(int nid, u64 start, u64 size);
 extern int arch_add_memory(int nid, u64 start, u64 size);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern bool is_memblock_offlined(struct memory_block *mem);
-extern int remove_memory(int nid, u64 start, u64 size);
+extern void remove_memory(int nid, u64 start, u64 size);
 extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
 								int nr_pages);
 extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a39841d240e8..7026fbc42aaa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1670,24 +1670,6 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-/**
- * offline_memory_block_cb - callback function for offlining memory block
- * @mem: the memory block to be offlined
- * @arg: buffer to hold error msg
- *
- * Always return 0, and put the error msg in arg if any.
- */
-static int offline_memory_block_cb(struct memory_block *mem, void *arg)
-{
-	int *ret = arg;
-	int error = device_offline(&mem->dev);
-
-	if (error != 0 && *ret == 0)
-		*ret = error;
-
-	return 0;
-}
-
 static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
 {
 	int ret = !is_memblock_offlined(mem);
@@ -1813,54 +1795,22 @@ void try_offline_node(int nid)
 }
 EXPORT_SYMBOL(try_offline_node);
 
-int __ref remove_memory(int nid, u64 start, u64 size)
+void __ref remove_memory(int nid, u64 start, u64 size)
 {
-	unsigned long start_pfn, end_pfn;
-	int ret = 0;
-	int retry = 1;
-
-	start_pfn = PFN_DOWN(start);
-	end_pfn = PFN_UP(start + size - 1);
-
-	/*
-	 * When CONFIG_MEMCG is on, one memory block may be used by other
-	 * blocks to store page cgroup when onlining pages. But we don't know
-	 * in what order pages are onlined. So we iterate twice to offline
-	 * memory:
-	 * 1st iterate: offline every non primary memory block.
-	 * 2nd iterate: offline primary (i.e. first added) memory block.
-	 */
-repeat:
-	walk_memory_range(start_pfn, end_pfn, &ret,
-			  offline_memory_block_cb);
-	if (ret) {
-		if (!retry)
-			return ret;
-
-		retry = 0;
-		ret = 0;
-		goto repeat;
-	}
+	int ret;
 
 	lock_memory_hotplug();
 
 	/*
-	 * we have offlined all memory blocks like this:
-	 *   1. lock memory hotplug
-	 *   2. offline a memory block
-	 *   3. unlock memory hotplug
-	 *
-	 * repeat step1-3 to offline the memory block. All memory blocks
-	 * must be offlined before removing memory. But we don't hold the
-	 * lock in the whole operation. So we should check whether all
-	 * memory blocks are offlined.
+	 * All memory blocks must be offlined before removing memory.  Check
+	 * whether all memory blocks in question are offline and trigger a BUG()
+	 * if this is not the case.
 	 */
-
-	ret = walk_memory_range(start_pfn, end_pfn, NULL,
+	ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
 				is_memblock_offlined_cb);
 	if (ret) {
 		unlock_memory_hotplug();
-		return ret;
+		BUG();
 	}
 
 	/* remove memmap entry */
@@ -1871,17 +1821,12 @@ repeat:
 	try_offline_node(nid);
 
 	unlock_memory_hotplug();
-
-	return 0;
 }
 #else
 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 {
 	return -EINVAL;
 }
-int remove_memory(int nid, u64 start, u64 size)
-{
-	return -EINVAL;
-}
+void remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 EXPORT_SYMBOL_GPL(remove_memory);
-- 
cgit v1.3-7-g2ca7


From aba6efc47133af4941cda16e690f71b7ad894da2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Sat, 1 Jun 2013 22:24:07 +0200
Subject: Memory hotplug: Move alternative function definitions to header

Move the definitions of offline_pages() and remove_memory()
for CONFIG_MEMORY_HOTREMOVE to memory_hotplug.h, where they belong,
and make them static inline.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/memory_hotplug.h | 9 +++++++++
 mm/memory_hotplug.c            | 8 +-------
 2 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 00569fb4ed6a..dd38e62b84d2 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -234,6 +234,8 @@ static inline void unlock_memory_hotplug(void) {}
 
 extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
 extern void try_offline_node(int nid);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
+extern void remove_memory(int nid, u64 start, u64 size);
 
 #else
 static inline int is_mem_section_removable(unsigned long pfn,
@@ -243,6 +245,13 @@ static inline int is_mem_section_removable(unsigned long pfn,
 }
 
 static inline void try_offline_node(int nid) {}
+
+static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+	return -EINVAL;
+}
+
+static inline void remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7026fbc42aaa..490e3d401e2c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1822,11 +1822,5 @@ void __ref remove_memory(int nid, u64 start, u64 size)
 
 	unlock_memory_hotplug();
 }
-#else
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-{
-	return -EINVAL;
-}
-void remove_memory(int nid, u64 start, u64 size) {}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
 EXPORT_SYMBOL_GPL(remove_memory);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
-- 
cgit v1.3-7-g2ca7


From de9c739435b47e7d39e448b2cc4c8154cabc9e5a Mon Sep 17 00:00:00 2001
From: MyungJoo Ham <myungjoo.ham@samsung.com>
Date: Tue, 5 Feb 2013 18:40:17 +0900
Subject: PM / devfreq: add comments and Documentation

- Added missing ABI documents
- Added comments to clarify the objectives of functions

Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Acked-by: Nishanth Menon <nm@ti.com>
Acked-by: Rajagopal Venkat <rajagopal.venkat@linaro.org>
---
 Documentation/ABI/testing/sysfs-class-devfreq | 20 ++++++++++++++++++++
 drivers/devfreq/devfreq.c                     | 10 ++++++++++
 include/linux/devfreq.h                       |  2 ++
 3 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-class-devfreq b/Documentation/ABI/testing/sysfs-class-devfreq
index 0ba6ea2f89d9..ee39acacf6f8 100644
--- a/Documentation/ABI/testing/sysfs-class-devfreq
+++ b/Documentation/ABI/testing/sysfs-class-devfreq
@@ -78,3 +78,23 @@ Contact:	Nishanth Menon <nm@ti.com>
 Description:
 		The /sys/class/devfreq/.../available_governors shows
 		currently available governors in the system.
+
+What:		/sys/class/devfreq/.../min_freq
+Date:		January 2013
+Contact:	MyungJoo Ham <myungjoo.ham@samsung.com>
+Description:
+		The /sys/class/devfreq/.../min_freq shows and stores
+		the minimum frequency requested by users. It is 0 if
+		the user does not care. min_freq overrides the
+		frequency requested by governors.
+
+What:		/sys/class/devfreq/.../max_freq
+Date:		January 2013
+Contact:	MyungJoo Ham <myungjoo.ham@samsung.com>
+Description:
+		The /sys/class/devfreq/.../max_freq shows and stores
+		the maximum frequency requested by users. It is 0 if
+		the user does not care. max_freq overrides the
+		frequency requested by governors and min_freq.
+		The max_freq overrides min_freq because max_freq may be
+		used to throttle devices to avoid overheating.
diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index abfa14dd8b4c..44c407986f64 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -527,6 +527,8 @@ EXPORT_SYMBOL(devfreq_add_device);
 /**
  * devfreq_remove_device() - Remove devfreq feature from a device.
  * @devfreq:	the devfreq instance to be removed
+ *
+ * The opposite of devfreq_add_device().
  */
 int devfreq_remove_device(struct devfreq *devfreq)
 {
@@ -542,6 +544,10 @@ EXPORT_SYMBOL(devfreq_remove_device);
 /**
  * devfreq_suspend_device() - Suspend devfreq of a device.
  * @devfreq: the devfreq instance to be suspended
+ *
+ * This function is intended to be called by the pm callbacks
+ * (e.g., runtime_suspend, suspend) of the device driver that
+ * holds the devfreq.
  */
 int devfreq_suspend_device(struct devfreq *devfreq)
 {
@@ -559,6 +565,10 @@ EXPORT_SYMBOL(devfreq_suspend_device);
 /**
  * devfreq_resume_device() - Resume devfreq of a device.
  * @devfreq: the devfreq instance to be resumed
+ *
+ * This function is intended to be called by the pm callbacks
+ * (e.g., runtime_resume, resume) of the device driver that
+ * holds the devfreq.
  */
 int devfreq_resume_device(struct devfreq *devfreq)
 {
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index fe8c4476f7e4..5f1ab92107e6 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -181,6 +181,8 @@ extern struct devfreq *devfreq_add_device(struct device *dev,
 				  const char *governor_name,
 				  void *data);
 extern int devfreq_remove_device(struct devfreq *devfreq);
+
+/* Supposed to be called by PM_SLEEP/PM_RUNTIME callbacks */
 extern int devfreq_suspend_device(struct devfreq *devfreq);
 extern int devfreq_resume_device(struct devfreq *devfreq);
 
-- 
cgit v1.3-7-g2ca7


From 6d481e2fd137e6c31e54318598a9c8ed1ebe280b Mon Sep 17 00:00:00 2001
From: Shane Huang <shane.huang@amd.com>
Date: Mon, 3 Jun 2013 18:22:28 +0800
Subject: PCI: Put Hudson-2 device IDs together

To put all AMD Hudson-2 device IDs together for better maintenance.

[bhelgaas: also sort Hudson-2 devices by ID]
Signed-off-by: Shane Huang <shane.huang@amd.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Acked-by: Jean Delvare <khali@linux-fr.org>
---
 include/linux/pci_ids.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c12916248469..84c0bcc2a147 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -556,7 +556,6 @@
 #define PCI_DEVICE_ID_AMD_8131_BRIDGE	0x7450
 #define PCI_DEVICE_ID_AMD_8131_APIC	0x7451
 #define PCI_DEVICE_ID_AMD_8132_BRIDGE	0x7458
-#define PCI_DEVICE_ID_AMD_HUDSON2_SMBUS	0x780b
 #define PCI_DEVICE_ID_AMD_CS5535_IDE    0x208F
 #define PCI_DEVICE_ID_AMD_CS5536_ISA    0x2090
 #define PCI_DEVICE_ID_AMD_CS5536_FLASH  0x2091
@@ -568,8 +567,9 @@
 #define PCI_DEVICE_ID_AMD_CS5536_IDE    0x209A
 #define PCI_DEVICE_ID_AMD_LX_VIDEO  0x2081
 #define PCI_DEVICE_ID_AMD_LX_AES    0x2082
-#define PCI_DEVICE_ID_AMD_HUDSON2_IDE		0x780c
 #define PCI_DEVICE_ID_AMD_HUDSON2_SATA_IDE	0x7800
+#define PCI_DEVICE_ID_AMD_HUDSON2_SMBUS		0x780b
+#define PCI_DEVICE_ID_AMD_HUDSON2_IDE		0x780c
 
 #define PCI_VENDOR_ID_TRIDENT		0x1023
 #define PCI_DEVICE_ID_TRIDENT_4DWAVE_DX	0x2000
-- 
cgit v1.3-7-g2ca7


From 45f0a85c8258741d11bda25c0a5669c06267204a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 3 Jun 2013 21:49:52 +0200
Subject: PM / Runtime: Rework the "runtime idle" helper routine

The "runtime idle" helper routine, rpm_idle(), currently ignores
return values from .runtime_idle() callbacks executed by it.
However, it turns out that many subsystems use
pm_generic_runtime_idle() which checks the return value of the
driver's callback and executes pm_runtime_suspend() for the device
unless that value is not 0.  If that logic is moved to rpm_idle()
instead, pm_generic_runtime_idle() can be dropped and its users
will not need any .runtime_idle() callbacks any more.

Moreover, the PCI, SCSI, and SATA subsystems' .runtime_idle()
routines, pci_pm_runtime_idle(), scsi_runtime_idle(), and
ata_port_runtime_idle(), respectively, as well as a few drivers'
ones may be simplified if rpm_idle() calls rpm_suspend() after 0 has
been returned by the .runtime_idle() callback executed by it.

To reduce overall code bloat, make the changes described above.

Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Tested-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Kevin Hilman <khilman@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
---
 Documentation/power/runtime_pm.txt |  5 -----
 arch/arm/mach-omap2/omap_device.c  |  7 +------
 drivers/acpi/device_pm.c           |  1 -
 drivers/amba/bus.c                 |  2 +-
 drivers/ata/libata-core.c          |  2 +-
 drivers/base/platform.c            |  1 -
 drivers/base/power/domain.c        |  1 -
 drivers/base/power/generic_ops.c   | 23 -----------------------
 drivers/base/power/runtime.c       | 12 +++++-------
 drivers/dma/intel_mid_dma.c        |  2 +-
 drivers/gpio/gpio-langwell.c       |  6 +-----
 drivers/i2c/i2c-core.c             |  2 +-
 drivers/mfd/ab8500-gpadc.c         |  8 +-------
 drivers/mmc/core/bus.c             |  2 +-
 drivers/mmc/core/sdio_bus.c        |  2 +-
 drivers/pci/pci-driver.c           | 14 +++++---------
 drivers/scsi/scsi_pm.c             | 11 +++--------
 drivers/sh/pm_runtime.c            |  2 +-
 drivers/spi/spi.c                  |  2 +-
 drivers/tty/serial/mfd.c           |  9 ++-------
 drivers/usb/core/driver.c          |  3 ++-
 drivers/usb/core/port.c            |  1 -
 include/linux/pm_runtime.h         |  2 --
 23 files changed, 28 insertions(+), 92 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/runtime_pm.txt b/Documentation/power/runtime_pm.txt
index 6c9f5d9aa115..6c470c71ba27 100644
--- a/Documentation/power/runtime_pm.txt
+++ b/Documentation/power/runtime_pm.txt
@@ -660,11 +660,6 @@ Subsystems may wish to conserve code space by using the set of generic power
 management callbacks provided by the PM core, defined in
 driver/base/power/generic_ops.c:
 
-  int pm_generic_runtime_idle(struct device *dev);
-    - invoke the ->runtime_idle() callback provided by the driver of this
-      device, if defined, and call pm_runtime_suspend() for this device if the
-      return value is 0 or the callback is not defined
-
   int pm_generic_runtime_suspend(struct device *dev);
     - invoke the ->runtime_suspend() callback provided by the driver of this
       device and return its result, or return -EINVAL if not defined
diff --git a/arch/arm/mach-omap2/omap_device.c b/arch/arm/mach-omap2/omap_device.c
index e6d230700b2b..e37feb2f05a3 100644
--- a/arch/arm/mach-omap2/omap_device.c
+++ b/arch/arm/mach-omap2/omap_device.c
@@ -591,11 +591,6 @@ static int _od_runtime_suspend(struct device *dev)
 	return ret;
 }
 
-static int _od_runtime_idle(struct device *dev)
-{
-	return pm_generic_runtime_idle(dev);
-}
-
 static int _od_runtime_resume(struct device *dev)
 {
 	struct platform_device *pdev = to_platform_device(dev);
@@ -653,7 +648,7 @@ static int _od_resume_noirq(struct device *dev)
 struct dev_pm_domain omap_device_pm_domain = {
 	.ops = {
 		SET_RUNTIME_PM_OPS(_od_runtime_suspend, _od_runtime_resume,
-				   _od_runtime_idle)
+				   NULL)
 		USE_PLATFORM_PM_SLEEP_OPS
 		.suspend_noirq = _od_suspend_noirq,
 		.resume_noirq = _od_resume_noirq,
diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index bc493aa3af19..1eb8b6258786 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -886,7 +886,6 @@ static struct dev_pm_domain acpi_general_pm_domain = {
 #ifdef CONFIG_PM_RUNTIME
 		.runtime_suspend = acpi_subsys_runtime_suspend,
 		.runtime_resume = acpi_subsys_runtime_resume,
-		.runtime_idle = pm_generic_runtime_idle,
 #endif
 #ifdef CONFIG_PM_SLEEP
 		.prepare = acpi_subsys_prepare,
diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c
index cdbad3a454a0..c6707278a6bb 100644
--- a/drivers/amba/bus.c
+++ b/drivers/amba/bus.c
@@ -284,7 +284,7 @@ static const struct dev_pm_ops amba_pm = {
 	SET_RUNTIME_PM_OPS(
 		amba_pm_runtime_suspend,
 		amba_pm_runtime_resume,
-		pm_generic_runtime_idle
+		NULL
 	)
 };
 
diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 63c743baf920..84e3b62aa368 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5430,7 +5430,7 @@ static int ata_port_runtime_idle(struct device *dev)
 				return -EBUSY;
 	}
 
-	return pm_runtime_suspend(dev);
+	return 0;
 }
 
 static int ata_port_runtime_suspend(struct device *dev)
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 9eda84246ffd..96a930387ebc 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -888,7 +888,6 @@ int platform_pm_restore(struct device *dev)
 static const struct dev_pm_ops platform_dev_pm_ops = {
 	.runtime_suspend = pm_generic_runtime_suspend,
 	.runtime_resume = pm_generic_runtime_resume,
-	.runtime_idle = pm_generic_runtime_idle,
 	USE_PLATFORM_PM_SLEEP_OPS
 };
 
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 7072404c8b6d..bfb8955c406c 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2143,7 +2143,6 @@ void pm_genpd_init(struct generic_pm_domain *genpd,
 	genpd->max_off_time_changed = true;
 	genpd->domain.ops.runtime_suspend = pm_genpd_runtime_suspend;
 	genpd->domain.ops.runtime_resume = pm_genpd_runtime_resume;
-	genpd->domain.ops.runtime_idle = pm_generic_runtime_idle;
 	genpd->domain.ops.prepare = pm_genpd_prepare;
 	genpd->domain.ops.suspend = pm_genpd_suspend;
 	genpd->domain.ops.suspend_late = pm_genpd_suspend_late;
diff --git a/drivers/base/power/generic_ops.c b/drivers/base/power/generic_ops.c
index bfd898b8988e..5ee030a864f9 100644
--- a/drivers/base/power/generic_ops.c
+++ b/drivers/base/power/generic_ops.c
@@ -11,29 +11,6 @@
 #include <linux/export.h>
 
 #ifdef CONFIG_PM_RUNTIME
-/**
- * pm_generic_runtime_idle - Generic runtime idle callback for subsystems.
- * @dev: Device to handle.
- *
- * If PM operations are defined for the @dev's driver and they include
- * ->runtime_idle(), execute it and return its error code, if nonzero.
- * Otherwise, execute pm_runtime_suspend() for the device and return 0.
- */
-int pm_generic_runtime_idle(struct device *dev)
-{
-	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
-
-	if (pm && pm->runtime_idle) {
-		int ret = pm->runtime_idle(dev);
-		if (ret)
-			return ret;
-	}
-
-	pm_runtime_suspend(dev);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(pm_generic_runtime_idle);
-
 /**
  * pm_generic_runtime_suspend - Generic runtime suspend callback for subsystems.
  * @dev: Device to suspend.
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index ef13ad08afb2..268a35097578 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -293,11 +293,8 @@ static int rpm_idle(struct device *dev, int rpmflags)
 	/* Pending requests need to be canceled. */
 	dev->power.request = RPM_REQ_NONE;
 
-	if (dev->power.no_callbacks) {
-		/* Assume ->runtime_idle() callback would have suspended. */
-		retval = rpm_suspend(dev, rpmflags);
+	if (dev->power.no_callbacks)
 		goto out;
-	}
 
 	/* Carry out an asynchronous or a synchronous idle notification. */
 	if (rpmflags & RPM_ASYNC) {
@@ -306,7 +303,8 @@ static int rpm_idle(struct device *dev, int rpmflags)
 			dev->power.request_pending = true;
 			queue_work(pm_wq, &dev->power.work);
 		}
-		goto out;
+		trace_rpm_return_int(dev, _THIS_IP_, 0);
+		return 0;
 	}
 
 	dev->power.idle_notification = true;
@@ -326,14 +324,14 @@ static int rpm_idle(struct device *dev, int rpmflags)
 		callback = dev->driver->pm->runtime_idle;
 
 	if (callback)
-		__rpm_callback(callback, dev);
+		retval = __rpm_callback(callback, dev);
 
 	dev->power.idle_notification = false;
 	wake_up_all(&dev->power.wait_queue);
 
  out:
 	trace_rpm_return_int(dev, _THIS_IP_, retval);
-	return retval;
+	return retval ? retval : rpm_suspend(dev, rpmflags);
 }
 
 /**
diff --git a/drivers/dma/intel_mid_dma.c b/drivers/dma/intel_mid_dma.c
index a0de82e21a7c..a975ebebea8a 100644
--- a/drivers/dma/intel_mid_dma.c
+++ b/drivers/dma/intel_mid_dma.c
@@ -1405,7 +1405,7 @@ static int dma_runtime_idle(struct device *dev)
 			return -EAGAIN;
 	}
 
-	return pm_schedule_suspend(dev, 0);
+	return 0;
 }
 
 /******************************************************************************
diff --git a/drivers/gpio/gpio-langwell.c b/drivers/gpio/gpio-langwell.c
index 62ef10a641c4..89d0d2a3b1bb 100644
--- a/drivers/gpio/gpio-langwell.c
+++ b/drivers/gpio/gpio-langwell.c
@@ -305,11 +305,7 @@ static const struct irq_domain_ops lnw_gpio_irq_ops = {
 
 static int lnw_gpio_runtime_idle(struct device *dev)
 {
-	int err = pm_schedule_suspend(dev, 500);
-
-	if (!err)
-		return 0;
-
+	pm_schedule_suspend(dev, 500);
 	return -EBUSY;
 }
 
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 48e31ed69dbf..f32ca293ae0e 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -435,7 +435,7 @@ static const struct dev_pm_ops i2c_device_pm_ops = {
 	SET_RUNTIME_PM_OPS(
 		pm_generic_runtime_suspend,
 		pm_generic_runtime_resume,
-		pm_generic_runtime_idle
+		NULL
 	)
 };
 
diff --git a/drivers/mfd/ab8500-gpadc.c b/drivers/mfd/ab8500-gpadc.c
index 13f7866de46e..3598b0ecf8c7 100644
--- a/drivers/mfd/ab8500-gpadc.c
+++ b/drivers/mfd/ab8500-gpadc.c
@@ -886,12 +886,6 @@ static int ab8500_gpadc_runtime_resume(struct device *dev)
 	return ret;
 }
 
-static int ab8500_gpadc_runtime_idle(struct device *dev)
-{
-	pm_runtime_suspend(dev);
-	return 0;
-}
-
 static int ab8500_gpadc_suspend(struct device *dev)
 {
 	struct ab8500_gpadc *gpadc = dev_get_drvdata(dev);
@@ -1039,7 +1033,7 @@ static int ab8500_gpadc_remove(struct platform_device *pdev)
 static const struct dev_pm_ops ab8500_gpadc_pm_ops = {
 	SET_RUNTIME_PM_OPS(ab8500_gpadc_runtime_suspend,
 			   ab8500_gpadc_runtime_resume,
-			   ab8500_gpadc_runtime_idle)
+			   NULL)
 	SET_SYSTEM_SLEEP_PM_OPS(ab8500_gpadc_suspend,
 				ab8500_gpadc_resume)
 
diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c
index e219c97a02a4..9d5c71125576 100644
--- a/drivers/mmc/core/bus.c
+++ b/drivers/mmc/core/bus.c
@@ -164,7 +164,7 @@ static int mmc_runtime_resume(struct device *dev)
 
 static int mmc_runtime_idle(struct device *dev)
 {
-	return pm_runtime_suspend(dev);
+	return 0;
 }
 
 #endif /* !CONFIG_PM_RUNTIME */
diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c
index 546c67c2bbbf..6d67492a9247 100644
--- a/drivers/mmc/core/sdio_bus.c
+++ b/drivers/mmc/core/sdio_bus.c
@@ -211,7 +211,7 @@ static const struct dev_pm_ops sdio_bus_pm_ops = {
 	SET_RUNTIME_PM_OPS(
 		pm_generic_runtime_suspend,
 		pm_generic_runtime_resume,
-		pm_generic_runtime_idle
+		NULL
 	)
 };
 
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 79277fb36c6b..e6515e21afa3 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -1050,26 +1050,22 @@ static int pci_pm_runtime_idle(struct device *dev)
 {
 	struct pci_dev *pci_dev = to_pci_dev(dev);
 	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+	int ret = 0;
 
 	/*
 	 * If pci_dev->driver is not set (unbound), the device should
 	 * always remain in D0 regardless of the runtime PM status
 	 */
 	if (!pci_dev->driver)
-		goto out;
+		return 0;
 
 	if (!pm)
 		return -ENOSYS;
 
-	if (pm->runtime_idle) {
-		int ret = pm->runtime_idle(dev);
-		if (ret)
-			return ret;
-	}
+	if (pm->runtime_idle)
+		ret = pm->runtime_idle(dev);
 
-out:
-	pm_runtime_suspend(dev);
-	return 0;
+	return ret;
 }
 
 #else /* !CONFIG_PM_RUNTIME */
diff --git a/drivers/scsi/scsi_pm.c b/drivers/scsi/scsi_pm.c
index 42539ee2cb11..4c5aabe21755 100644
--- a/drivers/scsi/scsi_pm.c
+++ b/drivers/scsi/scsi_pm.c
@@ -229,8 +229,6 @@ static int scsi_runtime_resume(struct device *dev)
 
 static int scsi_runtime_idle(struct device *dev)
 {
-	int err;
-
 	dev_dbg(dev, "scsi_runtime_idle\n");
 
 	/* Insert hooks here for targets, hosts, and transport classes */
@@ -240,14 +238,11 @@ static int scsi_runtime_idle(struct device *dev)
 
 		if (sdev->request_queue->dev) {
 			pm_runtime_mark_last_busy(dev);
-			err = pm_runtime_autosuspend(dev);
-		} else {
-			err = pm_runtime_suspend(dev);
+			pm_runtime_autosuspend(dev);
+			return -EBUSY;
 		}
-	} else {
-		err = pm_runtime_suspend(dev);
 	}
-	return err;
+	return 0;
 }
 
 int scsi_autopm_get_device(struct scsi_device *sdev)
diff --git a/drivers/sh/pm_runtime.c b/drivers/sh/pm_runtime.c
index afe9282629b9..8afa5a4589f2 100644
--- a/drivers/sh/pm_runtime.c
+++ b/drivers/sh/pm_runtime.c
@@ -25,7 +25,7 @@
 static int default_platform_runtime_idle(struct device *dev)
 {
 	/* suspend synchronously to disable clocks immediately */
-	return pm_runtime_suspend(dev);
+	return 0;
 }
 
 static struct dev_pm_domain default_pm_domain = {
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index 32b7bb111eb6..095cfaded1c0 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -223,7 +223,7 @@ static const struct dev_pm_ops spi_pm = {
 	SET_RUNTIME_PM_OPS(
 		pm_generic_runtime_suspend,
 		pm_generic_runtime_resume,
-		pm_generic_runtime_idle
+		NULL
 	)
 };
 
diff --git a/drivers/tty/serial/mfd.c b/drivers/tty/serial/mfd.c
index 5f4765a7a5c5..5dfcf3bae23a 100644
--- a/drivers/tty/serial/mfd.c
+++ b/drivers/tty/serial/mfd.c
@@ -1248,13 +1248,8 @@ static int serial_hsu_resume(struct pci_dev *pdev)
 #ifdef CONFIG_PM_RUNTIME
 static int serial_hsu_runtime_idle(struct device *dev)
 {
-	int err;
-
-	err = pm_schedule_suspend(dev, 500);
-	if (err)
-		return -EBUSY;
-
-	return 0;
+	pm_schedule_suspend(dev, 500);
+	return -EBUSY;
 }
 
 static int serial_hsu_runtime_suspend(struct device *dev)
diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c
index 6eab440e1542..7609ac4aed1c 100644
--- a/drivers/usb/core/driver.c
+++ b/drivers/usb/core/driver.c
@@ -1765,7 +1765,8 @@ int usb_runtime_idle(struct device *dev)
 	 */
 	if (autosuspend_check(udev) == 0)
 		pm_runtime_autosuspend(dev);
-	return 0;
+	/* Tell the core not to suspend it, though. */
+	return -EBUSY;
 }
 
 int usb_set_usb2_hardware_lpm(struct usb_device *udev, int enable)
diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c
index b8bad294eeb8..8c1b2c509467 100644
--- a/drivers/usb/core/port.c
+++ b/drivers/usb/core/port.c
@@ -141,7 +141,6 @@ static const struct dev_pm_ops usb_port_pm_ops = {
 #ifdef CONFIG_PM_RUNTIME
 	.runtime_suspend =	usb_port_runtime_suspend,
 	.runtime_resume =	usb_port_runtime_resume,
-	.runtime_idle =		pm_generic_runtime_idle,
 #endif
 };
 
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 7d7e09efff9b..6fa7cea25da9 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -37,7 +37,6 @@ extern void pm_runtime_enable(struct device *dev);
 extern void __pm_runtime_disable(struct device *dev, bool check_resume);
 extern void pm_runtime_allow(struct device *dev);
 extern void pm_runtime_forbid(struct device *dev);
-extern int pm_generic_runtime_idle(struct device *dev);
 extern int pm_generic_runtime_suspend(struct device *dev);
 extern int pm_generic_runtime_resume(struct device *dev);
 extern void pm_runtime_no_callbacks(struct device *dev);
@@ -143,7 +142,6 @@ static inline bool pm_runtime_active(struct device *dev) { return true; }
 static inline bool pm_runtime_status_suspended(struct device *dev) { return false; }
 static inline bool pm_runtime_enabled(struct device *dev) { return false; }
 
-static inline int pm_generic_runtime_idle(struct device *dev) { return 0; }
 static inline int pm_generic_runtime_suspend(struct device *dev) { return 0; }
 static inline int pm_generic_runtime_resume(struct device *dev) { return 0; }
 static inline void pm_runtime_no_callbacks(struct device *dev) {}
-- 
cgit v1.3-7-g2ca7


From 215e262f2aeba378aa192da07c30770f9925a4bf Mon Sep 17 00:00:00 2001
From: Kent Overstreet <koverstreet@google.com>
Date: Fri, 31 May 2013 15:26:45 -0700
Subject: percpu: implement generic percpu refcounting

This implements a refcount with similar semantics to
atomic_get()/atomic_dec_and_test() - but percpu.

It also implements two stage shutdown, as we need it to tear down the
percpu counts.  Before dropping the initial refcount, you must call
percpu_ref_kill(); this puts the refcount in "shutting down mode" and
switches back to a single atomic refcount with the appropriate
barriers (synchronize_rcu()).

It's also legal to call percpu_ref_kill() multiple times - it only
returns true once, so callers don't have to reimplement shutdown
synchronization.

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: coding-style tweak]
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Zach Brown <zab@redhat.com>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Asai Thambi S P <asamymuthupa@micron.com>
Cc: Selvan Mani <smani@micron.com>
Cc: Sam Bradshaw <sbradshaw@micron.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Reviewed-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/percpu-refcount.h | 122 ++++++++++++++++++++++++++++++++++++++
 lib/Makefile                    |   2 +-
 lib/percpu-refcount.c           | 128 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 251 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/percpu-refcount.h
 create mode 100644 lib/percpu-refcount.c

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
new file mode 100644
index 000000000000..24b31ef15932
--- /dev/null
+++ b/include/linux/percpu-refcount.h
@@ -0,0 +1,122 @@
+/*
+ * Percpu refcounts:
+ * (C) 2012 Google, Inc.
+ * Author: Kent Overstreet <koverstreet@google.com>
+ *
+ * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
+ * atomic_dec_and_test() - but percpu.
+ *
+ * There's one important difference between percpu refs and normal atomic_t
+ * refcounts; you have to keep track of your initial refcount, and then when you
+ * start shutting down you call percpu_ref_kill() _before_ dropping the initial
+ * refcount.
+ *
+ * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
+ * than an atomic_t - this is because of the way shutdown works, see
+ * percpu_ref_kill()/PCPU_COUNT_BIAS.
+ *
+ * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
+ * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
+ * puts the ref back in single atomic_t mode, collecting the per cpu refs and
+ * issuing the appropriate barriers, and then marks the ref as shutting down so
+ * that percpu_ref_put() will check for the ref hitting 0.  After it returns,
+ * it's safe to drop the initial ref.
+ *
+ * USAGE:
+ *
+ * See fs/aio.c for some example usage; it's used there for struct kioctx, which
+ * is created when userspaces calls io_setup(), and destroyed when userspace
+ * calls io_destroy() or the process exits.
+ *
+ * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
+ * calls percpu_ref_kill(), then hlist_del_rcu() and sychronize_rcu() to remove
+ * the kioctx from the proccess's list of kioctxs - after that, there can't be
+ * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
+ * the initial ref with percpu_ref_put().
+ *
+ * Code that does a two stage shutdown like this often needs some kind of
+ * explicit synchronization to ensure the initial refcount can only be dropped
+ * once - percpu_ref_kill() does this for you, it returns true once and false if
+ * someone else already called it. The aio code uses it this way, but it's not
+ * necessary if the code has some other mechanism to synchronize teardown.
+ * around.
+ */
+
+#ifndef _LINUX_PERCPU_REFCOUNT_H
+#define _LINUX_PERCPU_REFCOUNT_H
+
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+
+struct percpu_ref;
+typedef void (percpu_ref_release)(struct percpu_ref *);
+
+struct percpu_ref {
+	atomic_t		count;
+	/*
+	 * The low bit of the pointer indicates whether the ref is in percpu
+	 * mode; if set, then get/put will manipulate the atomic_t (this is a
+	 * hack because we need to keep the pointer around for
+	 * percpu_ref_kill_rcu())
+	 */
+	unsigned __percpu	*pcpu_count;
+	percpu_ref_release	*release;
+	struct rcu_head		rcu;
+};
+
+int percpu_ref_init(struct percpu_ref *, percpu_ref_release *);
+void percpu_ref_kill(struct percpu_ref *ref);
+
+#define PCPU_STATUS_BITS	2
+#define PCPU_STATUS_MASK	((1 << PCPU_STATUS_BITS) - 1)
+#define PCPU_REF_PTR		0
+#define PCPU_REF_DEAD		1
+
+#define REF_STATUS(count)	(((unsigned long) count) & PCPU_STATUS_MASK)
+
+/**
+ * percpu_ref_get - increment a percpu refcount
+ *
+ * Analagous to atomic_inc().
+  */
+static inline void percpu_ref_get(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count;
+
+	preempt_disable();
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
+		__this_cpu_inc(*pcpu_count);
+	else
+		atomic_inc(&ref->count);
+
+	preempt_enable();
+}
+
+/**
+ * percpu_ref_put - decrement a percpu refcount
+ *
+ * Decrement the refcount, and if 0, call the release function (which was passed
+ * to percpu_ref_init())
+ */
+static inline void percpu_ref_put(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count;
+
+	preempt_disable();
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR))
+		__this_cpu_dec(*pcpu_count);
+	else if (unlikely(atomic_dec_and_test(&ref->count)))
+		ref->release(ref);
+
+	preempt_enable();
+}
+
+#endif
diff --git a/lib/Makefile b/lib/Makefile
index c55a037a354e..386db4bbc265 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o
+	 earlycpio.o percpu-refcount.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
new file mode 100644
index 000000000000..6f0ffd702a09
--- /dev/null
+++ b/lib/percpu-refcount.c
@@ -0,0 +1,128 @@
+#define pr_fmt(fmt) "%s: " fmt "\n", __func__
+
+#include <linux/kernel.h>
+#include <linux/percpu-refcount.h>
+
+/*
+ * Initially, a percpu refcount is just a set of percpu counters. Initially, we
+ * don't try to detect the ref hitting 0 - which means that get/put can just
+ * increment or decrement the local counter. Note that the counter on a
+ * particular cpu can (and will) wrap - this is fine, when we go to shutdown the
+ * percpu counters will all sum to the correct value
+ *
+ * (More precisely: because moduler arithmatic is commutative the sum of all the
+ * pcpu_count vars will be equal to what it would have been if all the gets and
+ * puts were done to a single integer, even if some of the percpu integers
+ * overflow or underflow).
+ *
+ * The real trick to implementing percpu refcounts is shutdown. We can't detect
+ * the ref hitting 0 on every put - this would require global synchronization
+ * and defeat the whole purpose of using percpu refs.
+ *
+ * What we do is require the user to keep track of the initial refcount; we know
+ * the ref can't hit 0 before the user drops the initial ref, so as long as we
+ * convert to non percpu mode before the initial ref is dropped everything
+ * works.
+ *
+ * Converting to non percpu mode is done with some RCUish stuff in
+ * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t
+ * can't hit 0 before we've added up all the percpu refs.
+ */
+
+#define PCPU_COUNT_BIAS		(1U << 31)
+
+/**
+ * percpu_ref_init - initialize a percpu refcount
+ * @ref:	ref to initialize
+ * @release:	function which will be called when refcount hits 0
+ *
+ * Initializes the refcount in single atomic counter mode with a refcount of 1;
+ * analagous to atomic_set(ref, 1).
+ *
+ * Note that @release must not sleep - it may potentially be called from RCU
+ * callback context by percpu_ref_kill().
+ */
+int percpu_ref_init(struct percpu_ref *ref, percpu_ref_release *release)
+{
+	atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS);
+
+	ref->pcpu_count = alloc_percpu(unsigned);
+	if (!ref->pcpu_count)
+		return -ENOMEM;
+
+	ref->release = release;
+	return 0;
+}
+
+static void percpu_ref_kill_rcu(struct rcu_head *rcu)
+{
+	struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
+	unsigned __percpu *pcpu_count;
+	unsigned count = 0;
+	int cpu;
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	/* Mask out PCPU_REF_DEAD */
+	pcpu_count = (unsigned __percpu *)
+		(((unsigned long) pcpu_count) & ~PCPU_STATUS_MASK);
+
+	for_each_possible_cpu(cpu)
+		count += *per_cpu_ptr(pcpu_count, cpu);
+
+	free_percpu(pcpu_count);
+
+	pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count);
+
+	/*
+	 * It's crucial that we sum the percpu counters _before_ adding the sum
+	 * to &ref->count; since gets could be happening on one cpu while puts
+	 * happen on another, adding a single cpu's count could cause
+	 * @ref->count to hit 0 before we've got a consistent value - but the
+	 * sum of all the counts will be consistent and correct.
+	 *
+	 * Subtracting the bias value then has to happen _after_ adding count to
+	 * &ref->count; we need the bias value to prevent &ref->count from
+	 * reaching 0 before we add the percpu counts. But doing it at the same
+	 * time is equivalent and saves us atomic operations:
+	 */
+
+	atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count);
+
+	/*
+	 * Now we're in single atomic_t mode with a consistent refcount, so it's
+	 * safe to drop our initial ref:
+	 */
+	percpu_ref_put(ref);
+}
+
+/**
+ * percpu_ref_kill - safely drop initial ref
+ *
+ * Must be used to drop the initial ref on a percpu refcount; must be called
+ * precisely once before shutdown.
+ *
+ * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
+ * percpu counters and dropping the initial ref.
+ */
+void percpu_ref_kill(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count, *old, *new;
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	do {
+		if (REF_STATUS(pcpu_count) == PCPU_REF_DEAD) {
+			WARN(1, "percpu_ref_kill() called more than once!\n");
+			return;
+		}
+
+		old = pcpu_count;
+		new = (unsigned __percpu *)
+			(((unsigned long) pcpu_count)|PCPU_REF_DEAD);
+
+		pcpu_count = cmpxchg(&ref->pcpu_count, old, new);
+	} while (pcpu_count != old);
+
+	call_rcu(&ref->rcu, percpu_ref_kill_rcu);
+}
-- 
cgit v1.3-7-g2ca7


From 6ea34c9b78c10289846db0abeebd6b84d5aca084 Mon Sep 17 00:00:00 2001
From: Amos Kong <akong@redhat.com>
Date: Sat, 25 May 2013 06:44:15 +0800
Subject: kvm: exclude ioeventfd from counting kvm_io_range limit

We can easily reach the 1000 limit by start VM with a couple
hundred I/O devices (multifunction=on). The hardcode limit
already been adjusted 3 times (6 ~ 200 ~ 300 ~ 1000).

In userspace, we already have maximum file descriptor to
limit ioeventfd count. But kvm_io_bus devices also are used
for pit, pic, ioapic, coalesced_mmio. They couldn't be limited
by maximum file descriptor.

Currently only ioeventfds take too much kvm_io_bus devices,
so just exclude it from counting kvm_io_range limit.

Also fixed one indent issue in kvm_host.h

Signed-off-by: Amos Kong <akong@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
---
 include/linux/kvm_host.h | 3 ++-
 virt/kvm/eventfd.c       | 2 ++
 virt/kvm/kvm_main.c      | 3 ++-
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d9a3c30eab2e..e3aae6db276f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -145,7 +145,8 @@ struct kvm_io_range {
 #define NR_IOBUS_DEVS 1000
 
 struct kvm_io_bus {
-	int                   dev_count;
+	int dev_count;
+	int ioeventfd_count;
 	struct kvm_io_range range[];
 };
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 64ee720b75c7..1550637d1b10 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -753,6 +753,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 	if (ret < 0)
 		goto unlock_fail;
 
+	kvm->buses[bus_idx]->ioeventfd_count++;
 	list_add_tail(&p->list, &kvm->ioeventfds);
 
 	mutex_unlock(&kvm->slots_lock);
@@ -798,6 +799,7 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 			continue;
 
 		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
+		kvm->buses[bus_idx]->ioeventfd_count--;
 		ioeventfd_release(p);
 		ret = 0;
 		break;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b547a1ceecbc..1580dd4ace4e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2926,7 +2926,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	struct kvm_io_bus *new_bus, *bus;
 
 	bus = kvm->buses[bus_idx];
-	if (bus->dev_count > NR_IOBUS_DEVS - 1)
+	/* exclude ioeventfd which is limited by maximum fd */
+	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
 		return -ENOSPC;
 
 	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
-- 
cgit v1.3-7-g2ca7


From 5070158804b5339c71809f5e673cea1cfacd804d Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Sat, 30 Mar 2013 16:25:15 +0530
Subject: cpufreq: rename index as driver_data in cpufreq_frequency_table

The "index" field of struct cpufreq_frequency_table was never an
index and isn't used at all by the cpufreq core.  It only is useful
for cpufreq drivers for their internal purposes.

Many people nowadays blindly set it in ascending order with the
assumption that the core will use it, which is a mistake.

Rename it to "driver_data" as that's what its purpose is. All of its
users are updated accordingly.

[rjw: Changelog]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/cpu-drivers.txt             |  10 +-
 arch/arm/mach-davinci/da850.c                      |   8 +-
 arch/arm/mach-s3c24xx/cpufreq-utils.c              |   2 +-
 arch/arm/mach-s3c24xx/cpufreq.c                    |   4 +-
 arch/arm/mach-s3c24xx/pll-s3c2410.c                |  54 +++++-----
 arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c       |  54 +++++-----
 arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c       | 110 ++++++++++-----------
 arch/arm/mach-shmobile/clock-sh7372.c              |   6 +-
 arch/arm/plat-samsung/include/plat/cpu-freq-core.h |   2 +-
 arch/mips/loongson/lemote-2f/clock.c               |   3 +-
 arch/powerpc/platforms/pasemi/cpufreq.c            |   5 +-
 drivers/base/power/opp.c                           |   4 +-
 drivers/cpufreq/acpi-cpufreq.c                     |   6 +-
 drivers/cpufreq/blackfin-cpufreq.c                 |  10 +-
 drivers/cpufreq/e_powersaver.c                     |   8 +-
 drivers/cpufreq/freq_table.c                       |  26 ++---
 drivers/cpufreq/ia64-acpi-cpufreq.c                |   2 +-
 drivers/cpufreq/kirkwood-cpufreq.c                 |   2 +-
 drivers/cpufreq/longhaul.c                         |  16 +--
 drivers/cpufreq/loongson2_cpufreq.c                |   2 +-
 drivers/cpufreq/p4-clockmod.c                      |   4 +-
 drivers/cpufreq/powernow-k6.c                      |   8 +-
 drivers/cpufreq/powernow-k7.c                      |  16 +--
 drivers/cpufreq/powernow-k8.c                      |  18 ++--
 drivers/cpufreq/ppc_cbe_cpufreq.c                  |   4 +-
 drivers/cpufreq/pxa2xx-cpufreq.c                   |   8 +-
 drivers/cpufreq/pxa3xx-cpufreq.c                   |   4 +-
 drivers/cpufreq/s3c2416-cpufreq.c                  |   2 +-
 drivers/cpufreq/s3c64xx-cpufreq.c                  |   2 +-
 drivers/cpufreq/sc520_freq.c                       |   2 +-
 drivers/cpufreq/sparc-us2e-cpufreq.c               |  12 +--
 drivers/cpufreq/sparc-us3-cpufreq.c                |   8 +-
 drivers/cpufreq/spear-cpufreq.c                    |   4 +-
 drivers/cpufreq/speedstep-centrino.c               |   8 +-
 drivers/mfd/db8500-prcmu.c                         |  10 +-
 drivers/sh/clk/core.c                              |   4 +-
 include/linux/cpufreq.h                            |   2 +-
 37 files changed, 223 insertions(+), 227 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpu-freq/cpu-drivers.txt b/Documentation/cpu-freq/cpu-drivers.txt
index a3585eac83b6..19fa98e07bf7 100644
--- a/Documentation/cpu-freq/cpu-drivers.txt
+++ b/Documentation/cpu-freq/cpu-drivers.txt
@@ -186,7 +186,7 @@ As most cpufreq processors only allow for being set to a few specific
 frequencies, a "frequency table" with some functions might assist in
 some work of the processor driver. Such a "frequency table" consists
 of an array of struct cpufreq_frequency_table entries, with any value in
-"index" you want to use, and the corresponding frequency in
+"driver_data" you want to use, and the corresponding frequency in
 "frequency". At the end of the table, you need to add a
 cpufreq_frequency_table entry with frequency set to CPUFREQ_TABLE_END. And
 if you want to skip one entry in the table, set the frequency to 
@@ -214,10 +214,4 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
 is the corresponding frequency table helper for the ->target
 stage. Just pass the values to this function, and the unsigned int
 index returns the number of the frequency table entry which contains
-the frequency the CPU shall be set to. PLEASE NOTE: This is not the
-"index" which is in this cpufreq_table_entry.index, but instead
-cpufreq_table[index]. So, the new frequency is
-cpufreq_table[index].frequency, and the value you stored into the
-frequency table "index" field is
-cpufreq_table[index].index.
-
+the frequency the CPU shall be set to.
diff --git a/arch/arm/mach-davinci/da850.c b/arch/arm/mach-davinci/da850.c
index 4d6933848abf..a0d4f6038b60 100644
--- a/arch/arm/mach-davinci/da850.c
+++ b/arch/arm/mach-davinci/da850.c
@@ -1004,7 +1004,7 @@ static const struct da850_opp da850_opp_96 = {
 
 #define OPP(freq) 		\
 	{				\
-		.index = (unsigned int) &da850_opp_##freq,	\
+		.driver_data = (unsigned int) &da850_opp_##freq,	\
 		.frequency = freq * 1000, \
 	}
 
@@ -1016,7 +1016,7 @@ static struct cpufreq_frequency_table da850_freq_table[] = {
 	OPP(200),
 	OPP(96),
 	{
-		.index		= 0,
+		.driver_data		= 0,
 		.frequency	= CPUFREQ_TABLE_END,
 	},
 };
@@ -1044,7 +1044,7 @@ static int da850_set_voltage(unsigned int index)
 	if (!cvdd)
 		return -ENODEV;
 
-	opp = (struct da850_opp *) cpufreq_info.freq_table[index].index;
+	opp = (struct da850_opp *) cpufreq_info.freq_table[index].driver_data;
 
 	return regulator_set_voltage(cvdd, opp->cvdd_min, opp->cvdd_max);
 }
@@ -1125,7 +1125,7 @@ static int da850_set_pll0rate(struct clk *clk, unsigned long index)
 	struct pll_data *pll = clk->pll_data;
 	int ret;
 
-	opp = (struct da850_opp *) cpufreq_info.freq_table[index].index;
+	opp = (struct da850_opp *) cpufreq_info.freq_table[index].driver_data;
 	prediv = opp->prediv;
 	mult = opp->mult;
 	postdiv = opp->postdiv;
diff --git a/arch/arm/mach-s3c24xx/cpufreq-utils.c b/arch/arm/mach-s3c24xx/cpufreq-utils.c
index ddd8280e3875..2a0aa5684e72 100644
--- a/arch/arm/mach-s3c24xx/cpufreq-utils.c
+++ b/arch/arm/mach-s3c24xx/cpufreq-utils.c
@@ -60,5 +60,5 @@ void s3c2410_cpufreq_setrefresh(struct s3c_cpufreq_config *cfg)
  */
 void s3c2410_set_fvco(struct s3c_cpufreq_config *cfg)
 {
-	__raw_writel(cfg->pll.index, S3C2410_MPLLCON);
+	__raw_writel(cfg->pll.driver_data, S3C2410_MPLLCON);
 }
diff --git a/arch/arm/mach-s3c24xx/cpufreq.c b/arch/arm/mach-s3c24xx/cpufreq.c
index 3c0e78ede0da..3513e7477160 100644
--- a/arch/arm/mach-s3c24xx/cpufreq.c
+++ b/arch/arm/mach-s3c24xx/cpufreq.c
@@ -70,7 +70,7 @@ static void s3c_cpufreq_getcur(struct s3c_cpufreq_config *cfg)
 	cfg->freq.pclk = pclk = clk_get_rate(clk_pclk);
 	cfg->freq.armclk = armclk = clk_get_rate(clk_arm);
 
-	cfg->pll.index = __raw_readl(S3C2410_MPLLCON);
+	cfg->pll.driver_data = __raw_readl(S3C2410_MPLLCON);
 	cfg->pll.frequency = fclk;
 
 	cfg->freq.hclk_tns = 1000000000 / (cfg->freq.hclk / 10);
@@ -431,7 +431,7 @@ static unsigned int suspend_freq;
 static int s3c_cpufreq_suspend(struct cpufreq_policy *policy)
 {
 	suspend_pll.frequency = clk_get_rate(_clk_mpll);
-	suspend_pll.index = __raw_readl(S3C2410_MPLLCON);
+	suspend_pll.driver_data = __raw_readl(S3C2410_MPLLCON);
 	suspend_freq = s3c_cpufreq_get(0) * 1000;
 
 	return 0;
diff --git a/arch/arm/mach-s3c24xx/pll-s3c2410.c b/arch/arm/mach-s3c24xx/pll-s3c2410.c
index dcf3420a3271..5e37d368594b 100644
--- a/arch/arm/mach-s3c24xx/pll-s3c2410.c
+++ b/arch/arm/mach-s3c24xx/pll-s3c2410.c
@@ -33,36 +33,36 @@
 #include <plat/cpu-freq-core.h>
 
 static struct cpufreq_frequency_table pll_vals_12MHz[] = {
-    { .frequency = 34000000,  .index = PLLVAL(82, 2, 3),   },
-    { .frequency = 45000000,  .index = PLLVAL(82, 1, 3),   },
-    { .frequency = 51000000,  .index = PLLVAL(161, 3, 3),  },
-    { .frequency = 48000000,  .index = PLLVAL(120, 2, 3),  },
-    { .frequency = 56000000,  .index = PLLVAL(142, 2, 3),  },
-    { .frequency = 68000000,  .index = PLLVAL(82, 2, 2),   },
-    { .frequency = 79000000,  .index = PLLVAL(71, 1, 2),   },
-    { .frequency = 85000000,  .index = PLLVAL(105, 2, 2),  },
-    { .frequency = 90000000,  .index = PLLVAL(112, 2, 2),  },
-    { .frequency = 101000000, .index = PLLVAL(127, 2, 2),  },
-    { .frequency = 113000000, .index = PLLVAL(105, 1, 2),  },
-    { .frequency = 118000000, .index = PLLVAL(150, 2, 2),  },
-    { .frequency = 124000000, .index = PLLVAL(116, 1, 2),  },
-    { .frequency = 135000000, .index = PLLVAL(82, 2, 1),   },
-    { .frequency = 147000000, .index = PLLVAL(90, 2, 1),   },
-    { .frequency = 152000000, .index = PLLVAL(68, 1, 1),   },
-    { .frequency = 158000000, .index = PLLVAL(71, 1, 1),   },
-    { .frequency = 170000000, .index = PLLVAL(77, 1, 1),   },
-    { .frequency = 180000000, .index = PLLVAL(82, 1, 1),   },
-    { .frequency = 186000000, .index = PLLVAL(85, 1, 1),   },
-    { .frequency = 192000000, .index = PLLVAL(88, 1, 1),   },
-    { .frequency = 203000000, .index = PLLVAL(161, 3, 1),  },
+    { .frequency = 34000000,  .driver_data = PLLVAL(82, 2, 3),   },
+    { .frequency = 45000000,  .driver_data = PLLVAL(82, 1, 3),   },
+    { .frequency = 51000000,  .driver_data = PLLVAL(161, 3, 3),  },
+    { .frequency = 48000000,  .driver_data = PLLVAL(120, 2, 3),  },
+    { .frequency = 56000000,  .driver_data = PLLVAL(142, 2, 3),  },
+    { .frequency = 68000000,  .driver_data = PLLVAL(82, 2, 2),   },
+    { .frequency = 79000000,  .driver_data = PLLVAL(71, 1, 2),   },
+    { .frequency = 85000000,  .driver_data = PLLVAL(105, 2, 2),  },
+    { .frequency = 90000000,  .driver_data = PLLVAL(112, 2, 2),  },
+    { .frequency = 101000000, .driver_data = PLLVAL(127, 2, 2),  },
+    { .frequency = 113000000, .driver_data = PLLVAL(105, 1, 2),  },
+    { .frequency = 118000000, .driver_data = PLLVAL(150, 2, 2),  },
+    { .frequency = 124000000, .driver_data = PLLVAL(116, 1, 2),  },
+    { .frequency = 135000000, .driver_data = PLLVAL(82, 2, 1),   },
+    { .frequency = 147000000, .driver_data = PLLVAL(90, 2, 1),   },
+    { .frequency = 152000000, .driver_data = PLLVAL(68, 1, 1),   },
+    { .frequency = 158000000, .driver_data = PLLVAL(71, 1, 1),   },
+    { .frequency = 170000000, .driver_data = PLLVAL(77, 1, 1),   },
+    { .frequency = 180000000, .driver_data = PLLVAL(82, 1, 1),   },
+    { .frequency = 186000000, .driver_data = PLLVAL(85, 1, 1),   },
+    { .frequency = 192000000, .driver_data = PLLVAL(88, 1, 1),   },
+    { .frequency = 203000000, .driver_data = PLLVAL(161, 3, 1),  },
 
     /* 2410A extras */
 
-    { .frequency = 210000000, .index = PLLVAL(132, 2, 1),  },
-    { .frequency = 226000000, .index = PLLVAL(105, 1, 1),  },
-    { .frequency = 266000000, .index = PLLVAL(125, 1, 1),  },
-    { .frequency = 268000000, .index = PLLVAL(126, 1, 1),  },
-    { .frequency = 270000000, .index = PLLVAL(127, 1, 1),  },
+    { .frequency = 210000000, .driver_data = PLLVAL(132, 2, 1),  },
+    { .frequency = 226000000, .driver_data = PLLVAL(105, 1, 1),  },
+    { .frequency = 266000000, .driver_data = PLLVAL(125, 1, 1),  },
+    { .frequency = 268000000, .driver_data = PLLVAL(126, 1, 1),  },
+    { .frequency = 270000000, .driver_data = PLLVAL(127, 1, 1),  },
 };
 
 static int s3c2410_plls_add(struct device *dev, struct subsys_interface *sif)
diff --git a/arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c b/arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c
index 673781758319..a19460e6e7b0 100644
--- a/arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c
+++ b/arch/arm/mach-s3c24xx/pll-s3c2440-12000000.c
@@ -21,33 +21,33 @@
 #include <plat/cpu-freq-core.h>
 
 static struct cpufreq_frequency_table s3c2440_plls_12[] __initdata = {
-	{ .frequency = 75000000,	.index = PLLVAL(0x75, 3, 3),  }, 	/* FVco 600.000000 */
-	{ .frequency = 80000000,	.index = PLLVAL(0x98, 4, 3),  }, 	/* FVco 640.000000 */
-	{ .frequency = 90000000,	.index = PLLVAL(0x70, 2, 3),  }, 	/* FVco 720.000000 */
-	{ .frequency = 100000000,	.index = PLLVAL(0x5c, 1, 3),  }, 	/* FVco 800.000000 */
-	{ .frequency = 110000000,	.index = PLLVAL(0x66, 1, 3),  }, 	/* FVco 880.000000 */
-	{ .frequency = 120000000,	.index = PLLVAL(0x70, 1, 3),  }, 	/* FVco 960.000000 */
-	{ .frequency = 150000000,	.index = PLLVAL(0x75, 3, 2),  }, 	/* FVco 600.000000 */
-	{ .frequency = 160000000,	.index = PLLVAL(0x98, 4, 2),  }, 	/* FVco 640.000000 */
-	{ .frequency = 170000000,	.index = PLLVAL(0x4d, 1, 2),  }, 	/* FVco 680.000000 */
-	{ .frequency = 180000000,	.index = PLLVAL(0x70, 2, 2),  }, 	/* FVco 720.000000 */
-	{ .frequency = 190000000,	.index = PLLVAL(0x57, 1, 2),  }, 	/* FVco 760.000000 */
-	{ .frequency = 200000000,	.index = PLLVAL(0x5c, 1, 2),  }, 	/* FVco 800.000000 */
-	{ .frequency = 210000000,	.index = PLLVAL(0x84, 2, 2),  }, 	/* FVco 840.000000 */
-	{ .frequency = 220000000,	.index = PLLVAL(0x66, 1, 2),  }, 	/* FVco 880.000000 */
-	{ .frequency = 230000000,	.index = PLLVAL(0x6b, 1, 2),  }, 	/* FVco 920.000000 */
-	{ .frequency = 240000000,	.index = PLLVAL(0x70, 1, 2),  }, 	/* FVco 960.000000 */
-	{ .frequency = 300000000,	.index = PLLVAL(0x75, 3, 1),  }, 	/* FVco 600.000000 */
-	{ .frequency = 310000000,	.index = PLLVAL(0x93, 4, 1),  }, 	/* FVco 620.000000 */
-	{ .frequency = 320000000,	.index = PLLVAL(0x98, 4, 1),  }, 	/* FVco 640.000000 */
-	{ .frequency = 330000000,	.index = PLLVAL(0x66, 2, 1),  }, 	/* FVco 660.000000 */
-	{ .frequency = 340000000,	.index = PLLVAL(0x4d, 1, 1),  }, 	/* FVco 680.000000 */
-	{ .frequency = 350000000,	.index = PLLVAL(0xa7, 4, 1),  }, 	/* FVco 700.000000 */
-	{ .frequency = 360000000,	.index = PLLVAL(0x70, 2, 1),  }, 	/* FVco 720.000000 */
-	{ .frequency = 370000000,	.index = PLLVAL(0xb1, 4, 1),  }, 	/* FVco 740.000000 */
-	{ .frequency = 380000000,	.index = PLLVAL(0x57, 1, 1),  }, 	/* FVco 760.000000 */
-	{ .frequency = 390000000,	.index = PLLVAL(0x7a, 2, 1),  }, 	/* FVco 780.000000 */
-	{ .frequency = 400000000,	.index = PLLVAL(0x5c, 1, 1),  }, 	/* FVco 800.000000 */
+	{ .frequency = 75000000,	.driver_data = PLLVAL(0x75, 3, 3),  }, 	/* FVco 600.000000 */
+	{ .frequency = 80000000,	.driver_data = PLLVAL(0x98, 4, 3),  }, 	/* FVco 640.000000 */
+	{ .frequency = 90000000,	.driver_data = PLLVAL(0x70, 2, 3),  }, 	/* FVco 720.000000 */
+	{ .frequency = 100000000,	.driver_data = PLLVAL(0x5c, 1, 3),  }, 	/* FVco 800.000000 */
+	{ .frequency = 110000000,	.driver_data = PLLVAL(0x66, 1, 3),  }, 	/* FVco 880.000000 */
+	{ .frequency = 120000000,	.driver_data = PLLVAL(0x70, 1, 3),  }, 	/* FVco 960.000000 */
+	{ .frequency = 150000000,	.driver_data = PLLVAL(0x75, 3, 2),  }, 	/* FVco 600.000000 */
+	{ .frequency = 160000000,	.driver_data = PLLVAL(0x98, 4, 2),  }, 	/* FVco 640.000000 */
+	{ .frequency = 170000000,	.driver_data = PLLVAL(0x4d, 1, 2),  }, 	/* FVco 680.000000 */
+	{ .frequency = 180000000,	.driver_data = PLLVAL(0x70, 2, 2),  }, 	/* FVco 720.000000 */
+	{ .frequency = 190000000,	.driver_data = PLLVAL(0x57, 1, 2),  }, 	/* FVco 760.000000 */
+	{ .frequency = 200000000,	.driver_data = PLLVAL(0x5c, 1, 2),  }, 	/* FVco 800.000000 */
+	{ .frequency = 210000000,	.driver_data = PLLVAL(0x84, 2, 2),  }, 	/* FVco 840.000000 */
+	{ .frequency = 220000000,	.driver_data = PLLVAL(0x66, 1, 2),  }, 	/* FVco 880.000000 */
+	{ .frequency = 230000000,	.driver_data = PLLVAL(0x6b, 1, 2),  }, 	/* FVco 920.000000 */
+	{ .frequency = 240000000,	.driver_data = PLLVAL(0x70, 1, 2),  }, 	/* FVco 960.000000 */
+	{ .frequency = 300000000,	.driver_data = PLLVAL(0x75, 3, 1),  }, 	/* FVco 600.000000 */
+	{ .frequency = 310000000,	.driver_data = PLLVAL(0x93, 4, 1),  }, 	/* FVco 620.000000 */
+	{ .frequency = 320000000,	.driver_data = PLLVAL(0x98, 4, 1),  }, 	/* FVco 640.000000 */
+	{ .frequency = 330000000,	.driver_data = PLLVAL(0x66, 2, 1),  }, 	/* FVco 660.000000 */
+	{ .frequency = 340000000,	.driver_data = PLLVAL(0x4d, 1, 1),  }, 	/* FVco 680.000000 */
+	{ .frequency = 350000000,	.driver_data = PLLVAL(0xa7, 4, 1),  }, 	/* FVco 700.000000 */
+	{ .frequency = 360000000,	.driver_data = PLLVAL(0x70, 2, 1),  }, 	/* FVco 720.000000 */
+	{ .frequency = 370000000,	.driver_data = PLLVAL(0xb1, 4, 1),  }, 	/* FVco 740.000000 */
+	{ .frequency = 380000000,	.driver_data = PLLVAL(0x57, 1, 1),  }, 	/* FVco 760.000000 */
+	{ .frequency = 390000000,	.driver_data = PLLVAL(0x7a, 2, 1),  }, 	/* FVco 780.000000 */
+	{ .frequency = 400000000,	.driver_data = PLLVAL(0x5c, 1, 1),  }, 	/* FVco 800.000000 */
 };
 
 static int s3c2440_plls12_add(struct device *dev, struct subsys_interface *sif)
diff --git a/arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c b/arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c
index debfa106289b..1191b2905625 100644
--- a/arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c
+++ b/arch/arm/mach-s3c24xx/pll-s3c2440-16934400.c
@@ -21,61 +21,61 @@
 #include <plat/cpu-freq-core.h>
 
 static struct cpufreq_frequency_table s3c2440_plls_169344[] __initdata = {
-	{ .frequency = 78019200,	.index = PLLVAL(121, 5, 3), 	}, 	/* FVco 624.153600 */
-	{ .frequency = 84067200,	.index = PLLVAL(131, 5, 3), 	}, 	/* FVco 672.537600 */
-	{ .frequency = 90115200,	.index = PLLVAL(141, 5, 3), 	}, 	/* FVco 720.921600 */
-	{ .frequency = 96163200,	.index = PLLVAL(151, 5, 3), 	}, 	/* FVco 769.305600 */
-	{ .frequency = 102135600,	.index = PLLVAL(185, 6, 3), 	}, 	/* FVco 817.084800 */
-	{ .frequency = 108259200,	.index = PLLVAL(171, 5, 3), 	}, 	/* FVco 866.073600 */
-	{ .frequency = 114307200,	.index = PLLVAL(127, 3, 3), 	}, 	/* FVco 914.457600 */
-	{ .frequency = 120234240,	.index = PLLVAL(134, 3, 3), 	}, 	/* FVco 961.873920 */
-	{ .frequency = 126161280,	.index = PLLVAL(141, 3, 3), 	}, 	/* FVco 1009.290240 */
-	{ .frequency = 132088320,	.index = PLLVAL(148, 3, 3), 	}, 	/* FVco 1056.706560 */
-	{ .frequency = 138015360,	.index = PLLVAL(155, 3, 3), 	}, 	/* FVco 1104.122880 */
-	{ .frequency = 144789120,	.index = PLLVAL(163, 3, 3), 	}, 	/* FVco 1158.312960 */
-	{ .frequency = 150100363,	.index = PLLVAL(187, 9, 2), 	}, 	/* FVco 600.401454 */
-	{ .frequency = 156038400,	.index = PLLVAL(121, 5, 2), 	}, 	/* FVco 624.153600 */
-	{ .frequency = 162086400,	.index = PLLVAL(126, 5, 2), 	}, 	/* FVco 648.345600 */
-	{ .frequency = 168134400,	.index = PLLVAL(131, 5, 2), 	}, 	/* FVco 672.537600 */
-	{ .frequency = 174048000,	.index = PLLVAL(177, 7, 2), 	}, 	/* FVco 696.192000 */
-	{ .frequency = 180230400,	.index = PLLVAL(141, 5, 2), 	}, 	/* FVco 720.921600 */
-	{ .frequency = 186278400,	.index = PLLVAL(124, 4, 2), 	}, 	/* FVco 745.113600 */
-	{ .frequency = 192326400,	.index = PLLVAL(151, 5, 2), 	}, 	/* FVco 769.305600 */
-	{ .frequency = 198132480,	.index = PLLVAL(109, 3, 2), 	}, 	/* FVco 792.529920 */
-	{ .frequency = 204271200,	.index = PLLVAL(185, 6, 2), 	}, 	/* FVco 817.084800 */
-	{ .frequency = 210268800,	.index = PLLVAL(141, 4, 2), 	}, 	/* FVco 841.075200 */
-	{ .frequency = 216518400,	.index = PLLVAL(171, 5, 2), 	}, 	/* FVco 866.073600 */
-	{ .frequency = 222264000,	.index = PLLVAL(97, 2, 2), 	}, 	/* FVco 889.056000 */
-	{ .frequency = 228614400,	.index = PLLVAL(127, 3, 2), 	}, 	/* FVco 914.457600 */
-	{ .frequency = 234259200,	.index = PLLVAL(158, 4, 2), 	}, 	/* FVco 937.036800 */
-	{ .frequency = 240468480,	.index = PLLVAL(134, 3, 2), 	}, 	/* FVco 961.873920 */
-	{ .frequency = 246960000,	.index = PLLVAL(167, 4, 2), 	}, 	/* FVco 987.840000 */
-	{ .frequency = 252322560,	.index = PLLVAL(141, 3, 2), 	}, 	/* FVco 1009.290240 */
-	{ .frequency = 258249600,	.index = PLLVAL(114, 2, 2), 	}, 	/* FVco 1032.998400 */
-	{ .frequency = 264176640,	.index = PLLVAL(148, 3, 2), 	}, 	/* FVco 1056.706560 */
-	{ .frequency = 270950400,	.index = PLLVAL(120, 2, 2), 	}, 	/* FVco 1083.801600 */
-	{ .frequency = 276030720,	.index = PLLVAL(155, 3, 2), 	}, 	/* FVco 1104.122880 */
-	{ .frequency = 282240000,	.index = PLLVAL(92, 1, 2), 	}, 	/* FVco 1128.960000 */
-	{ .frequency = 289578240,	.index = PLLVAL(163, 3, 2), 	}, 	/* FVco 1158.312960 */
-	{ .frequency = 294235200,	.index = PLLVAL(131, 2, 2), 	}, 	/* FVco 1176.940800 */
-	{ .frequency = 300200727,	.index = PLLVAL(187, 9, 1), 	}, 	/* FVco 600.401454 */
-	{ .frequency = 306358690,	.index = PLLVAL(191, 9, 1), 	}, 	/* FVco 612.717380 */
-	{ .frequency = 312076800,	.index = PLLVAL(121, 5, 1), 	}, 	/* FVco 624.153600 */
-	{ .frequency = 318366720,	.index = PLLVAL(86, 3, 1), 	}, 	/* FVco 636.733440 */
-	{ .frequency = 324172800,	.index = PLLVAL(126, 5, 1), 	}, 	/* FVco 648.345600 */
-	{ .frequency = 330220800,	.index = PLLVAL(109, 4, 1), 	}, 	/* FVco 660.441600 */
-	{ .frequency = 336268800,	.index = PLLVAL(131, 5, 1), 	}, 	/* FVco 672.537600 */
-	{ .frequency = 342074880,	.index = PLLVAL(93, 3, 1), 	}, 	/* FVco 684.149760 */
-	{ .frequency = 348096000,	.index = PLLVAL(177, 7, 1), 	}, 	/* FVco 696.192000 */
-	{ .frequency = 355622400,	.index = PLLVAL(118, 4, 1), 	}, 	/* FVco 711.244800 */
-	{ .frequency = 360460800,	.index = PLLVAL(141, 5, 1), 	}, 	/* FVco 720.921600 */
-	{ .frequency = 366206400,	.index = PLLVAL(165, 6, 1), 	}, 	/* FVco 732.412800 */
-	{ .frequency = 372556800,	.index = PLLVAL(124, 4, 1), 	}, 	/* FVco 745.113600 */
-	{ .frequency = 378201600,	.index = PLLVAL(126, 4, 1), 	}, 	/* FVco 756.403200 */
-	{ .frequency = 384652800,	.index = PLLVAL(151, 5, 1), 	}, 	/* FVco 769.305600 */
-	{ .frequency = 391608000,	.index = PLLVAL(177, 6, 1), 	}, 	/* FVco 783.216000 */
-	{ .frequency = 396264960,	.index = PLLVAL(109, 3, 1), 	}, 	/* FVco 792.529920 */
-	{ .frequency = 402192000,	.index = PLLVAL(87, 2, 1), 	}, 	/* FVco 804.384000 */
+	{ .frequency = 78019200,	.driver_data = PLLVAL(121, 5, 3), 	}, 	/* FVco 624.153600 */
+	{ .frequency = 84067200,	.driver_data = PLLVAL(131, 5, 3), 	}, 	/* FVco 672.537600 */
+	{ .frequency = 90115200,	.driver_data = PLLVAL(141, 5, 3), 	}, 	/* FVco 720.921600 */
+	{ .frequency = 96163200,	.driver_data = PLLVAL(151, 5, 3), 	}, 	/* FVco 769.305600 */
+	{ .frequency = 102135600,	.driver_data = PLLVAL(185, 6, 3), 	}, 	/* FVco 817.084800 */
+	{ .frequency = 108259200,	.driver_data = PLLVAL(171, 5, 3), 	}, 	/* FVco 866.073600 */
+	{ .frequency = 114307200,	.driver_data = PLLVAL(127, 3, 3), 	}, 	/* FVco 914.457600 */
+	{ .frequency = 120234240,	.driver_data = PLLVAL(134, 3, 3), 	}, 	/* FVco 961.873920 */
+	{ .frequency = 126161280,	.driver_data = PLLVAL(141, 3, 3), 	}, 	/* FVco 1009.290240 */
+	{ .frequency = 132088320,	.driver_data = PLLVAL(148, 3, 3), 	}, 	/* FVco 1056.706560 */
+	{ .frequency = 138015360,	.driver_data = PLLVAL(155, 3, 3), 	}, 	/* FVco 1104.122880 */
+	{ .frequency = 144789120,	.driver_data = PLLVAL(163, 3, 3), 	}, 	/* FVco 1158.312960 */
+	{ .frequency = 150100363,	.driver_data = PLLVAL(187, 9, 2), 	}, 	/* FVco 600.401454 */
+	{ .frequency = 156038400,	.driver_data = PLLVAL(121, 5, 2), 	}, 	/* FVco 624.153600 */
+	{ .frequency = 162086400,	.driver_data = PLLVAL(126, 5, 2), 	}, 	/* FVco 648.345600 */
+	{ .frequency = 168134400,	.driver_data = PLLVAL(131, 5, 2), 	}, 	/* FVco 672.537600 */
+	{ .frequency = 174048000,	.driver_data = PLLVAL(177, 7, 2), 	}, 	/* FVco 696.192000 */
+	{ .frequency = 180230400,	.driver_data = PLLVAL(141, 5, 2), 	}, 	/* FVco 720.921600 */
+	{ .frequency = 186278400,	.driver_data = PLLVAL(124, 4, 2), 	}, 	/* FVco 745.113600 */
+	{ .frequency = 192326400,	.driver_data = PLLVAL(151, 5, 2), 	}, 	/* FVco 769.305600 */
+	{ .frequency = 198132480,	.driver_data = PLLVAL(109, 3, 2), 	}, 	/* FVco 792.529920 */
+	{ .frequency = 204271200,	.driver_data = PLLVAL(185, 6, 2), 	}, 	/* FVco 817.084800 */
+	{ .frequency = 210268800,	.driver_data = PLLVAL(141, 4, 2), 	}, 	/* FVco 841.075200 */
+	{ .frequency = 216518400,	.driver_data = PLLVAL(171, 5, 2), 	}, 	/* FVco 866.073600 */
+	{ .frequency = 222264000,	.driver_data = PLLVAL(97, 2, 2), 	}, 	/* FVco 889.056000 */
+	{ .frequency = 228614400,	.driver_data = PLLVAL(127, 3, 2), 	}, 	/* FVco 914.457600 */
+	{ .frequency = 234259200,	.driver_data = PLLVAL(158, 4, 2), 	}, 	/* FVco 937.036800 */
+	{ .frequency = 240468480,	.driver_data = PLLVAL(134, 3, 2), 	}, 	/* FVco 961.873920 */
+	{ .frequency = 246960000,	.driver_data = PLLVAL(167, 4, 2), 	}, 	/* FVco 987.840000 */
+	{ .frequency = 252322560,	.driver_data = PLLVAL(141, 3, 2), 	}, 	/* FVco 1009.290240 */
+	{ .frequency = 258249600,	.driver_data = PLLVAL(114, 2, 2), 	}, 	/* FVco 1032.998400 */
+	{ .frequency = 264176640,	.driver_data = PLLVAL(148, 3, 2), 	}, 	/* FVco 1056.706560 */
+	{ .frequency = 270950400,	.driver_data = PLLVAL(120, 2, 2), 	}, 	/* FVco 1083.801600 */
+	{ .frequency = 276030720,	.driver_data = PLLVAL(155, 3, 2), 	}, 	/* FVco 1104.122880 */
+	{ .frequency = 282240000,	.driver_data = PLLVAL(92, 1, 2), 	}, 	/* FVco 1128.960000 */
+	{ .frequency = 289578240,	.driver_data = PLLVAL(163, 3, 2), 	}, 	/* FVco 1158.312960 */
+	{ .frequency = 294235200,	.driver_data = PLLVAL(131, 2, 2), 	}, 	/* FVco 1176.940800 */
+	{ .frequency = 300200727,	.driver_data = PLLVAL(187, 9, 1), 	}, 	/* FVco 600.401454 */
+	{ .frequency = 306358690,	.driver_data = PLLVAL(191, 9, 1), 	}, 	/* FVco 612.717380 */
+	{ .frequency = 312076800,	.driver_data = PLLVAL(121, 5, 1), 	}, 	/* FVco 624.153600 */
+	{ .frequency = 318366720,	.driver_data = PLLVAL(86, 3, 1), 	}, 	/* FVco 636.733440 */
+	{ .frequency = 324172800,	.driver_data = PLLVAL(126, 5, 1), 	}, 	/* FVco 648.345600 */
+	{ .frequency = 330220800,	.driver_data = PLLVAL(109, 4, 1), 	}, 	/* FVco 660.441600 */
+	{ .frequency = 336268800,	.driver_data = PLLVAL(131, 5, 1), 	}, 	/* FVco 672.537600 */
+	{ .frequency = 342074880,	.driver_data = PLLVAL(93, 3, 1), 	}, 	/* FVco 684.149760 */
+	{ .frequency = 348096000,	.driver_data = PLLVAL(177, 7, 1), 	}, 	/* FVco 696.192000 */
+	{ .frequency = 355622400,	.driver_data = PLLVAL(118, 4, 1), 	}, 	/* FVco 711.244800 */
+	{ .frequency = 360460800,	.driver_data = PLLVAL(141, 5, 1), 	}, 	/* FVco 720.921600 */
+	{ .frequency = 366206400,	.driver_data = PLLVAL(165, 6, 1), 	}, 	/* FVco 732.412800 */
+	{ .frequency = 372556800,	.driver_data = PLLVAL(124, 4, 1), 	}, 	/* FVco 745.113600 */
+	{ .frequency = 378201600,	.driver_data = PLLVAL(126, 4, 1), 	}, 	/* FVco 756.403200 */
+	{ .frequency = 384652800,	.driver_data = PLLVAL(151, 5, 1), 	}, 	/* FVco 769.305600 */
+	{ .frequency = 391608000,	.driver_data = PLLVAL(177, 6, 1), 	}, 	/* FVco 783.216000 */
+	{ .frequency = 396264960,	.driver_data = PLLVAL(109, 3, 1), 	}, 	/* FVco 792.529920 */
+	{ .frequency = 402192000,	.driver_data = PLLVAL(87, 2, 1), 	}, 	/* FVco 804.384000 */
 };
 
 static int s3c2440_plls169344_add(struct device *dev,
diff --git a/arch/arm/mach-shmobile/clock-sh7372.c b/arch/arm/mach-shmobile/clock-sh7372.c
index 7e105932c09d..5390c6bbbc02 100644
--- a/arch/arm/mach-shmobile/clock-sh7372.c
+++ b/arch/arm/mach-shmobile/clock-sh7372.c
@@ -142,15 +142,15 @@ static void pllc2_table_rebuild(struct clk *clk)
 	/* Initialise PLLC2 frequency table */
 	for (i = 0; i < ARRAY_SIZE(pllc2_freq_table) - 2; i++) {
 		pllc2_freq_table[i].frequency = clk->parent->rate * (i + 20) * 2;
-		pllc2_freq_table[i].index = i;
+		pllc2_freq_table[i].driver_data = i;
 	}
 
 	/* This is a special entry - switching PLL off makes it a repeater */
 	pllc2_freq_table[i].frequency = clk->parent->rate;
-	pllc2_freq_table[i].index = i;
+	pllc2_freq_table[i].driver_data = i;
 
 	pllc2_freq_table[++i].frequency = CPUFREQ_TABLE_END;
-	pllc2_freq_table[i].index = i;
+	pllc2_freq_table[i].driver_data = i;
 }
 
 static unsigned long pllc2_recalc(struct clk *clk)
diff --git a/arch/arm/plat-samsung/include/plat/cpu-freq-core.h b/arch/arm/plat-samsung/include/plat/cpu-freq-core.h
index 95509d8eb140..a8a760ddfae1 100644
--- a/arch/arm/plat-samsung/include/plat/cpu-freq-core.h
+++ b/arch/arm/plat-samsung/include/plat/cpu-freq-core.h
@@ -285,7 +285,7 @@ static inline int s3c_cpufreq_addfreq(struct cpufreq_frequency_table *table,
 		s3c_freq_dbg("%s: { %d = %u kHz }\n",
 			     __func__, index, freq);
 
-		table[index].index = index;
+		table[index].driver_data = index;
 		table[index].frequency = freq;
 	}
 
diff --git a/arch/mips/loongson/lemote-2f/clock.c b/arch/mips/loongson/lemote-2f/clock.c
index bc739d4bab2e..4dc2f5fa3f67 100644
--- a/arch/mips/loongson/lemote-2f/clock.c
+++ b/arch/mips/loongson/lemote-2f/clock.c
@@ -121,7 +121,8 @@ int clk_set_rate(struct clk *clk, unsigned long rate)
 	clk->rate = rate;
 
 	regval = LOONGSON_CHIPCFG0;
-	regval = (regval & ~0x7) | (loongson2_clockmod_table[i].index - 1);
+	regval = (regval & ~0x7) |
+		(loongson2_clockmod_table[i].driver_data - 1);
 	LOONGSON_CHIPCFG0 = regval;
 
 	return ret;
diff --git a/arch/powerpc/platforms/pasemi/cpufreq.c b/arch/powerpc/platforms/pasemi/cpufreq.c
index be1e7958909e..b704da404067 100644
--- a/arch/powerpc/platforms/pasemi/cpufreq.c
+++ b/arch/powerpc/platforms/pasemi/cpufreq.c
@@ -204,7 +204,8 @@ static int pas_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	/* initialize frequency table */
 	for (i=0; pas_freqs[i].frequency!=CPUFREQ_TABLE_END; i++) {
-		pas_freqs[i].frequency = get_astate_freq(pas_freqs[i].index) * 100000;
+		pas_freqs[i].frequency =
+			get_astate_freq(pas_freqs[i].driver_data) * 100000;
 		pr_debug("%d: %d\n", i, pas_freqs[i].frequency);
 	}
 
@@ -280,7 +281,7 @@ static int pas_cpufreq_target(struct cpufreq_policy *policy,
 	pr_debug("setting frequency for cpu %d to %d kHz, 1/%d of max frequency\n",
 		 policy->cpu,
 		 pas_freqs[pas_astate_new].frequency,
-		 pas_freqs[pas_astate_new].index);
+		 pas_freqs[pas_astate_new].driver_data);
 
 	current_astate = pas_astate_new;
 
diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index f0077cb8e249..c8ec186303db 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -648,14 +648,14 @@ int opp_init_cpufreq_table(struct device *dev,
 
 	list_for_each_entry(opp, &dev_opp->opp_list, node) {
 		if (opp->available) {
-			freq_table[i].index = i;
+			freq_table[i].driver_data = i;
 			freq_table[i].frequency = opp->rate / 1000;
 			i++;
 		}
 	}
 	mutex_unlock(&dev_opp_list_lock);
 
-	freq_table[i].index = i;
+	freq_table[i].driver_data = i;
 	freq_table[i].frequency = CPUFREQ_TABLE_END;
 
 	*table = &freq_table[0];
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 8c02622b35a4..0d25677fb37d 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -232,7 +232,7 @@ static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
 	perf = data->acpi_data;
 
 	for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
-		if (msr == perf->states[data->freq_table[i].index].status)
+		if (msr == perf->states[data->freq_table[i].driver_data].status)
 			return data->freq_table[i].frequency;
 	}
 	return data->freq_table[0].frequency;
@@ -442,7 +442,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		goto out;
 	}
 
-	next_perf_state = data->freq_table[next_state].index;
+	next_perf_state = data->freq_table[next_state].driver_data;
 	if (perf->state == next_perf_state) {
 		if (unlikely(data->resume)) {
 			pr_debug("Called after resume, resetting to P%d\n",
@@ -811,7 +811,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 		    data->freq_table[valid_states-1].frequency / 1000)
 			continue;
 
-		data->freq_table[valid_states].index = i;
+		data->freq_table[valid_states].driver_data = i;
 		data->freq_table[valid_states].frequency =
 		    perf->states[i].core_frequency * 1000;
 		valid_states++;
diff --git a/drivers/cpufreq/blackfin-cpufreq.c b/drivers/cpufreq/blackfin-cpufreq.c
index 995511e80bef..9cdbbd278a80 100644
--- a/drivers/cpufreq/blackfin-cpufreq.c
+++ b/drivers/cpufreq/blackfin-cpufreq.c
@@ -20,23 +20,23 @@
 
 
 /* this is the table of CCLK frequencies, in Hz */
-/* .index is the entry in the auxiliary dpm_state_table[] */
+/* .driver_data is the entry in the auxiliary dpm_state_table[] */
 static struct cpufreq_frequency_table bfin_freq_table[] = {
 	{
 		.frequency = CPUFREQ_TABLE_END,
-		.index = 0,
+		.driver_data = 0,
 	},
 	{
 		.frequency = CPUFREQ_TABLE_END,
-		.index = 1,
+		.driver_data = 1,
 	},
 	{
 		.frequency = CPUFREQ_TABLE_END,
-		.index = 2,
+		.driver_data = 2,
 	},
 	{
 		.frequency = CPUFREQ_TABLE_END,
-		.index = 0,
+		.driver_data = 0,
 	},
 };
 
diff --git a/drivers/cpufreq/e_powersaver.c b/drivers/cpufreq/e_powersaver.c
index 37380fb92621..324aff20aeef 100644
--- a/drivers/cpufreq/e_powersaver.c
+++ b/drivers/cpufreq/e_powersaver.c
@@ -188,7 +188,7 @@ static int eps_target(struct cpufreq_policy *policy,
 	}
 
 	/* Make frequency transition */
-	dest_state = centaur->freq_table[newstate].index & 0xffff;
+	dest_state = centaur->freq_table[newstate].driver_data & 0xffff;
 	ret = eps_set_state(centaur, policy, dest_state);
 	if (ret)
 		printk(KERN_ERR "eps: Timeout!\n");
@@ -380,9 +380,9 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
 	f_table = &centaur->freq_table[0];
 	if (brand != EPS_BRAND_C7M) {
 		f_table[0].frequency = fsb * min_multiplier;
-		f_table[0].index = (min_multiplier << 8) | min_voltage;
+		f_table[0].driver_data = (min_multiplier << 8) | min_voltage;
 		f_table[1].frequency = fsb * max_multiplier;
-		f_table[1].index = (max_multiplier << 8) | max_voltage;
+		f_table[1].driver_data = (max_multiplier << 8) | max_voltage;
 		f_table[2].frequency = CPUFREQ_TABLE_END;
 	} else {
 		k = 0;
@@ -391,7 +391,7 @@ static int eps_cpu_init(struct cpufreq_policy *policy)
 		for (i = min_multiplier; i <= max_multiplier; i++) {
 			voltage = (k * step) / 256 + min_voltage;
 			f_table[k].frequency = fsb * i;
-			f_table[k].index = (i << 8) | voltage;
+			f_table[k].driver_data = (i << 8) | voltage;
 			k++;
 		}
 		f_table[k].frequency = CPUFREQ_TABLE_END;
diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c
index d7a79662e24c..f0d87412cc91 100644
--- a/drivers/cpufreq/freq_table.c
+++ b/drivers/cpufreq/freq_table.c
@@ -34,8 +34,8 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy,
 
 			continue;
 		}
-		pr_debug("table entry %u: %u kHz, %u index\n",
-					i, freq, table[i].index);
+		pr_debug("table entry %u: %u kHz, %u driver_data\n",
+					i, freq, table[i].driver_data);
 		if (freq < min_freq)
 			min_freq = freq;
 		if (freq > max_freq)
@@ -97,11 +97,11 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
 				   unsigned int *index)
 {
 	struct cpufreq_frequency_table optimal = {
-		.index = ~0,
+		.driver_data = ~0,
 		.frequency = 0,
 	};
 	struct cpufreq_frequency_table suboptimal = {
-		.index = ~0,
+		.driver_data = ~0,
 		.frequency = 0,
 	};
 	unsigned int i;
@@ -129,12 +129,12 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
 			if (freq <= target_freq) {
 				if (freq >= optimal.frequency) {
 					optimal.frequency = freq;
-					optimal.index = i;
+					optimal.driver_data = i;
 				}
 			} else {
 				if (freq <= suboptimal.frequency) {
 					suboptimal.frequency = freq;
-					suboptimal.index = i;
+					suboptimal.driver_data = i;
 				}
 			}
 			break;
@@ -142,26 +142,26 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy,
 			if (freq >= target_freq) {
 				if (freq <= optimal.frequency) {
 					optimal.frequency = freq;
-					optimal.index = i;
+					optimal.driver_data = i;
 				}
 			} else {
 				if (freq >= suboptimal.frequency) {
 					suboptimal.frequency = freq;
-					suboptimal.index = i;
+					suboptimal.driver_data = i;
 				}
 			}
 			break;
 		}
 	}
-	if (optimal.index > i) {
-		if (suboptimal.index > i)
+	if (optimal.driver_data > i) {
+		if (suboptimal.driver_data > i)
 			return -EINVAL;
-		*index = suboptimal.index;
+		*index = suboptimal.driver_data;
 	} else
-		*index = optimal.index;
+		*index = optimal.driver_data;
 
 	pr_debug("target is %u (%u kHz, %u)\n", *index, table[*index].frequency,
-		table[*index].index);
+		table[*index].driver_data);
 
 	return 0;
 }
diff --git a/drivers/cpufreq/ia64-acpi-cpufreq.c b/drivers/cpufreq/ia64-acpi-cpufreq.c
index c0075dbaa633..573c14ea802d 100644
--- a/drivers/cpufreq/ia64-acpi-cpufreq.c
+++ b/drivers/cpufreq/ia64-acpi-cpufreq.c
@@ -326,7 +326,7 @@ acpi_cpufreq_cpu_init (
 	/* table init */
 	for (i = 0; i <= data->acpi_data.state_count; i++)
 	{
-		data->freq_table[i].index = i;
+		data->freq_table[i].driver_data = i;
 		if (i < data->acpi_data.state_count) {
 			data->freq_table[i].frequency =
 			      data->acpi_data.states[i].core_frequency * 1000;
diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c
index b2644af985ec..c233ea617366 100644
--- a/drivers/cpufreq/kirkwood-cpufreq.c
+++ b/drivers/cpufreq/kirkwood-cpufreq.c
@@ -59,7 +59,7 @@ static void kirkwood_cpufreq_set_cpu_state(struct cpufreq_policy *policy,
 		unsigned int index)
 {
 	struct cpufreq_freqs freqs;
-	unsigned int state = kirkwood_freq_table[index].index;
+	unsigned int state = kirkwood_freq_table[index].driver_data;
 	unsigned long reg;
 
 	freqs.old = kirkwood_cpufreq_get_cpu_frequency(0);
diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c
index b448638e34de..b6a0a7a406b0 100644
--- a/drivers/cpufreq/longhaul.c
+++ b/drivers/cpufreq/longhaul.c
@@ -254,7 +254,7 @@ static void longhaul_setstate(struct cpufreq_policy *policy,
 	u32 bm_timeout = 1000;
 	unsigned int dir = 0;
 
-	mults_index = longhaul_table[table_index].index;
+	mults_index = longhaul_table[table_index].driver_data;
 	/* Safety precautions */
 	mult = mults[mults_index & 0x1f];
 	if (mult == -1)
@@ -487,7 +487,7 @@ static int __cpuinit longhaul_get_ranges(void)
 		if (ratio > maxmult || ratio < minmult)
 			continue;
 		longhaul_table[k].frequency = calc_speed(ratio);
-		longhaul_table[k].index	= j;
+		longhaul_table[k].driver_data	= j;
 		k++;
 	}
 	if (k <= 1) {
@@ -508,8 +508,8 @@ static int __cpuinit longhaul_get_ranges(void)
 		if (min_i != j) {
 			swap(longhaul_table[j].frequency,
 			     longhaul_table[min_i].frequency);
-			swap(longhaul_table[j].index,
-			     longhaul_table[min_i].index);
+			swap(longhaul_table[j].driver_data,
+			     longhaul_table[min_i].driver_data);
 		}
 	}
 
@@ -517,7 +517,7 @@ static int __cpuinit longhaul_get_ranges(void)
 
 	/* Find index we are running on */
 	for (j = 0; j < k; j++) {
-		if (mults[longhaul_table[j].index & 0x1f] == mult) {
+		if (mults[longhaul_table[j].driver_data & 0x1f] == mult) {
 			longhaul_index = j;
 			break;
 		}
@@ -613,7 +613,7 @@ static void __cpuinit longhaul_setup_voltagescaling(void)
 			pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
 		else
 			pos = minvid.pos;
-		longhaul_table[j].index |= mV_vrm_table[pos] << 8;
+		longhaul_table[j].driver_data |= mV_vrm_table[pos] << 8;
 		vid = vrm_mV_table[mV_vrm_table[pos]];
 		printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
 				speed, j, vid.mV);
@@ -656,12 +656,12 @@ static int longhaul_target(struct cpufreq_policy *policy,
 		 * this in hardware, C3 is old and we need to do this
 		 * in software. */
 		i = longhaul_index;
-		current_vid = (longhaul_table[longhaul_index].index >> 8);
+		current_vid = (longhaul_table[longhaul_index].driver_data >> 8);
 		current_vid &= 0x1f;
 		if (table_index > longhaul_index)
 			dir = 1;
 		while (i != table_index) {
-			vid = (longhaul_table[i].index >> 8) & 0x1f;
+			vid = (longhaul_table[i].driver_data >> 8) & 0x1f;
 			if (vid != current_vid) {
 				longhaul_setstate(policy, i);
 				current_vid = vid;
diff --git a/drivers/cpufreq/loongson2_cpufreq.c b/drivers/cpufreq/loongson2_cpufreq.c
index d53912768946..bb838b985077 100644
--- a/drivers/cpufreq/loongson2_cpufreq.c
+++ b/drivers/cpufreq/loongson2_cpufreq.c
@@ -72,7 +72,7 @@ static int loongson2_cpufreq_target(struct cpufreq_policy *policy,
 
 	freq =
 	    ((cpu_clock_freq / 1000) *
-	     loongson2_clockmod_table[newstate].index) / 8;
+	     loongson2_clockmod_table[newstate].driver_data) / 8;
 	if (freq < policy->min || freq > policy->max)
 		return -EINVAL;
 
diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c
index 421ef37d0bb3..9ee78170ff86 100644
--- a/drivers/cpufreq/p4-clockmod.c
+++ b/drivers/cpufreq/p4-clockmod.c
@@ -118,7 +118,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
 		return -EINVAL;
 
 	freqs.old = cpufreq_p4_get(policy->cpu);
-	freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
+	freqs.new = stock_freq * p4clockmod_table[newstate].driver_data / 8;
 
 	if (freqs.new == freqs.old)
 		return 0;
@@ -131,7 +131,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
 	 * Developer's Manual, Volume 3
 	 */
 	for_each_cpu(i, policy->cpus)
-		cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
+		cpufreq_p4_setdc(i, p4clockmod_table[newstate].driver_data);
 
 	/* notifiers */
 	cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE);
diff --git a/drivers/cpufreq/powernow-k6.c b/drivers/cpufreq/powernow-k6.c
index ea0222a45b7b..ea8e10382ec5 100644
--- a/drivers/cpufreq/powernow-k6.c
+++ b/drivers/cpufreq/powernow-k6.c
@@ -58,7 +58,7 @@ static int powernow_k6_get_cpu_multiplier(void)
 	msrval = POWERNOW_IOPORT + 0x0;
 	wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
 
-	return clock_ratio[(invalue >> 5)&7].index;
+	return clock_ratio[(invalue >> 5)&7].driver_data;
 }
 
 
@@ -75,13 +75,13 @@ static void powernow_k6_set_state(struct cpufreq_policy *policy,
 	unsigned long msrval;
 	struct cpufreq_freqs freqs;
 
-	if (clock_ratio[best_i].index > max_multiplier) {
+	if (clock_ratio[best_i].driver_data > max_multiplier) {
 		printk(KERN_ERR PFX "invalid target frequency\n");
 		return;
 	}
 
 	freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
-	freqs.new = busfreq * clock_ratio[best_i].index;
+	freqs.new = busfreq * clock_ratio[best_i].driver_data;
 
 	cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE);
 
@@ -156,7 +156,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
 
 	/* table init */
 	for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
-		f = clock_ratio[i].index;
+		f = clock_ratio[i].driver_data;
 		if (f > max_multiplier)
 			clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
 		else
diff --git a/drivers/cpufreq/powernow-k7.c b/drivers/cpufreq/powernow-k7.c
index 53888dacbe58..b9f80b713fda 100644
--- a/drivers/cpufreq/powernow-k7.c
+++ b/drivers/cpufreq/powernow-k7.c
@@ -186,7 +186,7 @@ static int get_ranges(unsigned char *pst)
 		fid = *pst++;
 
 		powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
-		powernow_table[j].index = fid; /* lower 8 bits */
+		powernow_table[j].driver_data = fid; /* lower 8 bits */
 
 		speed = powernow_table[j].frequency;
 
@@ -203,7 +203,7 @@ static int get_ranges(unsigned char *pst)
 			maximum_speed = speed;
 
 		vid = *pst++;
-		powernow_table[j].index |= (vid << 8); /* upper 8 bits */
+		powernow_table[j].driver_data |= (vid << 8); /* upper 8 bits */
 
 		pr_debug("   FID: 0x%x (%d.%dx [%dMHz])  "
 			 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
@@ -212,7 +212,7 @@ static int get_ranges(unsigned char *pst)
 			 mobile_vid_table[vid]%1000);
 	}
 	powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
-	powernow_table[number_scales].index = 0;
+	powernow_table[number_scales].driver_data = 0;
 
 	return 0;
 }
@@ -260,8 +260,8 @@ static void change_speed(struct cpufreq_policy *policy, unsigned int index)
 	 * vid are the upper 8 bits.
 	 */
 
-	fid = powernow_table[index].index & 0xFF;
-	vid = (powernow_table[index].index & 0xFF00) >> 8;
+	fid = powernow_table[index].driver_data & 0xFF;
+	vid = (powernow_table[index].driver_data & 0xFF00) >> 8;
 
 	rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
 	cfid = fidvidstatus.bits.CFID;
@@ -373,8 +373,8 @@ static int powernow_acpi_init(void)
 		fid = pc.bits.fid;
 
 		powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
-		powernow_table[i].index = fid; /* lower 8 bits */
-		powernow_table[i].index |= (vid << 8); /* upper 8 bits */
+		powernow_table[i].driver_data = fid; /* lower 8 bits */
+		powernow_table[i].driver_data |= (vid << 8); /* upper 8 bits */
 
 		speed = powernow_table[i].frequency;
 		speed_mhz = speed / 1000;
@@ -417,7 +417,7 @@ static int powernow_acpi_init(void)
 	}
 
 	powernow_table[i].frequency = CPUFREQ_TABLE_END;
-	powernow_table[i].index = 0;
+	powernow_table[i].driver_data = 0;
 
 	/* notify BIOS that we exist */
 	acpi_processor_notify_smm(THIS_MODULE);
diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c
index b828efe4b2f8..51343a128703 100644
--- a/drivers/cpufreq/powernow-k8.c
+++ b/drivers/cpufreq/powernow-k8.c
@@ -584,9 +584,9 @@ static void print_basics(struct powernow_k8_data *data)
 				CPUFREQ_ENTRY_INVALID) {
 				printk(KERN_INFO PFX
 					"fid 0x%x (%d MHz), vid 0x%x\n",
-					data->powernow_table[j].index & 0xff,
+					data->powernow_table[j].driver_data & 0xff,
 					data->powernow_table[j].frequency/1000,
-					data->powernow_table[j].index >> 8);
+					data->powernow_table[j].driver_data >> 8);
 		}
 	}
 	if (data->batps)
@@ -632,13 +632,13 @@ static int fill_powernow_table(struct powernow_k8_data *data,
 
 	for (j = 0; j < data->numps; j++) {
 		int freq;
-		powernow_table[j].index = pst[j].fid; /* lower 8 bits */
-		powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
+		powernow_table[j].driver_data = pst[j].fid; /* lower 8 bits */
+		powernow_table[j].driver_data |= (pst[j].vid << 8); /* upper 8 bits */
 		freq = find_khz_freq_from_fid(pst[j].fid);
 		powernow_table[j].frequency = freq;
 	}
 	powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
-	powernow_table[data->numps].index = 0;
+	powernow_table[data->numps].driver_data = 0;
 
 	if (query_current_values_with_pending_wait(data)) {
 		kfree(powernow_table);
@@ -810,7 +810,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 
 	powernow_table[data->acpi_data.state_count].frequency =
 		CPUFREQ_TABLE_END;
-	powernow_table[data->acpi_data.state_count].index = 0;
+	powernow_table[data->acpi_data.state_count].driver_data = 0;
 	data->powernow_table = powernow_table;
 
 	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
@@ -865,7 +865,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
 		pr_debug("   %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
 
 		index = fid | (vid<<8);
-		powernow_table[i].index = index;
+		powernow_table[i].driver_data = index;
 
 		freq = find_khz_freq_from_fid(fid);
 		powernow_table[i].frequency = freq;
@@ -941,8 +941,8 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data,
 	 * the cpufreq frequency table in find_psb_table, vid
 	 * are the upper 8 bits.
 	 */
-	fid = data->powernow_table[index].index & 0xFF;
-	vid = (data->powernow_table[index].index & 0xFF00) >> 8;
+	fid = data->powernow_table[index].driver_data & 0xFF;
+	vid = (data->powernow_table[index].driver_data & 0xFF00) >> 8;
 
 	pr_debug("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
 
diff --git a/drivers/cpufreq/ppc_cbe_cpufreq.c b/drivers/cpufreq/ppc_cbe_cpufreq.c
index e577a1dbbfcd..5936f8d6f2cc 100644
--- a/drivers/cpufreq/ppc_cbe_cpufreq.c
+++ b/drivers/cpufreq/ppc_cbe_cpufreq.c
@@ -106,7 +106,7 @@ static int cbe_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	/* initialize frequency table */
 	for (i=0; cbe_freqs[i].frequency!=CPUFREQ_TABLE_END; i++) {
-		cbe_freqs[i].frequency = max_freq / cbe_freqs[i].index;
+		cbe_freqs[i].frequency = max_freq / cbe_freqs[i].driver_data;
 		pr_debug("%d: %d\n", i, cbe_freqs[i].frequency);
 	}
 
@@ -165,7 +165,7 @@ static int cbe_cpufreq_target(struct cpufreq_policy *policy,
 		 "1/%d of max frequency\n",
 		 policy->cpu,
 		 cbe_freqs[cbe_pmode_new].frequency,
-		 cbe_freqs[cbe_pmode_new].index);
+		 cbe_freqs[cbe_pmode_new].driver_data);
 
 	rc = set_pmode(policy->cpu, cbe_pmode_new);
 
diff --git a/drivers/cpufreq/pxa2xx-cpufreq.c b/drivers/cpufreq/pxa2xx-cpufreq.c
index 9e5bc8e388a0..fb3981ac829f 100644
--- a/drivers/cpufreq/pxa2xx-cpufreq.c
+++ b/drivers/cpufreq/pxa2xx-cpufreq.c
@@ -420,7 +420,7 @@ static int pxa_cpufreq_init(struct cpufreq_policy *policy)
 	/* Generate pxa25x the run cpufreq_frequency_table struct */
 	for (i = 0; i < NUM_PXA25x_RUN_FREQS; i++) {
 		pxa255_run_freq_table[i].frequency = pxa255_run_freqs[i].khz;
-		pxa255_run_freq_table[i].index = i;
+		pxa255_run_freq_table[i].driver_data = i;
 	}
 	pxa255_run_freq_table[i].frequency = CPUFREQ_TABLE_END;
 
@@ -428,7 +428,7 @@ static int pxa_cpufreq_init(struct cpufreq_policy *policy)
 	for (i = 0; i < NUM_PXA25x_TURBO_FREQS; i++) {
 		pxa255_turbo_freq_table[i].frequency =
 			pxa255_turbo_freqs[i].khz;
-		pxa255_turbo_freq_table[i].index = i;
+		pxa255_turbo_freq_table[i].driver_data = i;
 	}
 	pxa255_turbo_freq_table[i].frequency = CPUFREQ_TABLE_END;
 
@@ -440,9 +440,9 @@ static int pxa_cpufreq_init(struct cpufreq_policy *policy)
 		if (freq > pxa27x_maxfreq)
 			break;
 		pxa27x_freq_table[i].frequency = freq;
-		pxa27x_freq_table[i].index = i;
+		pxa27x_freq_table[i].driver_data = i;
 	}
-	pxa27x_freq_table[i].index = i;
+	pxa27x_freq_table[i].driver_data = i;
 	pxa27x_freq_table[i].frequency = CPUFREQ_TABLE_END;
 
 	/*
diff --git a/drivers/cpufreq/pxa3xx-cpufreq.c b/drivers/cpufreq/pxa3xx-cpufreq.c
index 15d60f857ad5..9c92ef032a9e 100644
--- a/drivers/cpufreq/pxa3xx-cpufreq.c
+++ b/drivers/cpufreq/pxa3xx-cpufreq.c
@@ -98,10 +98,10 @@ static int setup_freqs_table(struct cpufreq_policy *policy,
 		return -ENOMEM;
 
 	for (i = 0; i < num; i++) {
-		table[i].index = i;
+		table[i].driver_data = i;
 		table[i].frequency = freqs[i].cpufreq_mhz * 1000;
 	}
-	table[num].index = i;
+	table[num].driver_data = i;
 	table[num].frequency = CPUFREQ_TABLE_END;
 
 	pxa3xx_freqs = freqs;
diff --git a/drivers/cpufreq/s3c2416-cpufreq.c b/drivers/cpufreq/s3c2416-cpufreq.c
index 4f1881eee3f1..69f2e55828dc 100644
--- a/drivers/cpufreq/s3c2416-cpufreq.c
+++ b/drivers/cpufreq/s3c2416-cpufreq.c
@@ -244,7 +244,7 @@ static int s3c2416_cpufreq_set_target(struct cpufreq_policy *policy,
 	if (ret != 0)
 		goto out;
 
-	idx = s3c_freq->freq_table[i].index;
+	idx = s3c_freq->freq_table[i].driver_data;
 
 	if (idx == SOURCE_HCLK)
 		to_dvs = 1;
diff --git a/drivers/cpufreq/s3c64xx-cpufreq.c b/drivers/cpufreq/s3c64xx-cpufreq.c
index 27cacb524796..306d395de990 100644
--- a/drivers/cpufreq/s3c64xx-cpufreq.c
+++ b/drivers/cpufreq/s3c64xx-cpufreq.c
@@ -87,7 +87,7 @@ static int s3c64xx_cpufreq_set_target(struct cpufreq_policy *policy,
 	freqs.old = clk_get_rate(armclk) / 1000;
 	freqs.new = s3c64xx_freq_table[i].frequency;
 	freqs.flags = 0;
-	dvfs = &s3c64xx_dvfs_table[s3c64xx_freq_table[i].index];
+	dvfs = &s3c64xx_dvfs_table[s3c64xx_freq_table[i].driver_data];
 
 	if (freqs.old == freqs.new)
 		return 0;
diff --git a/drivers/cpufreq/sc520_freq.c b/drivers/cpufreq/sc520_freq.c
index f740b134d27b..77a210975fc4 100644
--- a/drivers/cpufreq/sc520_freq.c
+++ b/drivers/cpufreq/sc520_freq.c
@@ -71,7 +71,7 @@ static void sc520_freq_set_cpu_state(struct cpufreq_policy *policy,
 	local_irq_disable();
 
 	clockspeed_reg = *cpuctl & ~0x03;
-	*cpuctl = clockspeed_reg | sc520_freq_table[state].index;
+	*cpuctl = clockspeed_reg | sc520_freq_table[state].driver_data;
 
 	local_irq_enable();
 
diff --git a/drivers/cpufreq/sparc-us2e-cpufreq.c b/drivers/cpufreq/sparc-us2e-cpufreq.c
index 306ae462bba6..93061a408773 100644
--- a/drivers/cpufreq/sparc-us2e-cpufreq.c
+++ b/drivers/cpufreq/sparc-us2e-cpufreq.c
@@ -308,17 +308,17 @@ static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy)
 	struct cpufreq_frequency_table *table =
 		&us2e_freq_table[cpu].table[0];
 
-	table[0].index = 0;
+	table[0].driver_data = 0;
 	table[0].frequency = clock_tick / 1;
-	table[1].index = 1;
+	table[1].driver_data = 1;
 	table[1].frequency = clock_tick / 2;
-	table[2].index = 2;
+	table[2].driver_data = 2;
 	table[2].frequency = clock_tick / 4;
-	table[2].index = 3;
+	table[2].driver_data = 3;
 	table[2].frequency = clock_tick / 6;
-	table[2].index = 4;
+	table[2].driver_data = 4;
 	table[2].frequency = clock_tick / 8;
-	table[2].index = 5;
+	table[2].driver_data = 5;
 	table[3].frequency = CPUFREQ_TABLE_END;
 
 	policy->cpuinfo.transition_latency = 0;
diff --git a/drivers/cpufreq/sparc-us3-cpufreq.c b/drivers/cpufreq/sparc-us3-cpufreq.c
index c71ee142347a..880ee293d61e 100644
--- a/drivers/cpufreq/sparc-us3-cpufreq.c
+++ b/drivers/cpufreq/sparc-us3-cpufreq.c
@@ -169,13 +169,13 @@ static int __init us3_freq_cpu_init(struct cpufreq_policy *policy)
 	struct cpufreq_frequency_table *table =
 		&us3_freq_table[cpu].table[0];
 
-	table[0].index = 0;
+	table[0].driver_data = 0;
 	table[0].frequency = clock_tick / 1;
-	table[1].index = 1;
+	table[1].driver_data = 1;
 	table[1].frequency = clock_tick / 2;
-	table[2].index = 2;
+	table[2].driver_data = 2;
 	table[2].frequency = clock_tick / 32;
-	table[3].index = 0;
+	table[3].driver_data = 0;
 	table[3].frequency = CPUFREQ_TABLE_END;
 
 	policy->cpuinfo.transition_latency = 0;
diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c
index 156829f4576d..c3efa7f2a908 100644
--- a/drivers/cpufreq/spear-cpufreq.c
+++ b/drivers/cpufreq/spear-cpufreq.c
@@ -250,11 +250,11 @@ static int spear_cpufreq_driver_init(void)
 	}
 
 	for (i = 0; i < cnt; i++) {
-		freq_tbl[i].index = i;
+		freq_tbl[i].driver_data = i;
 		freq_tbl[i].frequency = be32_to_cpup(val++);
 	}
 
-	freq_tbl[i].index = i;
+	freq_tbl[i].driver_data = i;
 	freq_tbl[i].frequency = CPUFREQ_TABLE_END;
 
 	spear_cpufreq.freq_tbl = freq_tbl;
diff --git a/drivers/cpufreq/speedstep-centrino.c b/drivers/cpufreq/speedstep-centrino.c
index 618e6f417b1c..0915e712fbdc 100644
--- a/drivers/cpufreq/speedstep-centrino.c
+++ b/drivers/cpufreq/speedstep-centrino.c
@@ -79,11 +79,11 @@ static struct cpufreq_driver centrino_driver;
 
 /* Computes the correct form for IA32_PERF_CTL MSR for a particular
    frequency/voltage operating point; frequency in MHz, volts in mV.
-   This is stored as "index" in the structure. */
+   This is stored as "driver_data" in the structure. */
 #define OP(mhz, mv)							\
 	{								\
 		.frequency = (mhz) * 1000,				\
-		.index = (((mhz)/100) << 8) | ((mv - 700) / 16)		\
+		.driver_data = (((mhz)/100) << 8) | ((mv - 700) / 16)		\
 	}
 
 /*
@@ -307,7 +307,7 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
 		per_cpu(centrino_model, cpu)->op_points[i].frequency
 							!= CPUFREQ_TABLE_END;
 	     i++) {
-		if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
+		if (msr == per_cpu(centrino_model, cpu)->op_points[i].driver_data)
 			return per_cpu(centrino_model, cpu)->
 							op_points[i].frequency;
 	}
@@ -501,7 +501,7 @@ static int centrino_target (struct cpufreq_policy *policy,
 			break;
 		}
 
-		msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
+		msr = per_cpu(centrino_model, cpu)->op_points[newstate].driver_data;
 
 		if (first_cpu) {
 			rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index 66f80973596b..ed79d7b78e7d 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -1724,9 +1724,9 @@ static long round_clock_rate(u8 clock, unsigned long rate)
 
 /* CPU FREQ table, may be changed due to if MAX_OPP is supported. */
 static struct cpufreq_frequency_table db8500_cpufreq_table[] = {
-	{ .frequency = 200000, .index = ARM_EXTCLK,},
-	{ .frequency = 400000, .index = ARM_50_OPP,},
-	{ .frequency = 800000, .index = ARM_100_OPP,},
+	{ .frequency = 200000, .driver_data = ARM_EXTCLK,},
+	{ .frequency = 400000, .driver_data = ARM_50_OPP,},
+	{ .frequency = 800000, .driver_data = ARM_100_OPP,},
 	{ .frequency = CPUFREQ_TABLE_END,}, /* To be used for MAX_OPP. */
 	{ .frequency = CPUFREQ_TABLE_END,},
 };
@@ -1901,7 +1901,7 @@ static int set_armss_rate(unsigned long rate)
 		return -EINVAL;
 
 	/* Set the new arm opp. */
-	return db8500_prcmu_set_arm_opp(db8500_cpufreq_table[i].index);
+	return db8500_prcmu_set_arm_opp(db8500_cpufreq_table[i].driver_data);
 }
 
 static int set_plldsi_rate(unsigned long rate)
@@ -3105,7 +3105,7 @@ static void db8500_prcmu_update_cpufreq(void)
 {
 	if (prcmu_has_arm_maxopp()) {
 		db8500_cpufreq_table[3].frequency = 1000000;
-		db8500_cpufreq_table[3].index = ARM_MAX_OPP;
+		db8500_cpufreq_table[3].driver_data = ARM_MAX_OPP;
 	}
 }
 
diff --git a/drivers/sh/clk/core.c b/drivers/sh/clk/core.c
index 7715de2629c1..74727851820d 100644
--- a/drivers/sh/clk/core.c
+++ b/drivers/sh/clk/core.c
@@ -63,12 +63,12 @@ void clk_rate_table_build(struct clk *clk,
 		else
 			freq = clk->parent->rate * mult / div;
 
-		freq_table[i].index = i;
+		freq_table[i].driver_data = i;
 		freq_table[i].frequency = freq;
 	}
 
 	/* Termination entry */
-	freq_table[i].index = i;
+	freq_table[i].driver_data = i;
 	freq_table[i].frequency = CPUFREQ_TABLE_END;
 }
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 1b5b5efa3e3a..d93905633dc7 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -410,7 +410,7 @@ extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_TABLE_END     ~1
 
 struct cpufreq_frequency_table {
-	unsigned int	index;     /* any */
+	unsigned int	driver_data; /* driver specific data, not used by core */
 	unsigned int	frequency; /* kHz - doesn't need to be in ascending
 				    * order */
 };
-- 
cgit v1.3-7-g2ca7


From 082339bc63cccf8ea49b1f3cf4ee39ce00742849 Mon Sep 17 00:00:00 2001
From: Michal Simek <michal.simek@xilinx.com>
Date: Tue, 4 Jun 2013 16:02:36 +0200
Subject: spi: spi-xilinx: Add run run-time endian detection

Do not load endian value from platform data
and rather autodetect it.

Signed-off-by: Michal Simek <michal.simek@xilinx.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 drivers/mfd/timberdale.c       |  1 -
 drivers/spi/spi-xilinx.c       | 29 +++++++++++++++++++++--------
 include/linux/spi/xilinx_spi.h |  1 -
 3 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/timberdale.c b/drivers/mfd/timberdale.c
index 59e0ee247e86..0c1fcbc23d04 100644
--- a/drivers/mfd/timberdale.c
+++ b/drivers/mfd/timberdale.c
@@ -145,7 +145,6 @@ static struct spi_board_info timberdale_spi_8bit_board_info[] = {
 
 static struct xspi_platform_data timberdale_xspi_platform_data = {
 	.num_chipselect = 3,
-	.little_endian = true,
 	/* bits per word and devices will be filled in runtime depending
 	 * on the HW config
 	 */
diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c
index e1d769607425..bb6ae4ee7dea 100644
--- a/drivers/spi/spi-xilinx.c
+++ b/drivers/spi/spi-xilinx.c
@@ -30,6 +30,7 @@
  */
 #define XSPI_CR_OFFSET		0x60	/* Control Register */
 
+#define XSPI_CR_LOOP		0x01
 #define XSPI_CR_ENABLE		0x02
 #define XSPI_CR_MASTER_MODE	0x04
 #define XSPI_CR_CPOL		0x08
@@ -359,11 +360,12 @@ static const struct of_device_id xilinx_spi_of_match[] = {
 MODULE_DEVICE_TABLE(of, xilinx_spi_of_match);
 
 struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
-	u32 irq, s16 bus_num, int num_cs, int little_endian, int bits_per_word)
+	u32 irq, s16 bus_num, int num_cs, int bits_per_word)
 {
 	struct spi_master *master;
 	struct xilinx_spi *xspi;
 	int ret;
+	u32 tmp;
 
 	master = spi_alloc_master(dev, sizeof(struct xilinx_spi));
 	if (!master)
@@ -396,13 +398,25 @@ struct spi_master *xilinx_spi_init(struct device *dev, struct resource *mem,
 
 	xspi->mem = *mem;
 	xspi->irq = irq;
-	if (little_endian) {
-		xspi->read_fn = xspi_read32;
-		xspi->write_fn = xspi_write32;
-	} else {
+
+	/*
+	 * Detect endianess on the IP via loop bit in CR. Detection
+	 * must be done before reset is sent because incorrect reset
+	 * value generates error interrupt.
+	 * Setup little endian helper functions first and try to use them
+	 * and check if bit was correctly setup or not.
+	 */
+	xspi->read_fn = xspi_read32;
+	xspi->write_fn = xspi_write32;
+
+	xspi->write_fn(XSPI_CR_LOOP, xspi->regs + XSPI_CR_OFFSET);
+	tmp = xspi->read_fn(xspi->regs + XSPI_CR_OFFSET);
+	tmp &= XSPI_CR_LOOP;
+	if (tmp != XSPI_CR_LOOP) {
 		xspi->read_fn = xspi_read32_be;
 		xspi->write_fn = xspi_write32_be;
 	}
+
 	xspi->bits_per_word = bits_per_word;
 	if (xspi->bits_per_word == 8) {
 		xspi->tx_fn = xspi_tx8;
@@ -466,14 +480,13 @@ static int xilinx_spi_probe(struct platform_device *dev)
 {
 	struct xspi_platform_data *pdata;
 	struct resource *r;
-	int irq, num_cs = 0, little_endian = 0, bits_per_word = 8;
+	int irq, num_cs = 0, bits_per_word = 8;
 	struct spi_master *master;
 	u8 i;
 
 	pdata = dev->dev.platform_data;
 	if (pdata) {
 		num_cs = pdata->num_chipselect;
-		little_endian = pdata->little_endian;
 		bits_per_word = pdata->bits_per_word;
 	}
 
@@ -505,7 +518,7 @@ static int xilinx_spi_probe(struct platform_device *dev)
 		return -ENXIO;
 
 	master = xilinx_spi_init(&dev->dev, r, irq, dev->id, num_cs,
-				 little_endian, bits_per_word);
+				 bits_per_word);
 	if (!master)
 		return -ENODEV;
 
diff --git a/include/linux/spi/xilinx_spi.h b/include/linux/spi/xilinx_spi.h
index 6f17278810b0..333ecdfee0d9 100644
--- a/include/linux/spi/xilinx_spi.h
+++ b/include/linux/spi/xilinx_spi.h
@@ -11,7 +11,6 @@
  */
 struct xspi_platform_data {
 	u16 num_chipselect;
-	bool little_endian;
 	u8 bits_per_word;
 	struct spi_board_info *devices;
 	u8 num_devices;
-- 
cgit v1.3-7-g2ca7


From 6ae32c539c0412ca789fb6041be45eeabf78431c Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Tue, 4 Jun 2013 19:18:14 +0200
Subject: PCI: Add pcibios_release_device()

Platforms may want to provide architecture-specific functionality when
a PCI device is released.  Add a pcibios_release_device() call that
architectures can override to do so.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pci.c   | 10 ++++++++++
 drivers/pci/probe.c |  1 +
 include/linux/pci.h |  1 +
 3 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e5f4e55d407d..709791b70ca0 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1334,6 +1334,16 @@ int __weak pcibios_add_device (struct pci_dev *dev)
 	return 0;
 }
 
+/**
+ * pcibios_release_device - provide arch specific hooks when releasing device dev
+ * @dev: the PCI device being released
+ *
+ * Permits the platform to provide architecture specific functionality when
+ * devices are released. This is the default implementation. Architecture
+ * implementations can override this.
+ */
+void __weak pcibios_release_device(struct pci_dev *dev) {}
+
 /**
  * pcibios_disable_device - disable arch specific PCI resources for device dev
  * @dev: the PCI device to disable
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 70f10fa3c1b2..58cc0a8a0979 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1132,6 +1132,7 @@ static void pci_release_dev(struct device *dev)
 	pci_dev = to_pci_dev(dev);
 	pci_release_capabilities(pci_dev);
 	pci_release_of_node(pci_dev);
+	pcibios_release_device(pci_dev);
 	kfree(pci_dev);
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3a24e4ff3248..8f170e9073a5 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1643,6 +1643,7 @@ void pcibios_set_master(struct pci_dev *dev);
 int pcibios_set_pcie_reset_state(struct pci_dev *dev,
 				 enum pcie_reset_state state);
 int pcibios_add_device(struct pci_dev *dev);
+void pcibios_release_device(struct pci_dev *dev);
 
 #ifdef CONFIG_PCI_MMCONFIG
 void __init pci_mmcfg_early_init(void);
-- 
cgit v1.3-7-g2ca7


From 9e50a9122f048c67a4e83916434e2e212a6f0fe2 Mon Sep 17 00:00:00 2001
From: Betty Dall <betty.dall@hp.com>
Date: Thu, 6 Jun 2013 12:10:49 -0600
Subject: PCI/AER: Move AER severity defines to aer.h

The function aer_recover_queue() is a public interface and the
severity argument uses #defines that are in the private header
pci/pcie/aer/aerdrv.h.

This patch moves the #defines from pci/pcie/aer/aerdrv.h to
include/linux/aer.h.

[bhelgaas: split "remove 'extern' from declarations" to another patch]
Signed-off-by: Betty Dall <betty.dall@hp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 drivers/pci/pcie/aer/aerdrv.h | 4 ----
 include/linux/aer.h           | 4 ++++
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/pcie/aer/aerdrv.h b/drivers/pci/pcie/aer/aerdrv.h
index d12c77cd6991..90ea3e88041f 100644
--- a/drivers/pci/pcie/aer/aerdrv.h
+++ b/drivers/pci/pcie/aer/aerdrv.h
@@ -13,10 +13,6 @@
 #include <linux/aer.h>
 #include <linux/interrupt.h>
 
-#define AER_NONFATAL			0
-#define AER_FATAL			1
-#define AER_CORRECTABLE			2
-
 #define SYSTEM_ERROR_INTR_ON_MESG_MASK	(PCI_EXP_RTCTL_SECEE|	\
 					PCI_EXP_RTCTL_SENFEE|	\
 					PCI_EXP_RTCTL_SEFEE)
diff --git a/include/linux/aer.h b/include/linux/aer.h
index ec10e1b24c1c..d2cf2e4f9c2d 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -7,6 +7,10 @@
 #ifndef _AER_H_
 #define _AER_H_
 
+#define AER_NONFATAL			0
+#define AER_FATAL			1
+#define AER_CORRECTABLE			2
+
 struct aer_header_log_regs {
 	unsigned int dw0;
 	unsigned int dw1;
-- 
cgit v1.3-7-g2ca7


From fde41b9fa2d0d6abd5b1b5674f1da3bb40ebc98d Mon Sep 17 00:00:00 2001
From: Betty Dall <betty.dall@hp.com>
Date: Thu, 6 Jun 2013 14:35:35 -0600
Subject: PCI/AER: Remove "extern" from function declarations

We had an inconsistent mix of using and omitting the "extern" keyword
on function declarations in header files.  This removes them all.

[bhelgaas: split out from "move AER severity defines" patch]
Signed-off-by: Betty Dall <betty.dall@hp.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/aer.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/aer.h b/include/linux/aer.h
index d2cf2e4f9c2d..55bb3dc4b2db 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -35,9 +35,9 @@ struct aer_capability_regs {
 
 #if defined(CONFIG_PCIEAER)
 /* pci-e port driver needs this function to enable aer */
-extern int pci_enable_pcie_error_reporting(struct pci_dev *dev);
-extern int pci_disable_pcie_error_reporting(struct pci_dev *dev);
-extern int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
+int pci_enable_pcie_error_reporting(struct pci_dev *dev);
+int pci_disable_pcie_error_reporting(struct pci_dev *dev);
+int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev);
 #else
 static inline int pci_enable_pcie_error_reporting(struct pci_dev *dev)
 {
@@ -53,10 +53,10 @@ static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev)
 }
 #endif
 
-extern void cper_print_aer(const char *prefix, struct pci_dev *dev,
-			   int cper_severity, struct aer_capability_regs *aer);
-extern int cper_severity_to_aer(int cper_severity);
-extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
-			      int severity);
+void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity,
+		    struct aer_capability_regs *aer);
+int cper_severity_to_aer(int cper_severity);
+void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
+		       int severity);
 #endif //_AER_H_
 
-- 
cgit v1.3-7-g2ca7


From 439d7a358f93a52458527329939be9f97db1242a Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@calxeda.com>
Date: Thu, 30 May 2013 15:17:30 -0500
Subject: ahci: make ahci_transmit_led_message into a function pointer

Create a new ata_port_operations function pointer called
transmit_led_message and give it the default value of
ahci_transmit_led_message. This allows AHCI controllers with
non-standard LED interfaces to use the existing em_ interface.

Signed-off-by: Mark Langsdorf <mark.langsdorf@calxeda.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ata/libahci.c  | 11 ++++++-----
 include/linux/libata.h |  3 +++
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index 3797a7bba15d..64c3f1f41af1 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -173,6 +173,7 @@ struct ata_port_operations ahci_ops = {
 	.em_store		= ahci_led_store,
 	.sw_activity_show	= ahci_activity_show,
 	.sw_activity_store	= ahci_activity_store,
+	.transmit_led_message	= ahci_transmit_led_message,
 #ifdef CONFIG_PM
 	.port_suspend		= ahci_port_suspend,
 	.port_resume		= ahci_port_resume,
@@ -774,7 +775,7 @@ static void ahci_start_port(struct ata_port *ap)
 
 			/* EM Transmit bit maybe busy during init */
 			for (i = 0; i < EM_MAX_RETRY; i++) {
-				rc = ahci_transmit_led_message(ap,
+				rc = ap->ops->transmit_led_message(ap,
 							       emp->led_state,
 							       4);
 				if (rc == -EBUSY)
@@ -915,7 +916,7 @@ static void ahci_sw_activity_blink(unsigned long arg)
 			led_message |= (1 << 16);
 	}
 	spin_unlock_irqrestore(ap->lock, flags);
-	ahci_transmit_led_message(ap, led_message, 4);
+	ap->ops->transmit_led_message(ap, led_message, 4);
 }
 
 static void ahci_init_sw_activity(struct ata_link *link)
@@ -1044,7 +1045,7 @@ static ssize_t ahci_led_store(struct ata_port *ap, const char *buf,
 	if (emp->blink_policy)
 		state &= ~EM_MSG_LED_VALUE_ACTIVITY;
 
-	return ahci_transmit_led_message(ap, state, size);
+	return ap->ops->transmit_led_message(ap, state, size);
 }
 
 static ssize_t ahci_activity_store(struct ata_device *dev, enum sw_activity val)
@@ -1063,7 +1064,7 @@ static ssize_t ahci_activity_store(struct ata_device *dev, enum sw_activity val)
 		/* set the LED to OFF */
 		port_led_state &= EM_MSG_LED_VALUE_OFF;
 		port_led_state |= (ap->port_no | (link->pmp << 8));
-		ahci_transmit_led_message(ap, port_led_state, 4);
+		ap->ops->transmit_led_message(ap, port_led_state, 4);
 	} else {
 		link->flags |= ATA_LFLAG_SW_ACTIVITY;
 		if (val == BLINK_OFF) {
@@ -1071,7 +1072,7 @@ static ssize_t ahci_activity_store(struct ata_device *dev, enum sw_activity val)
 			port_led_state &= EM_MSG_LED_VALUE_OFF;
 			port_led_state |= (ap->port_no | (link->pmp << 8));
 			port_led_state |= EM_MSG_LED_VALUE_ON; /* check this */
-			ahci_transmit_led_message(ap, port_led_state, 4);
+			ap->ops->transmit_led_message(ap, port_led_state, 4);
 		}
 	}
 	emp->blink_policy = val;
diff --git a/include/linux/libata.h b/include/linux/libata.h
index c886dc87aa81..4ea55bb45deb 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -910,6 +910,9 @@ struct ata_port_operations {
 	ssize_t (*sw_activity_show)(struct ata_device *dev, char *buf);
 	ssize_t (*sw_activity_store)(struct ata_device *dev,
 				     enum sw_activity val);
+	ssize_t (*transmit_led_message)(struct ata_port *ap, u32 state,
+					ssize_t size);
+
 	/*
 	 * Obsolete
 	 */
-- 
cgit v1.3-7-g2ca7


From 1237e598a94b5a44a0162a4f4534d18ef8a81a7d Mon Sep 17 00:00:00 2001
From: Philippe Begnic <philippe.begnic@st.com>
Date: Mon, 27 May 2013 14:41:29 +0200
Subject: clk: ux500: Pass clock base adresses in initcall for u8540 and u9540

Align on u8500 version, pass clock base address in clk_init functions
for u8540 and u9540.

Signed-off-by: Linus Walleij <linus.walleij@stericsson.com>
Signed-off-by: Philippe Begnic <philippe.begnic@st.com>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 arch/arm/mach-ux500/cpu.c               | 6 ++++--
 drivers/clk/ux500/u8540_clk.c           | 4 ++--
 drivers/clk/ux500/u9540_clk.c           | 4 ++--
 include/linux/platform_data/clk-ux500.h | 6 ++++--
 4 files changed, 12 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-ux500/cpu.c b/arch/arm/mach-ux500/cpu.c
index b6145ea51641..e6fb0239151b 100644
--- a/arch/arm/mach-ux500/cpu.c
+++ b/arch/arm/mach-ux500/cpu.c
@@ -76,13 +76,15 @@ void __init ux500_init_irq(void)
 	} else if (cpu_is_u9540()) {
 		prcmu_early_init(U8500_PRCMU_BASE, SZ_8K - 1);
 		ux500_pm_init(U8500_PRCMU_BASE, SZ_8K - 1);
-		u8500_clk_init(U8500_CLKRST1_BASE, U8500_CLKRST2_BASE,
+		u9540_clk_init(U8500_CLKRST1_BASE, U8500_CLKRST2_BASE,
 			       U8500_CLKRST3_BASE, U8500_CLKRST5_BASE,
 			       U8500_CLKRST6_BASE);
 	} else if (cpu_is_u8540()) {
 		prcmu_early_init(U8500_PRCMU_BASE, SZ_8K + SZ_4K - 1);
 		ux500_pm_init(U8500_PRCMU_BASE, SZ_8K + SZ_4K - 1);
-		u8540_clk_init();
+		u8540_clk_init(U8500_CLKRST1_BASE, U8500_CLKRST2_BASE,
+			       U8500_CLKRST3_BASE, U8500_CLKRST5_BASE,
+			       U8500_CLKRST6_BASE);
 	}
 }
 
diff --git a/drivers/clk/ux500/u8540_clk.c b/drivers/clk/ux500/u8540_clk.c
index 10adfd2ead21..90f3c88694b9 100644
--- a/drivers/clk/ux500/u8540_clk.c
+++ b/drivers/clk/ux500/u8540_clk.c
@@ -12,10 +12,10 @@
 #include <linux/clk-provider.h>
 #include <linux/mfd/dbx500-prcmu.h>
 #include <linux/platform_data/clk-ux500.h>
-
 #include "clk.h"
 
-void u8540_clk_init(void)
+void u8540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
+		    u32 clkrst5_base, u32 clkrst6_base)
 {
 	/* register clocks here */
 }
diff --git a/drivers/clk/ux500/u9540_clk.c b/drivers/clk/ux500/u9540_clk.c
index dbc0191e16c8..44794782e7e0 100644
--- a/drivers/clk/ux500/u9540_clk.c
+++ b/drivers/clk/ux500/u9540_clk.c
@@ -12,10 +12,10 @@
 #include <linux/clk-provider.h>
 #include <linux/mfd/dbx500-prcmu.h>
 #include <linux/platform_data/clk-ux500.h>
-
 #include "clk.h"
 
-void u9540_clk_init(void)
+void u9540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
+		    u32 clkrst5_base, u32 clkrst6_base)
 {
 	/* register clocks here */
 }
diff --git a/include/linux/platform_data/clk-ux500.h b/include/linux/platform_data/clk-ux500.h
index 320d9c39ea0a..9d98f3aaa16c 100644
--- a/include/linux/platform_data/clk-ux500.h
+++ b/include/linux/platform_data/clk-ux500.h
@@ -12,7 +12,9 @@
 
 void u8500_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
 		    u32 clkrst5_base, u32 clkrst6_base);
-void u9540_clk_init(void);
-void u8540_clk_init(void);
+void u9540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
+		    u32 clkrst5_base, u32 clkrst6_base);
+void u8540_clk_init(u32 clkrst1_base, u32 clkrst2_base, u32 clkrst3_base,
+		    u32 clkrst5_base, u32 clkrst6_base);
 
 #endif /* __CLK_UX500_H */
-- 
cgit v1.3-7-g2ca7


From 852bbba96773777efc2b932f2c5939c6fbf252da Mon Sep 17 00:00:00 2001
From: Philippe Begnic <philippe.begnic@st.com>
Date: Mon, 27 May 2013 14:41:30 +0200
Subject: mfd: db8500: Update register definition for u8540 clock

PRCMU and ab8500 registers updated for u8540

Signed-off-by: Philippe Begnic <philippe.begnic@st.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 include/linux/mfd/abx500/ab8500-sysctrl.h |  4 ++--
 include/linux/mfd/dbx500-prcmu.h          | 11 +++++++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mfd/abx500/ab8500-sysctrl.h b/include/linux/mfd/abx500/ab8500-sysctrl.h
index 990bc93f46e1..adba89d9c660 100644
--- a/include/linux/mfd/abx500/ab8500-sysctrl.h
+++ b/include/linux/mfd/abx500/ab8500-sysctrl.h
@@ -278,8 +278,8 @@ struct ab8500_sysctrl_platform_data {
 
 #define AB9540_SYSCLK12CONFCTRL_PLL26TO38ENA BIT(0)
 #define AB9540_SYSCLK12CONFCTRL_SYSCLK12USBMUXSEL BIT(1)
-#define AB9540_SYSCLK12CONFCTRL_INT384MHZMUXSEL_MASK 0x0C
-#define AB9540_SYSCLK12CONFCTRL_INT384MHZMUXSEL_SHIFT 2
+#define AB9540_SYSCLK12CONFCTRL_INT384MHZMUXSEL0 BIT(2)
+#define AB9540_SYSCLK12CONFCTRL_INT384MHZMUXSEL1 BIT(3)
 #define AB9540_SYSCLK12CONFCTRL_SYSCLK12BUFMUX BIT(4)
 #define AB9540_SYSCLK12CONFCTRL_SYSCLK12PLLMUX BIT(5)
 #define AB9540_SYSCLK12CONFCTRL_SYSCLK2MUXVALID BIT(6)
diff --git a/include/linux/mfd/dbx500-prcmu.h b/include/linux/mfd/dbx500-prcmu.h
index 689e6a0d9c99..d0ba355cc55f 100644
--- a/include/linux/mfd/dbx500-prcmu.h
+++ b/include/linux/mfd/dbx500-prcmu.h
@@ -134,6 +134,10 @@ enum prcmu_clock {
 	PRCMU_SIACLK,
 	PRCMU_SVACLK,
 	PRCMU_ACLK,
+	PRCMU_HVACLK, /* Ux540 only */
+	PRCMU_G1CLK, /* Ux540 only */
+	PRCMU_SDMMCHCLK,
+	PRCMU_CAMCLK,
 	PRCMU_NUM_REG_CLOCKS,
 	PRCMU_SYSCLK = PRCMU_NUM_REG_CLOCKS,
 	PRCMU_CDCLK,
@@ -148,6 +152,13 @@ enum prcmu_clock {
 	PRCMU_DSI0ESCCLK,
 	PRCMU_DSI1ESCCLK,
 	PRCMU_DSI2ESCCLK,
+	/* LCD DSI PLL - Ux540 only */
+	PRCMU_PLLDSI_LCD,
+	PRCMU_DSI0CLK_LCD,
+	PRCMU_DSI1CLK_LCD,
+	PRCMU_DSI0ESCCLK_LCD,
+	PRCMU_DSI1ESCCLK_LCD,
+	PRCMU_DSI2ESCCLK_LCD,
 };
 
 /**
-- 
cgit v1.3-7-g2ca7


From 54e300339cce6d4be665e6bbd736123dd0f15888 Mon Sep 17 00:00:00 2001
From: Philippe Begnic <philippe.begnic@st.com>
Date: Mon, 27 May 2013 14:41:31 +0200
Subject: mfd: db8500: Update BML clock register for db8580

BML clock register address in DB8580 has changed.Defined a new address
under different name for DB8580.

Signed-off-by: Philippe Begnic <philippe.begnic@st.com>
Acked-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 drivers/mfd/db8500-prcmu.c       | 1 +
 drivers/mfd/dbx500-prcmu-regs.h  | 1 +
 include/linux/mfd/dbx500-prcmu.h | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mfd/db8500-prcmu.c b/drivers/mfd/db8500-prcmu.c
index 66f80973596b..a292a1deff9a 100644
--- a/drivers/mfd/db8500-prcmu.c
+++ b/drivers/mfd/db8500-prcmu.c
@@ -480,6 +480,7 @@ struct clk_mgt clk_mgt[PRCMU_NUM_REG_CLOCKS] = {
 	CLK_MGT_ENTRY(PER6CLK, PLL_DIV, true),
 	CLK_MGT_ENTRY(PER7CLK, PLL_DIV, true),
 	CLK_MGT_ENTRY(LCDCLK, PLL_FIX, true),
+	CLK_MGT_ENTRY(BML8580CLK, PLL_DIV, true),
 	CLK_MGT_ENTRY(BMLCLK, PLL_DIV, true),
 	CLK_MGT_ENTRY(HSITXCLK, PLL_DIV, true),
 	CLK_MGT_ENTRY(HSIRXCLK, PLL_DIV, true),
diff --git a/drivers/mfd/dbx500-prcmu-regs.h b/drivers/mfd/dbx500-prcmu-regs.h
index d14836ed2114..ca355dd423a6 100644
--- a/drivers/mfd/dbx500-prcmu-regs.h
+++ b/drivers/mfd/dbx500-prcmu-regs.h
@@ -32,6 +32,7 @@
 #define PRCM_PER7CLK_MGT	(0x040)
 #define PRCM_LCDCLK_MGT		(0x044)
 #define PRCM_BMLCLK_MGT		(0x04C)
+#define PRCM_BML8580CLK_MGT	(0x108)
 #define PRCM_HSITXCLK_MGT	(0x050)
 #define PRCM_HSIRXCLK_MGT	(0x054)
 #define PRCM_HDMICLK_MGT	(0x058)
diff --git a/include/linux/mfd/dbx500-prcmu.h b/include/linux/mfd/dbx500-prcmu.h
index d0ba355cc55f..ca0790fba2f5 100644
--- a/include/linux/mfd/dbx500-prcmu.h
+++ b/include/linux/mfd/dbx500-prcmu.h
@@ -138,6 +138,7 @@ enum prcmu_clock {
 	PRCMU_G1CLK, /* Ux540 only */
 	PRCMU_SDMMCHCLK,
 	PRCMU_CAMCLK,
+	PRCMU_BML8580CLK,
 	PRCMU_NUM_REG_CLOCKS,
 	PRCMU_SYSCLK = PRCMU_NUM_REG_CLOCKS,
 	PRCMU_CDCLK,
-- 
cgit v1.3-7-g2ca7


From 2a668a8bc2cbe7a464ab1212475a3efb23a94b1e Mon Sep 17 00:00:00 2001
From: Paul Walmsley <pwalmsley@nvidia.com>
Date: Fri, 7 Jun 2013 08:06:56 +0000
Subject: regulator: core: add regulator_get_linear_step()

Add regulator_get_linear_step(), which returns the voltage step size
between VSEL values for linear regulators.  This is intended for use
by regulator consumers which build their own voltage-to-VSEL tables.

Signed-off-by: Paul Walmsley <pwalmsley@nvidia.com>
Reviewed-by: Andrew Chew <achew@nvidia.com>
Cc: Matthew Longnecker <mlongnecker@nvidia.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 drivers/regulator/core.c           | 15 +++++++++++++++
 include/linux/regulator/consumer.h |  1 +
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index 6e5017841582..f07d7adefdda 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -2134,6 +2134,21 @@ int regulator_list_voltage(struct regulator *regulator, unsigned selector)
 }
 EXPORT_SYMBOL_GPL(regulator_list_voltage);
 
+/**
+ * regulator_get_linear_step - return the voltage step size between VSEL values
+ * @regulator: regulator source
+ *
+ * Returns the voltage step size between VSEL values for linear
+ * regulators, or return 0 if the regulator isn't a linear regulator.
+ */
+unsigned int regulator_get_linear_step(struct regulator *regulator)
+{
+	struct regulator_dev *rdev = regulator->rdev;
+
+	return rdev->desc->uV_step;
+}
+EXPORT_SYMBOL_GPL(regulator_get_linear_step);
+
 /**
  * regulator_is_supported_voltage - check if a voltage range can be supported
  *
diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h
index 145022a83085..3a76389c6aaa 100644
--- a/include/linux/regulator/consumer.h
+++ b/include/linux/regulator/consumer.h
@@ -165,6 +165,7 @@ int regulator_count_voltages(struct regulator *regulator);
 int regulator_list_voltage(struct regulator *regulator, unsigned selector);
 int regulator_is_supported_voltage(struct regulator *regulator,
 				   int min_uV, int max_uV);
+unsigned int regulator_get_linear_step(struct regulator *regulator);
 int regulator_set_voltage(struct regulator *regulator, int min_uV, int max_uV);
 int regulator_set_voltage_time(struct regulator *regulator,
 			       int old_uV, int new_uV);
-- 
cgit v1.3-7-g2ca7


From 3dfc35ffd938abe67f2559db6b517536a207df24 Mon Sep 17 00:00:00 2001
From: Andrii Tseglytskyi <andrii.tseglytskyi@ti.com>
Date: Mon, 27 May 2013 14:09:22 +0300
Subject: PM / AVS: SmartReflex: use omap_sr * for errgen interfaces

SmartReflex driver interface is natively divided to two parts:

- external SmartReflex interface
- interface between SmartReflex driver and SmartReflex Class

Functions which belong to AVS class interface can use
struct omap_sr* instead of struct voltatedomain*, to provide a
direct connection between SR driver and SR class. This allows
us to optimize and not do additional lookups where none is
required.

sr_disable_errgen() and sr_configure_errgen() are interface
functions between SR driver and SR class. They are typically
used by Class driver to configure error generator module during
SmartReflex enable/disable sequence.
Now they take struct omap_sr* as input parameter.

Signed-off-by: Andrii Tseglytskyi <andrii.tseglytskyi@ti.com>
Acked-by: Nishanth Menon <nm@ti.com>
Signed-off-by: Kevin Hilman <khilman@linaro.org>
---
 arch/arm/mach-omap2/smartreflex-class3.c |  4 ++--
 drivers/power/avs/smartreflex.c          | 26 +++++++++++++-------------
 include/linux/power/smartreflex.h        |  4 ++--
 3 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/smartreflex-class3.c b/arch/arm/mach-omap2/smartreflex-class3.c
index aee3c8940a30..6c26dc15815c 100644
--- a/arch/arm/mach-omap2/smartreflex-class3.c
+++ b/arch/arm/mach-omap2/smartreflex-class3.c
@@ -31,7 +31,7 @@ static int sr_class3_enable(struct omap_sr *sr)
 
 static int sr_class3_disable(struct omap_sr *sr, int is_volt_reset)
 {
-	sr_disable_errgen(sr->voltdm);
+	sr_disable_errgen(sr);
 	omap_vp_disable(sr->voltdm);
 	sr_disable(sr->voltdm);
 	if (is_volt_reset)
@@ -42,7 +42,7 @@ static int sr_class3_disable(struct omap_sr *sr, int is_volt_reset)
 
 static int sr_class3_configure(struct omap_sr *sr)
 {
-	return sr_configure_errgen(sr->voltdm);
+	return sr_configure_errgen(sr);
 }
 
 /* SR class3 structure */
diff --git a/drivers/power/avs/smartreflex.c b/drivers/power/avs/smartreflex.c
index 002005ee48d2..fccb62743d19 100644
--- a/drivers/power/avs/smartreflex.c
+++ b/drivers/power/avs/smartreflex.c
@@ -342,9 +342,9 @@ static struct omap_sr_nvalue_table *sr_retrieve_nvalue_row(
 /* Public Functions */
 
 /**
- * sr_configure_errgen() - Configures the smrtreflex to perform AVS using the
+ * sr_configure_errgen() - Configures the SmartReflex to perform AVS using the
  *			 error generator module.
- * @voltdm:	VDD pointer to which the SR module to be configured belongs to.
+ * @sr:			SR module to be configured.
  *
  * This API is to be called from the smartreflex class driver to
  * configure the error generator module inside the smartreflex module.
@@ -353,17 +353,17 @@ static struct omap_sr_nvalue_table *sr_retrieve_nvalue_row(
  * SR CLASS 2 can choose between ERROR module and MINMAXAVG
  * module. Returns 0 on success and error value in case of failure.
  */
-int sr_configure_errgen(struct voltagedomain *voltdm)
+int sr_configure_errgen(struct omap_sr *sr)
 {
 	u32 sr_config, sr_errconfig, errconfig_offs;
 	u32 vpboundint_en, vpboundint_st;
 	u32 senp_en = 0, senn_en = 0;
 	u8 senp_shift, senn_shift;
-	struct omap_sr *sr = _sr_lookup(voltdm);
 
-	if (IS_ERR(sr)) {
-		pr_warning("%s: omap_sr struct for voltdm not found\n",	__func__);
-		return PTR_ERR(sr);
+	if (!sr) {
+		pr_warn("%s: NULL omap_sr from %pF\n", __func__,
+			(void *)_RET_IP_);
+		return -EINVAL;
 	}
 
 	if (!sr->clk_length)
@@ -415,22 +415,22 @@ int sr_configure_errgen(struct voltagedomain *voltdm)
 
 /**
  * sr_disable_errgen() - Disables SmartReflex AVS module's errgen component
- * @voltdm:	VDD pointer to which the SR module to be configured belongs to.
+ * @sr:			SR module to be configured.
  *
  * This API is to be called from the smartreflex class driver to
  * disable the error generator module inside the smartreflex module.
  *
  * Returns 0 on success and error value in case of failure.
  */
-int sr_disable_errgen(struct voltagedomain *voltdm)
+int sr_disable_errgen(struct omap_sr *sr)
 {
 	u32 errconfig_offs;
 	u32 vpboundint_en, vpboundint_st;
-	struct omap_sr *sr = _sr_lookup(voltdm);
 
-	if (IS_ERR(sr)) {
-		pr_warning("%s: omap_sr struct for voltdm not found\n",	__func__);
-		return PTR_ERR(sr);
+	if (!sr) {
+		pr_warn("%s: NULL omap_sr from %pF\n", __func__,
+			(void *)_RET_IP_);
+		return -EINVAL;
 	}
 
 	switch (sr->ip_type) {
diff --git a/include/linux/power/smartreflex.h b/include/linux/power/smartreflex.h
index c0f44c2b006d..9c3b9ad17095 100644
--- a/include/linux/power/smartreflex.h
+++ b/include/linux/power/smartreflex.h
@@ -301,8 +301,8 @@ void omap_sr_register_pmic(struct omap_sr_pmic_data *pmic_data);
 /* Smartreflex driver hooks to be called from Smartreflex class driver */
 int sr_enable(struct voltagedomain *voltdm, unsigned long volt);
 void sr_disable(struct voltagedomain *voltdm);
-int sr_configure_errgen(struct voltagedomain *voltdm);
-int sr_disable_errgen(struct voltagedomain *voltdm);
+int sr_configure_errgen(struct omap_sr *sr);
+int sr_disable_errgen(struct omap_sr *sr);
 int sr_configure_minmax(struct voltagedomain *voltdm);
 
 /* API to register the smartreflex class driver with the smartreflex driver */
-- 
cgit v1.3-7-g2ca7


From 6c80573415fe47450579d5d8bfab53b304d803ed Mon Sep 17 00:00:00 2001
From: Andrii Tseglytskyi <andrii.tseglytskyi@ti.com>
Date: Mon, 27 May 2013 14:09:23 +0300
Subject: PM / AVS: SmartReflex: use omap_sr * for minmax interfaces

SmartReflex driver interface is natively divided to two parts:

- external SmartReflex interface
- interface between SmartReflex driver and SmartReflex Class

Functions which belong to AVS class interface can use
struct omap_sr* instead of struct voltatedomain*, to provide a
direct connection between SR driver and SR class. This allows
us to optimize and not do additional lookups where none is
required.

sr_configure_minmax() is interface function between SR driver
and SR class. It is typically used by Class driver to
configure MINMAXAVG module inside SmartReflex module.
Now it takes struct omap_sr* as input parameter.

Signed-off-by: Andrii Tseglytskyi <andrii.tseglytskyi@ti.com>
Acked-by: Nishanth Menon <nm@ti.com>
Signed-off-by: Kevin Hilman <khilman@linaro.org>
---
 drivers/power/avs/smartreflex.c   | 14 +++++++-------
 include/linux/power/smartreflex.h |  2 +-
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/power/avs/smartreflex.c b/drivers/power/avs/smartreflex.c
index fccb62743d19..08a8a299f1fd 100644
--- a/drivers/power/avs/smartreflex.c
+++ b/drivers/power/avs/smartreflex.c
@@ -465,9 +465,9 @@ int sr_disable_errgen(struct omap_sr *sr)
 }
 
 /**
- * sr_configure_minmax() - Configures the smrtreflex to perform AVS using the
+ * sr_configure_minmax() - Configures the SmartReflex to perform AVS using the
  *			 minmaxavg module.
- * @voltdm:	VDD pointer to which the SR module to be configured belongs to.
+ * @sr:			SR module to be configured.
  *
  * This API is to be called from the smartreflex class driver to
  * configure the minmaxavg module inside the smartreflex module.
@@ -476,16 +476,16 @@ int sr_disable_errgen(struct omap_sr *sr)
  * SR CLASS 2 can choose between ERROR module and MINMAXAVG
  * module. Returns 0 on success and error value in case of failure.
  */
-int sr_configure_minmax(struct voltagedomain *voltdm)
+int sr_configure_minmax(struct omap_sr *sr)
 {
 	u32 sr_config, sr_avgwt;
 	u32 senp_en = 0, senn_en = 0;
 	u8 senp_shift, senn_shift;
-	struct omap_sr *sr = _sr_lookup(voltdm);
 
-	if (IS_ERR(sr)) {
-		pr_warning("%s: omap_sr struct for voltdm not found\n",	__func__);
-		return PTR_ERR(sr);
+	if (!sr) {
+		pr_warn("%s: NULL omap_sr from %pF\n", __func__,
+			(void *)_RET_IP_);
+		return -EINVAL;
 	}
 
 	if (!sr->clk_length)
diff --git a/include/linux/power/smartreflex.h b/include/linux/power/smartreflex.h
index 9c3b9ad17095..648be776b661 100644
--- a/include/linux/power/smartreflex.h
+++ b/include/linux/power/smartreflex.h
@@ -303,7 +303,7 @@ int sr_enable(struct voltagedomain *voltdm, unsigned long volt);
 void sr_disable(struct voltagedomain *voltdm);
 int sr_configure_errgen(struct omap_sr *sr);
 int sr_disable_errgen(struct omap_sr *sr);
-int sr_configure_minmax(struct voltagedomain *voltdm);
+int sr_configure_minmax(struct omap_sr *sr);
 
 /* API to register the smartreflex class driver with the smartreflex driver */
 int sr_register_class(struct omap_sr_class_data *class_data);
-- 
cgit v1.3-7-g2ca7


From 299066bb376ef7720cc3d8de95d5b967c5446863 Mon Sep 17 00:00:00 2001
From: Andrii Tseglytskyi <andrii.tseglytskyi@ti.com>
Date: Mon, 27 May 2013 14:09:24 +0300
Subject: PM / AVS: SmartReflex: use omap_sr * for enable/disable interface

SmartReflex driver interface is natively divided to two parts:

- external SmartReflex interface
- interface between SmartReflex driver and SmartReflex Class

Functions which belong to AVS class interface can use
struct omap_sr* instead of struct voltatedomain*, to provide a
direct connection between SR driver and SR class. This allows
us to optimize and not do additional lookups where none is
required.

sr_enable() and sr_disable() are interface functions between
SR driver and SR class. They are typically used by Class driver
to enable/disable SmartReflex hardware module.
Now they take struct omap_sr* as input parameter.

Signed-off-by: Andrii Tseglytskyi <andrii.tseglytskyi@ti.com>
Acked-by: Nishanth Menon <nm@ti.com>
Signed-off-by: Kevin Hilman <khilman@linaro.org>
---
 arch/arm/mach-omap2/smartreflex-class3.c |  4 ++--
 drivers/power/avs/smartreflex.c          | 23 +++++++++++------------
 include/linux/power/smartreflex.h        |  4 ++--
 3 files changed, 15 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-omap2/smartreflex-class3.c b/arch/arm/mach-omap2/smartreflex-class3.c
index 6c26dc15815c..7a42e1960c3b 100644
--- a/arch/arm/mach-omap2/smartreflex-class3.c
+++ b/arch/arm/mach-omap2/smartreflex-class3.c
@@ -26,14 +26,14 @@ static int sr_class3_enable(struct omap_sr *sr)
 	}
 
 	omap_vp_enable(sr->voltdm);
-	return sr_enable(sr->voltdm, volt);
+	return sr_enable(sr, volt);
 }
 
 static int sr_class3_disable(struct omap_sr *sr, int is_volt_reset)
 {
 	sr_disable_errgen(sr);
 	omap_vp_disable(sr->voltdm);
-	sr_disable(sr->voltdm);
+	sr_disable(sr);
 	if (is_volt_reset)
 		voltdm_reset(sr->voltdm);
 
diff --git a/drivers/power/avs/smartreflex.c b/drivers/power/avs/smartreflex.c
index 08a8a299f1fd..14cce7addbb9 100644
--- a/drivers/power/avs/smartreflex.c
+++ b/drivers/power/avs/smartreflex.c
@@ -552,7 +552,7 @@ int sr_configure_minmax(struct omap_sr *sr)
 
 /**
  * sr_enable() - Enables the smartreflex module.
- * @voltdm:	VDD pointer to which the SR module to be configured belongs to.
+ * @sr:		pointer to which the SR module to be configured belongs to.
  * @volt:	The voltage at which the Voltage domain associated with
  *		the smartreflex module is operating at.
  *		This is required only to program the correct Ntarget value.
@@ -561,16 +561,16 @@ int sr_configure_minmax(struct omap_sr *sr)
  * enable a smartreflex module. Returns 0 on success. Returns error
  * value if the voltage passed is wrong or if ntarget value is wrong.
  */
-int sr_enable(struct voltagedomain *voltdm, unsigned long volt)
+int sr_enable(struct omap_sr *sr, unsigned long volt)
 {
 	struct omap_volt_data *volt_data;
-	struct omap_sr *sr = _sr_lookup(voltdm);
 	struct omap_sr_nvalue_table *nvalue_row;
 	int ret;
 
-	if (IS_ERR(sr)) {
-		pr_warning("%s: omap_sr struct for voltdm not found\n",	__func__);
-		return PTR_ERR(sr);
+	if (!sr) {
+		pr_warn("%s: NULL omap_sr from %pF\n", __func__,
+			(void *)_RET_IP_);
+		return -EINVAL;
 	}
 
 	volt_data = omap_voltage_get_voltdata(sr->voltdm, volt);
@@ -612,17 +612,16 @@ int sr_enable(struct voltagedomain *voltdm, unsigned long volt)
 
 /**
  * sr_disable() - Disables the smartreflex module.
- * @voltdm:	VDD pointer to which the SR module to be configured belongs to.
+ * @sr:		pointer to which the SR module to be configured belongs to.
  *
  * This API is to be called from the smartreflex class driver to
  * disable a smartreflex module.
  */
-void sr_disable(struct voltagedomain *voltdm)
+void sr_disable(struct omap_sr *sr)
 {
-	struct omap_sr *sr = _sr_lookup(voltdm);
-
-	if (IS_ERR(sr)) {
-		pr_warning("%s: omap_sr struct for voltdm not found\n",	__func__);
+	if (!sr) {
+		pr_warn("%s: NULL omap_sr from %pF\n", __func__,
+			(void *)_RET_IP_);
 		return;
 	}
 
diff --git a/include/linux/power/smartreflex.h b/include/linux/power/smartreflex.h
index 648be776b661..d8b187c3925d 100644
--- a/include/linux/power/smartreflex.h
+++ b/include/linux/power/smartreflex.h
@@ -299,8 +299,8 @@ void omap_sr_disable_reset_volt(struct voltagedomain *voltdm);
 void omap_sr_register_pmic(struct omap_sr_pmic_data *pmic_data);
 
 /* Smartreflex driver hooks to be called from Smartreflex class driver */
-int sr_enable(struct voltagedomain *voltdm, unsigned long volt);
-void sr_disable(struct voltagedomain *voltdm);
+int sr_enable(struct omap_sr *sr, unsigned long volt);
+void sr_disable(struct omap_sr *sr);
 int sr_configure_errgen(struct omap_sr *sr);
 int sr_disable_errgen(struct omap_sr *sr);
 int sr_configure_minmax(struct omap_sr *sr);
-- 
cgit v1.3-7-g2ca7


From 99f88919f8fa8a8b01b5306c59c9977b94604df8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 12 Mar 2013 16:54:14 -0700
Subject: rcu: Remove srcu_read_lock_raw() and srcu_read_unlock_raw().

These interfaces never did get used, so this commit removes them,
their rcutorture tests, and documentation referencing them.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 Documentation/RCU/checklist.txt |  6 ------
 Documentation/RCU/torture.txt   |  6 ------
 Documentation/RCU/whatisRCU.txt | 22 +++++++--------------
 include/linux/srcu.h            | 43 -----------------------------------------
 kernel/rcutorture.c             | 39 -------------------------------------
 5 files changed, 7 insertions(+), 109 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 79e789b8b8ea..7703ec73a9bb 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -354,12 +354,6 @@ over a rather long period of time, but improvements are always welcome!
 	using RCU rather than SRCU, because RCU is almost always faster
 	and easier to use than is SRCU.
 
-	If you need to enter your read-side critical section in a
-	hardirq or exception handler, and then exit that same read-side
-	critical section in the task that was interrupted, then you need
-	to srcu_read_lock_raw() and srcu_read_unlock_raw(), which avoid
-	the lockdep checking that would otherwise this practice illegal.
-
 	Also unlike other forms of RCU, explicit initialization
 	and cleanup is required via init_srcu_struct() and
 	cleanup_srcu_struct().	These are passed a "struct srcu_struct"
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 7dce8a17eac2..d8a502387397 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -182,12 +182,6 @@ torture_type	The type of RCU to test, with string values as follows:
 		"srcu_expedited": srcu_read_lock(), srcu_read_unlock() and
 			synchronize_srcu_expedited().
 
-		"srcu_raw": srcu_read_lock_raw(), srcu_read_unlock_raw(),
-			and call_srcu().
-
-		"srcu_raw_sync": srcu_read_lock_raw(), srcu_read_unlock_raw(),
-			and synchronize_srcu().
-
 		"sched": preempt_disable(), preempt_enable(), and
 			call_rcu_sched().
 
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 10df0b82f459..0f0fb7c432c2 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -842,9 +842,7 @@ SRCU:	Critical sections	Grace period		Barrier
 
 	srcu_read_lock		synchronize_srcu	srcu_barrier
 	srcu_read_unlock	call_srcu
-	srcu_read_lock_raw	synchronize_srcu_expedited
-	srcu_read_unlock_raw
-	srcu_dereference
+	srcu_dereference	synchronize_srcu_expedited
 
 SRCU:	Initialization/cleanup
 	init_srcu_struct
@@ -865,38 +863,32 @@ list can be helpful:
 
 a.	Will readers need to block?  If so, you need SRCU.
 
-b.	Is it necessary to start a read-side critical section in a
-	hardirq handler or exception handler, and then to complete
-	this read-side critical section in the task that was
-	interrupted?  If so, you need SRCU's srcu_read_lock_raw() and
-	srcu_read_unlock_raw() primitives.
-
-c.	What about the -rt patchset?  If readers would need to block
+b.	What about the -rt patchset?  If readers would need to block
 	in an non-rt kernel, you need SRCU.  If readers would block
 	in a -rt kernel, but not in a non-rt kernel, SRCU is not
 	necessary.
 
-d.	Do you need to treat NMI handlers, hardirq handlers,
+c.	Do you need to treat NMI handlers, hardirq handlers,
 	and code segments with preemption disabled (whether
 	via preempt_disable(), local_irq_save(), local_bh_disable(),
 	or some other mechanism) as if they were explicit RCU readers?
 	If so, RCU-sched is the only choice that will work for you.
 
-e.	Do you need RCU grace periods to complete even in the face
+d.	Do you need RCU grace periods to complete even in the face
 	of softirq monopolization of one or more of the CPUs?  For
 	example, is your code subject to network-based denial-of-service
 	attacks?  If so, you need RCU-bh.
 
-f.	Is your workload too update-intensive for normal use of
+e.	Is your workload too update-intensive for normal use of
 	RCU, but inappropriate for other synchronization mechanisms?
 	If so, consider SLAB_DESTROY_BY_RCU.  But please be careful!
 
-g.	Do you need read-side critical sections that are respected
+f.	Do you need read-side critical sections that are respected
 	even though they are in the middle of the idle loop, during
 	user-mode execution, or on an offlined CPU?  If so, SRCU is the
 	only choice that will work for you.
 
-h.	Otherwise, use RCU.
+g.	Otherwise, use RCU.
 
 Of course, this all assumes that you have determined that RCU is in fact
 the right tool for your job.
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 04f4121a23ae..c114614ed172 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -237,47 +237,4 @@ static inline void srcu_read_unlock(struct srcu_struct *sp, int idx)
 	__srcu_read_unlock(sp, idx);
 }
 
-/**
- * srcu_read_lock_raw - register a new reader for an SRCU-protected structure.
- * @sp: srcu_struct in which to register the new reader.
- *
- * Enter an SRCU read-side critical section.  Similar to srcu_read_lock(),
- * but avoids the RCU-lockdep checking.  This means that it is legal to
- * use srcu_read_lock_raw() in one context, for example, in an exception
- * handler, and then have the matching srcu_read_unlock_raw() in another
- * context, for example in the task that took the exception.
- *
- * However, the entire SRCU read-side critical section must reside within a
- * single task.  For example, beware of using srcu_read_lock_raw() in
- * a device interrupt handler and srcu_read_unlock() in the interrupted
- * task:  This will not work if interrupts are threaded.
- */
-static inline int srcu_read_lock_raw(struct srcu_struct *sp)
-{
-	unsigned long flags;
-	int ret;
-
-	local_irq_save(flags);
-	ret =  __srcu_read_lock(sp);
-	local_irq_restore(flags);
-	return ret;
-}
-
-/**
- * srcu_read_unlock_raw - unregister reader from an SRCU-protected structure.
- * @sp: srcu_struct in which to unregister the old reader.
- * @idx: return value from corresponding srcu_read_lock_raw().
- *
- * Exit an SRCU read-side critical section without lockdep-RCU checking.
- * See srcu_read_lock_raw() for more details.
- */
-static inline void srcu_read_unlock_raw(struct srcu_struct *sp, int idx)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__srcu_read_unlock(sp, idx);
-	local_irq_restore(flags);
-}
-
 #endif
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index e1f3a8c96724..b1fa5510388d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -695,44 +695,6 @@ static struct rcu_torture_ops srcu_sync_ops = {
 	.name		= "srcu_sync"
 };
 
-static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
-{
-	return srcu_read_lock_raw(&srcu_ctl);
-}
-
-static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
-{
-	srcu_read_unlock_raw(&srcu_ctl, idx);
-}
-
-static struct rcu_torture_ops srcu_raw_ops = {
-	.init		= rcu_sync_torture_init,
-	.readlock	= srcu_torture_read_lock_raw,
-	.read_delay	= srcu_read_delay,
-	.readunlock	= srcu_torture_read_unlock_raw,
-	.completed	= srcu_torture_completed,
-	.deferred_free	= srcu_torture_deferred_free,
-	.sync		= srcu_torture_synchronize,
-	.call		= NULL,
-	.cb_barrier	= NULL,
-	.stats		= srcu_torture_stats,
-	.name		= "srcu_raw"
-};
-
-static struct rcu_torture_ops srcu_raw_sync_ops = {
-	.init		= rcu_sync_torture_init,
-	.readlock	= srcu_torture_read_lock_raw,
-	.read_delay	= srcu_read_delay,
-	.readunlock	= srcu_torture_read_unlock_raw,
-	.completed	= srcu_torture_completed,
-	.deferred_free	= rcu_sync_torture_deferred_free,
-	.sync		= srcu_torture_synchronize,
-	.call		= NULL,
-	.cb_barrier	= NULL,
-	.stats		= srcu_torture_stats,
-	.name		= "srcu_raw_sync"
-};
-
 static void srcu_torture_synchronize_expedited(void)
 {
 	synchronize_srcu_expedited(&srcu_ctl);
@@ -1983,7 +1945,6 @@ rcu_torture_init(void)
 		{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
 		  &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
 		  &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops,
-		  &srcu_raw_ops, &srcu_raw_sync_ops,
 		  &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
 
 	mutex_lock(&fullstop_mutex);
-- 
cgit v1.3-7-g2ca7


From 127781d1ba1ee5bbe1780afa35dd0e71583b143d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 27 Mar 2013 08:44:00 -0700
Subject: rcu: Remove TINY_PREEMPT_RCU

TINY_PREEMPT_RCU adds significant code and complexity, but does not
offer commensurate benefits.  People currently using TINY_PREEMPT_RCU
can get much better memory footprint with TINY_RCU, or, if they really
need preemptible RCU, they can use TREE_PREEMPT_RCU with a relatively
minor degradation in memory footprint.  Please note that this move
has been widely publicized on LKML (https://lkml.org/lkml/2012/11/12/545)
and on LWN (http://lwn.net/Articles/541037/).

This commit therefore removes TINY_PREEMPT_RCU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Updated to eliminate #else in rcutiny.h as suggested by Josh ]
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/hardirq.h  |   2 +-
 include/linux/rcupdate.h |   2 +-
 include/linux/rcutiny.h  |  24 +-
 init/Kconfig             |  10 +-
 kernel/rcutiny_plugin.h  | 854 -----------------------------------------------
 5 files changed, 5 insertions(+), 887 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index c1d6555d2567..05bcc0903766 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -128,7 +128,7 @@ extern void synchronize_irq(unsigned int irq);
 # define synchronize_irq(irq)	barrier()
 #endif
 
-#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
+#if defined(CONFIG_TINY_RCU)
 
 static inline void rcu_nmi_enter(void)
 {
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index ddcc7826d907..70b1522badd3 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -277,7 +277,7 @@ void wait_rcu_gp(call_rcu_func_t crf);
 
 #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
 #include <linux/rcutree.h>
-#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
+#elif defined(CONFIG_TINY_RCU)
 #include <linux/rcutiny.h>
 #else
 #error "Unknown RCU implementation specified to kernel configuration"
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 4e56a9c69a35..d3c094ffa960 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -53,16 +53,7 @@ static inline void rcu_barrier(void)
 	rcu_barrier_sched();  /* Only one CPU, so only one list of callbacks! */
 }
 
-#else /* #ifdef CONFIG_TINY_RCU */
-
-void synchronize_rcu_expedited(void);
-
-static inline void rcu_barrier(void)
-{
-	wait_rcu_gp(call_rcu);
-}
-
-#endif /* #else #ifdef CONFIG_TINY_RCU */
+#endif /* #ifdef CONFIG_TINY_RCU */
 
 static inline void synchronize_rcu_bh(void)
 {
@@ -97,18 +88,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 	return 0;
 }
 
-#else /* #ifdef CONFIG_TINY_RCU */
-
-void rcu_preempt_note_context_switch(void);
-int rcu_preempt_needs_cpu(void);
-
-static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
-{
-	*delta_jiffies = ULONG_MAX;
-	return rcu_preempt_needs_cpu();
-}
-
-#endif /* #else #ifdef CONFIG_TINY_RCU */
+#endif /* #ifdef CONFIG_TINY_RCU */
 
 static inline void rcu_note_context_switch(int cpu)
 {
diff --git a/init/Kconfig b/init/Kconfig
index 2d9b83104dcf..e7fb255413d2 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -459,18 +459,10 @@ config TINY_RCU
 	  is not required.  This option greatly reduces the
 	  memory footprint of RCU.
 
-config TINY_PREEMPT_RCU
-	bool "Preemptible UP-only small-memory-footprint RCU"
-	depends on PREEMPT && !SMP
-	help
-	  This option selects the RCU implementation that is designed
-	  for real-time UP systems.  This option greatly reduces the
-	  memory footprint of RCU.
-
 endchoice
 
 config PREEMPT_RCU
-	def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
+	def_bool TREE_PREEMPT_RCU
 	help
 	  This option enables preemptible-RCU code that is common between
 	  the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 8a233002faeb..29a4dd78c8bf 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -102,763 +102,6 @@ static void check_cpu_stalls(void)
 	RCU_TRACE(check_cpu_stall_preempt());
 }
 
-#ifdef CONFIG_TINY_PREEMPT_RCU
-
-#include <linux/delay.h>
-
-/* Global control variables for preemptible RCU. */
-struct rcu_preempt_ctrlblk {
-	struct rcu_ctrlblk rcb;	/* curtail: ->next ptr of last CB for GP. */
-	struct rcu_head **nexttail;
-				/* Tasks blocked in a preemptible RCU */
-				/*  read-side critical section while an */
-				/*  preemptible-RCU grace period is in */
-				/*  progress must wait for a later grace */
-				/*  period.  This pointer points to the */
-				/*  ->next pointer of the last task that */
-				/*  must wait for a later grace period, or */
-				/*  to &->rcb.rcucblist if there is no */
-				/*  such task. */
-	struct list_head blkd_tasks;
-				/* Tasks blocked in RCU read-side critical */
-				/*  section.  Tasks are placed at the head */
-				/*  of this list and age towards the tail. */
-	struct list_head *gp_tasks;
-				/* Pointer to the first task blocking the */
-				/*  current grace period, or NULL if there */
-				/*  is no such task. */
-	struct list_head *exp_tasks;
-				/* Pointer to first task blocking the */
-				/*  current expedited grace period, or NULL */
-				/*  if there is no such task.  If there */
-				/*  is no current expedited grace period, */
-				/*  then there cannot be any such task. */
-#ifdef CONFIG_RCU_BOOST
-	struct list_head *boost_tasks;
-				/* Pointer to first task that needs to be */
-				/*  priority-boosted, or NULL if no priority */
-				/*  boosting is needed.  If there is no */
-				/*  current or expedited grace period, there */
-				/*  can be no such task. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-	u8 gpnum;		/* Current grace period. */
-	u8 gpcpu;		/* Last grace period blocked by the CPU. */
-	u8 completed;		/* Last grace period completed. */
-				/*  If all three are equal, RCU is idle. */
-#ifdef CONFIG_RCU_BOOST
-	unsigned long boost_time; /* When to start boosting (jiffies) */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-#ifdef CONFIG_RCU_TRACE
-	unsigned long n_grace_periods;
-#ifdef CONFIG_RCU_BOOST
-	unsigned long n_tasks_boosted;
-				/* Total number of tasks boosted. */
-	unsigned long n_exp_boosts;
-				/* Number of tasks boosted for expedited GP. */
-	unsigned long n_normal_boosts;
-				/* Number of tasks boosted for normal GP. */
-	unsigned long n_balk_blkd_tasks;
-				/* Refused to boost: no blocked tasks. */
-	unsigned long n_balk_exp_gp_tasks;
-				/* Refused to boost: nothing blocking GP. */
-	unsigned long n_balk_boost_tasks;
-				/* Refused to boost: already boosting. */
-	unsigned long n_balk_notyet;
-				/* Refused to boost: not yet time. */
-	unsigned long n_balk_nos;
-				/* Refused to boost: not sure why, though. */
-				/*  This can happen due to race conditions. */
-#endif /* #ifdef CONFIG_RCU_BOOST */
-#endif /* #ifdef CONFIG_RCU_TRACE */
-};
-
-static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
-	.rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
-	.rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
-	.nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
-	.blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
-	RCU_TRACE(.rcb.name = "rcu_preempt")
-};
-
-static int rcu_preempted_readers_exp(void);
-static void rcu_report_exp_done(void);
-
-/*
- * Return true if the CPU has not yet responded to the current grace period.
- */
-static int rcu_cpu_blocking_cur_gp(void)
-{
-	return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
-}
-
-/*
- * Check for a running RCU reader.  Because there is only one CPU,
- * there can be but one running RCU reader at a time.  ;-)
- *
- * Returns zero if there are no running readers.  Returns a positive
- * number if there is at least one reader within its RCU read-side
- * critical section.  Returns a negative number if an outermost reader
- * is in the midst of exiting from its RCU read-side critical section
- *
- * Returns zero if there are no running readers.  Returns a positive
- * number if there is at least one reader within its RCU read-side
- * critical section.  Returns a negative number if an outermost reader
- * is in the midst of exiting from its RCU read-side critical section.
- */
-static int rcu_preempt_running_reader(void)
-{
-	return current->rcu_read_lock_nesting;
-}
-
-/*
- * Check for preempted RCU readers blocking any grace period.
- * If the caller needs a reliable answer, it must disable hard irqs.
- */
-static int rcu_preempt_blocked_readers_any(void)
-{
-	return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
-}
-
-/*
- * Check for preempted RCU readers blocking the current grace period.
- * If the caller needs a reliable answer, it must disable hard irqs.
- */
-static int rcu_preempt_blocked_readers_cgp(void)
-{
-	return rcu_preempt_ctrlblk.gp_tasks != NULL;
-}
-
-/*
- * Return true if another preemptible-RCU grace period is needed.
- */
-static int rcu_preempt_needs_another_gp(void)
-{
-	return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
-}
-
-/*
- * Return true if a preemptible-RCU grace period is in progress.
- * The caller must disable hardirqs.
- */
-static int rcu_preempt_gp_in_progress(void)
-{
-	return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
-}
-
-/*
- * Advance a ->blkd_tasks-list pointer to the next entry, instead
- * returning NULL if at the end of the list.
- */
-static struct list_head *rcu_next_node_entry(struct task_struct *t)
-{
-	struct list_head *np;
-
-	np = t->rcu_node_entry.next;
-	if (np == &rcu_preempt_ctrlblk.blkd_tasks)
-		np = NULL;
-	return np;
-}
-
-#ifdef CONFIG_RCU_TRACE
-
-#ifdef CONFIG_RCU_BOOST
-static void rcu_initiate_boost_trace(void);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Dump additional statistice for TINY_PREEMPT_RCU.
- */
-static void show_tiny_preempt_stats(struct seq_file *m)
-{
-	seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n",
-		   rcu_preempt_ctrlblk.rcb.qlen,
-		   rcu_preempt_ctrlblk.n_grace_periods,
-		   rcu_preempt_ctrlblk.gpnum,
-		   rcu_preempt_ctrlblk.gpcpu,
-		   rcu_preempt_ctrlblk.completed,
-		   "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)],
-		   "N."[!rcu_preempt_ctrlblk.gp_tasks],
-		   "E."[!rcu_preempt_ctrlblk.exp_tasks]);
-#ifdef CONFIG_RCU_BOOST
-	seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
-		   "             ",
-		   "B."[!rcu_preempt_ctrlblk.boost_tasks],
-		   rcu_preempt_ctrlblk.n_tasks_boosted,
-		   rcu_preempt_ctrlblk.n_exp_boosts,
-		   rcu_preempt_ctrlblk.n_normal_boosts,
-		   (int)(jiffies & 0xffff),
-		   (int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
-	seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
-		   "             balk",
-		   rcu_preempt_ctrlblk.n_balk_blkd_tasks,
-		   rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
-		   rcu_preempt_ctrlblk.n_balk_boost_tasks,
-		   rcu_preempt_ctrlblk.n_balk_notyet,
-		   rcu_preempt_ctrlblk.n_balk_nos);
-#endif /* #ifdef CONFIG_RCU_BOOST */
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-#ifdef CONFIG_RCU_BOOST
-
-#include "rtmutex_common.h"
-
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-
-/* Controls for rcu_kthread() kthread. */
-static struct task_struct *rcu_kthread_task;
-static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
-static unsigned long have_rcu_kthread_work;
-
-/*
- * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
- * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
- */
-static int rcu_boost(void)
-{
-	unsigned long flags;
-	struct rt_mutex mtx;
-	struct task_struct *t;
-	struct list_head *tb;
-
-	if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
-	    rcu_preempt_ctrlblk.exp_tasks == NULL)
-		return 0;  /* Nothing to boost. */
-
-	local_irq_save(flags);
-
-	/*
-	 * Recheck with irqs disabled: all tasks in need of boosting
-	 * might exit their RCU read-side critical sections on their own
-	 * if we are preempted just before disabling irqs.
-	 */
-	if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
-	    rcu_preempt_ctrlblk.exp_tasks == NULL) {
-		local_irq_restore(flags);
-		return 0;
-	}
-
-	/*
-	 * Preferentially boost tasks blocking expedited grace periods.
-	 * This cannot starve the normal grace periods because a second
-	 * expedited grace period must boost all blocked tasks, including
-	 * those blocking the pre-existing normal grace period.
-	 */
-	if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
-		tb = rcu_preempt_ctrlblk.exp_tasks;
-		RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
-	} else {
-		tb = rcu_preempt_ctrlblk.boost_tasks;
-		RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
-	}
-	RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
-
-	/*
-	 * We boost task t by manufacturing an rt_mutex that appears to
-	 * be held by task t.  We leave a pointer to that rt_mutex where
-	 * task t can find it, and task t will release the mutex when it
-	 * exits its outermost RCU read-side critical section.  Then
-	 * simply acquiring this artificial rt_mutex will boost task
-	 * t's priority.  (Thanks to tglx for suggesting this approach!)
-	 */
-	t = container_of(tb, struct task_struct, rcu_node_entry);
-	rt_mutex_init_proxy_locked(&mtx, t);
-	t->rcu_boost_mutex = &mtx;
-	local_irq_restore(flags);
-	rt_mutex_lock(&mtx);
-	rt_mutex_unlock(&mtx);  /* Keep lockdep happy. */
-
-	return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
-	       ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
-}
-
-/*
- * Check to see if it is now time to start boosting RCU readers blocking
- * the current grace period, and, if so, tell the rcu_kthread_task to
- * start boosting them.  If there is an expedited boost in progress,
- * we wait for it to complete.
- *
- * If there are no blocked readers blocking the current grace period,
- * return 0 to let the caller know, otherwise return 1.  Note that this
- * return value is independent of whether or not boosting was done.
- */
-static int rcu_initiate_boost(void)
-{
-	if (!rcu_preempt_blocked_readers_cgp() &&
-	    rcu_preempt_ctrlblk.exp_tasks == NULL) {
-		RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
-		return 0;
-	}
-	if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
-	    (rcu_preempt_ctrlblk.gp_tasks != NULL &&
-	     rcu_preempt_ctrlblk.boost_tasks == NULL &&
-	     ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
-		if (rcu_preempt_ctrlblk.exp_tasks == NULL)
-			rcu_preempt_ctrlblk.boost_tasks =
-				rcu_preempt_ctrlblk.gp_tasks;
-		invoke_rcu_callbacks();
-	} else {
-		RCU_TRACE(rcu_initiate_boost_trace());
-	}
-	return 1;
-}
-
-#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
-
-/*
- * Do priority-boost accounting for the start of a new grace period.
- */
-static void rcu_preempt_boost_start_gp(void)
-{
-	rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
-}
-
-#else /* #ifdef CONFIG_RCU_BOOST */
-
-/*
- * If there is no RCU priority boosting, we don't initiate boosting,
- * but we do indicate whether there are blocked readers blocking the
- * current grace period.
- */
-static int rcu_initiate_boost(void)
-{
-	return rcu_preempt_blocked_readers_cgp();
-}
-
-/*
- * If there is no RCU priority boosting, nothing to do at grace-period start.
- */
-static void rcu_preempt_boost_start_gp(void)
-{
-}
-
-#endif /* else #ifdef CONFIG_RCU_BOOST */
-
-/*
- * Record a preemptible-RCU quiescent state for the specified CPU.  Note
- * that this just means that the task currently running on the CPU is
- * in a quiescent state.  There might be any number of tasks blocked
- * while in an RCU read-side critical section.
- *
- * Unlike the other rcu_*_qs() functions, callers to this function
- * must disable irqs in order to protect the assignment to
- * ->rcu_read_unlock_special.
- *
- * Because this is a single-CPU implementation, the only way a grace
- * period can end is if the CPU is in a quiescent state.  The reason is
- * that a blocked preemptible-RCU reader can exit its critical section
- * only if the CPU is running it at the time.  Therefore, when the
- * last task blocking the current grace period exits its RCU read-side
- * critical section, neither the CPU nor blocked tasks will be stopping
- * the current grace period.  (In contrast, SMP implementations
- * might have CPUs running in RCU read-side critical sections that
- * block later grace periods -- but this is not possible given only
- * one CPU.)
- */
-static void rcu_preempt_cpu_qs(void)
-{
-	/* Record both CPU and task as having responded to current GP. */
-	rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
-	current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
-
-	/* If there is no GP then there is nothing more to do.  */
-	if (!rcu_preempt_gp_in_progress())
-		return;
-	/*
-	 * Check up on boosting.  If there are readers blocking the
-	 * current grace period, leave.
-	 */
-	if (rcu_initiate_boost())
-		return;
-
-	/* Advance callbacks. */
-	rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
-	rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
-	rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
-
-	/* If there are no blocked readers, next GP is done instantly. */
-	if (!rcu_preempt_blocked_readers_any())
-		rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
-
-	/* If there are done callbacks, cause them to be invoked. */
-	if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
-		invoke_rcu_callbacks();
-}
-
-/*
- * Start a new RCU grace period if warranted.  Hard irqs must be disabled.
- */
-static void rcu_preempt_start_gp(void)
-{
-	if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
-
-		/* Official start of GP. */
-		rcu_preempt_ctrlblk.gpnum++;
-		RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++);
-		reset_cpu_stall_ticks(&rcu_preempt_ctrlblk.rcb);
-
-		/* Any blocked RCU readers block new GP. */
-		if (rcu_preempt_blocked_readers_any())
-			rcu_preempt_ctrlblk.gp_tasks =
-				rcu_preempt_ctrlblk.blkd_tasks.next;
-
-		/* Set up for RCU priority boosting. */
-		rcu_preempt_boost_start_gp();
-
-		/* If there is no running reader, CPU is done with GP. */
-		if (!rcu_preempt_running_reader())
-			rcu_preempt_cpu_qs();
-	}
-}
-
-/*
- * We have entered the scheduler, and the current task might soon be
- * context-switched away from.  If this task is in an RCU read-side
- * critical section, we will no longer be able to rely on the CPU to
- * record that fact, so we enqueue the task on the blkd_tasks list.
- * If the task started after the current grace period began, as recorded
- * by ->gpcpu, we enqueue at the beginning of the list.  Otherwise
- * before the element referenced by ->gp_tasks (or at the tail if
- * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
- * The task will dequeue itself when it exits the outermost enclosing
- * RCU read-side critical section.  Therefore, the current grace period
- * cannot be permitted to complete until the ->gp_tasks pointer becomes
- * NULL.
- *
- * Caller must disable preemption.
- */
-void rcu_preempt_note_context_switch(void)
-{
-	struct task_struct *t = current;
-	unsigned long flags;
-
-	local_irq_save(flags); /* must exclude scheduler_tick(). */
-	if (rcu_preempt_running_reader() > 0 &&
-	    (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
-
-		/* Possibly blocking in an RCU read-side critical section. */
-		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
-
-		/*
-		 * If this CPU has already checked in, then this task
-		 * will hold up the next grace period rather than the
-		 * current grace period.  Queue the task accordingly.
-		 * If the task is queued for the current grace period
-		 * (i.e., this CPU has not yet passed through a quiescent
-		 * state for the current grace period), then as long
-		 * as that task remains queued, the current grace period
-		 * cannot end.
-		 */
-		list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
-		if (rcu_cpu_blocking_cur_gp())
-			rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
-	} else if (rcu_preempt_running_reader() < 0 &&
-		   t->rcu_read_unlock_special) {
-		/*
-		 * Complete exit from RCU read-side critical section on
-		 * behalf of preempted instance of __rcu_read_unlock().
-		 */
-		rcu_read_unlock_special(t);
-	}
-
-	/*
-	 * Either we were not in an RCU read-side critical section to
-	 * begin with, or we have now recorded that critical section
-	 * globally.  Either way, we can now note a quiescent state
-	 * for this CPU.  Again, if we were in an RCU read-side critical
-	 * section, and if that critical section was blocking the current
-	 * grace period, then the fact that the task has been enqueued
-	 * means that current grace period continues to be blocked.
-	 */
-	rcu_preempt_cpu_qs();
-	local_irq_restore(flags);
-}
-
-/*
- * Handle special cases during rcu_read_unlock(), such as needing to
- * notify RCU core processing or task having blocked during the RCU
- * read-side critical section.
- */
-void rcu_read_unlock_special(struct task_struct *t)
-{
-	int empty;
-	int empty_exp;
-	unsigned long flags;
-	struct list_head *np;
-#ifdef CONFIG_RCU_BOOST
-	struct rt_mutex *rbmp = NULL;
-#endif /* #ifdef CONFIG_RCU_BOOST */
-	int special;
-
-	/*
-	 * NMI handlers cannot block and cannot safely manipulate state.
-	 * They therefore cannot possibly be special, so just leave.
-	 */
-	if (in_nmi())
-		return;
-
-	local_irq_save(flags);
-
-	/*
-	 * If RCU core is waiting for this CPU to exit critical section,
-	 * let it know that we have done so.
-	 */
-	special = t->rcu_read_unlock_special;
-	if (special & RCU_READ_UNLOCK_NEED_QS)
-		rcu_preempt_cpu_qs();
-
-	/* Hardware IRQ handlers cannot block. */
-	if (in_irq() || in_serving_softirq()) {
-		local_irq_restore(flags);
-		return;
-	}
-
-	/* Clean up if blocked during RCU read-side critical section. */
-	if (special & RCU_READ_UNLOCK_BLOCKED) {
-		t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
-
-		/*
-		 * Remove this task from the ->blkd_tasks list and adjust
-		 * any pointers that might have been referencing it.
-		 */
-		empty = !rcu_preempt_blocked_readers_cgp();
-		empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
-		np = rcu_next_node_entry(t);
-		list_del_init(&t->rcu_node_entry);
-		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
-			rcu_preempt_ctrlblk.gp_tasks = np;
-		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
-			rcu_preempt_ctrlblk.exp_tasks = np;
-#ifdef CONFIG_RCU_BOOST
-		if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
-			rcu_preempt_ctrlblk.boost_tasks = np;
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
-		/*
-		 * If this was the last task on the current list, and if
-		 * we aren't waiting on the CPU, report the quiescent state
-		 * and start a new grace period if needed.
-		 */
-		if (!empty && !rcu_preempt_blocked_readers_cgp()) {
-			rcu_preempt_cpu_qs();
-			rcu_preempt_start_gp();
-		}
-
-		/*
-		 * If this was the last task on the expedited lists,
-		 * then we need wake up the waiting task.
-		 */
-		if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
-			rcu_report_exp_done();
-	}
-#ifdef CONFIG_RCU_BOOST
-	/* Unboost self if was boosted. */
-	if (t->rcu_boost_mutex != NULL) {
-		rbmp = t->rcu_boost_mutex;
-		t->rcu_boost_mutex = NULL;
-		rt_mutex_unlock(rbmp);
-	}
-#endif /* #ifdef CONFIG_RCU_BOOST */
-	local_irq_restore(flags);
-}
-
-/*
- * Check for a quiescent state from the current CPU.  When a task blocks,
- * the task is recorded in the rcu_preempt_ctrlblk structure, which is
- * checked elsewhere.  This is called from the scheduling-clock interrupt.
- *
- * Caller must disable hard irqs.
- */
-static void rcu_preempt_check_callbacks(void)
-{
-	struct task_struct *t = current;
-
-	if (rcu_preempt_gp_in_progress() &&
-	    (!rcu_preempt_running_reader() ||
-	     !rcu_cpu_blocking_cur_gp()))
-		rcu_preempt_cpu_qs();
-	if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
-	    rcu_preempt_ctrlblk.rcb.donetail)
-		invoke_rcu_callbacks();
-	if (rcu_preempt_gp_in_progress() &&
-	    rcu_cpu_blocking_cur_gp() &&
-	    rcu_preempt_running_reader() > 0)
-		t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
-}
-
-/*
- * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
- * update, so this is invoked from rcu_process_callbacks() to
- * handle that case.  Of course, it is invoked for all flavors of
- * RCU, but RCU callbacks can appear only on one of the lists, and
- * neither ->nexttail nor ->donetail can possibly be NULL, so there
- * is no need for an explicit check.
- */
-static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
-{
-	if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
-		rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
-}
-
-/*
- * Process callbacks for preemptible RCU.
- */
-static void rcu_preempt_process_callbacks(void)
-{
-	__rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
-}
-
-/*
- * Queue a preemptible -RCU callback for invocation after a grace period.
- */
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-
-	debug_rcu_head_queue(head);
-	head->func = func;
-	head->next = NULL;
-
-	local_irq_save(flags);
-	*rcu_preempt_ctrlblk.nexttail = head;
-	rcu_preempt_ctrlblk.nexttail = &head->next;
-	RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++);
-	rcu_preempt_start_gp();  /* checks to see if GP needed. */
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/*
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void synchronize_rcu(void)
-{
-	rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
-			   !lock_is_held(&rcu_lock_map) &&
-			   !lock_is_held(&rcu_sched_lock_map),
-			   "Illegal synchronize_rcu() in RCU read-side critical section");
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	if (!rcu_scheduler_active)
-		return;
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-
-	WARN_ON_ONCE(rcu_preempt_running_reader());
-	if (!rcu_preempt_blocked_readers_any())
-		return;
-
-	/* Once we get past the fastpath checks, same code as rcu_barrier(). */
-	if (rcu_expedited)
-		synchronize_rcu_expedited();
-	else
-		rcu_barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu);
-
-static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static unsigned long sync_rcu_preempt_exp_count;
-static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
-
-/*
- * Return non-zero if there are any tasks in RCU read-side critical
- * sections blocking the current preemptible-RCU expedited grace period.
- * If there is no preemptible-RCU expedited grace period currently in
- * progress, returns zero unconditionally.
- */
-static int rcu_preempted_readers_exp(void)
-{
-	return rcu_preempt_ctrlblk.exp_tasks != NULL;
-}
-
-/*
- * Report the exit from RCU read-side critical section for the last task
- * that queued itself during or before the current expedited preemptible-RCU
- * grace period.
- */
-static void rcu_report_exp_done(void)
-{
-	wake_up(&sync_rcu_preempt_exp_wq);
-}
-
-/*
- * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
- * is to rely in the fact that there is but one CPU, and that it is
- * illegal for a task to invoke synchronize_rcu_expedited() while in a
- * preemptible-RCU read-side critical section.  Therefore, any such
- * critical sections must correspond to blocked tasks, which must therefore
- * be on the ->blkd_tasks list.  So just record the current head of the
- * list in the ->exp_tasks pointer, and wait for all tasks including and
- * after the task pointed to by ->exp_tasks to drain.
- */
-void synchronize_rcu_expedited(void)
-{
-	unsigned long flags;
-	struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
-	unsigned long snap;
-
-	barrier(); /* ensure prior action seen before grace period. */
-
-	WARN_ON_ONCE(rcu_preempt_running_reader());
-
-	/*
-	 * Acquire lock so that there is only one preemptible RCU grace
-	 * period in flight.  Of course, if someone does the expedited
-	 * grace period for us while we are acquiring the lock, just leave.
-	 */
-	snap = sync_rcu_preempt_exp_count + 1;
-	mutex_lock(&sync_rcu_preempt_exp_mutex);
-	if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
-		goto unlock_mb_ret; /* Others did our work for us. */
-
-	local_irq_save(flags);
-
-	/*
-	 * All RCU readers have to already be on blkd_tasks because
-	 * we cannot legally be executing in an RCU read-side critical
-	 * section.
-	 */
-
-	/* Snapshot current head of ->blkd_tasks list. */
-	rpcp->exp_tasks = rpcp->blkd_tasks.next;
-	if (rpcp->exp_tasks == &rpcp->blkd_tasks)
-		rpcp->exp_tasks = NULL;
-
-	/* Wait for tail of ->blkd_tasks list to drain. */
-	if (!rcu_preempted_readers_exp()) {
-		local_irq_restore(flags);
-	} else {
-		rcu_initiate_boost();
-		local_irq_restore(flags);
-		wait_event(sync_rcu_preempt_exp_wq,
-			   !rcu_preempted_readers_exp());
-	}
-
-	/* Clean up and exit. */
-	barrier(); /* ensure expedited GP seen before counter increment. */
-	sync_rcu_preempt_exp_count++;
-unlock_mb_ret:
-	mutex_unlock(&sync_rcu_preempt_exp_mutex);
-	barrier(); /* ensure subsequent action seen after grace period. */
-}
-EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
-
-/*
- * Does preemptible RCU need the CPU to stay out of dynticks mode?
- */
-int rcu_preempt_needs_cpu(void)
-{
-	return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
-}
-
-#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
-
 #ifdef CONFIG_RCU_TRACE
 
 /*
@@ -895,79 +138,6 @@ static void rcu_preempt_process_callbacks(void)
 {
 }
 
-#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
-
-#ifdef CONFIG_RCU_BOOST
-
-/*
- * Wake up rcu_kthread() to process callbacks now eligible for invocation
- * or to boost readers.
- */
-static void invoke_rcu_callbacks(void)
-{
-	have_rcu_kthread_work = 1;
-	if (rcu_kthread_task != NULL)
-		wake_up(&rcu_kthread_wq);
-}
-
-#ifdef CONFIG_RCU_TRACE
-
-/*
- * Is the current CPU running the RCU-callbacks kthread?
- * Caller must have preemption disabled.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
-	return rcu_kthread_task == current;
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-/*
- * This kthread invokes RCU callbacks whose grace periods have
- * elapsed.  It is awakened as needed, and takes the place of the
- * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
- * This is a kthread, but it is never stopped, at least not until
- * the system goes down.
- */
-static int rcu_kthread(void *arg)
-{
-	unsigned long work;
-	unsigned long morework;
-	unsigned long flags;
-
-	for (;;) {
-		wait_event_interruptible(rcu_kthread_wq,
-					 have_rcu_kthread_work != 0);
-		morework = rcu_boost();
-		local_irq_save(flags);
-		work = have_rcu_kthread_work;
-		have_rcu_kthread_work = morework;
-		local_irq_restore(flags);
-		if (work)
-			rcu_process_callbacks(NULL);
-		schedule_timeout_interruptible(1); /* Leave CPU for others. */
-	}
-
-	return 0;  /* Not reached, but needed to shut gcc up. */
-}
-
-/*
- * Spawn the kthread that invokes RCU callbacks.
- */
-static int __init rcu_spawn_kthreads(void)
-{
-	struct sched_param sp;
-
-	rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
-	sp.sched_priority = RCU_BOOST_PRIO;
-	sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
-	return 0;
-}
-early_initcall(rcu_spawn_kthreads);
-
-#else /* #ifdef CONFIG_RCU_BOOST */
-
 /* Hold off callback invocation until early_initcall() time. */
 static int rcu_scheduler_fully_active __read_mostly;
 
@@ -1001,8 +171,6 @@ static int __init rcu_scheduler_really_started(void)
 }
 early_initcall(rcu_scheduler_really_started);
 
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #include <linux/kernel_stat.h>
 
@@ -1020,25 +188,6 @@ void __init rcu_scheduler_starting(void)
 
 #ifdef CONFIG_RCU_TRACE
 
-#ifdef CONFIG_RCU_BOOST
-
-static void rcu_initiate_boost_trace(void)
-{
-	if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
-		rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
-	else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
-		 rcu_preempt_ctrlblk.exp_tasks == NULL)
-		rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
-	else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
-		rcu_preempt_ctrlblk.n_balk_boost_tasks++;
-	else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
-		rcu_preempt_ctrlblk.n_balk_notyet++;
-	else
-		rcu_preempt_ctrlblk.n_balk_nos++;
-}
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
 static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
 {
 	unsigned long flags;
@@ -1105,9 +254,6 @@ MODULE_LICENSE("GPL");
 
 static void check_cpu_stall_preempt(void)
 {
-#ifdef CONFIG_TINY_PREEMPT_RCU
-	check_cpu_stall(&rcu_preempt_ctrlblk.rcb);
-#endif /* #ifdef CONFIG_TINY_PREEMPT_RCU */
 }
 
 #endif /* #ifdef CONFIG_RCU_TRACE */
-- 
cgit v1.3-7-g2ca7


From 9dc5ad32488a75504349372330cc228d4dd678db Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 27 Mar 2013 10:11:15 -0700
Subject: rcu: Simplify RCU_TINY RCU callback invocation

TINY_PREEMPT_RCU could use a kthread to handle RCU callback invocation,
which required an API to abstract kthread vs. softirq invocation.
Now that TINY_PREEMPT_RCU is no longer with us, this commit retires
this API in favor of direct use of the relevant softirq primitives.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/rcupdate.h |  1 +
 include/linux/rcutiny.h  |  4 ----
 include/linux/rcutree.h  |  1 -
 kernel/rcutiny.c         | 14 +++++++++-----
 kernel/rcutiny_plugin.h  | 33 ---------------------------------
 5 files changed, 10 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 70b1522badd3..257b50f9d2dc 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -216,6 +216,7 @@ static inline int rcu_preempt_depth(void)
 #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
 
 /* Internal to kernel */
+extern void rcu_init(void);
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
 extern void rcu_check_callbacks(int cpu, int user);
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index d3c094ffa960..e756251b39bc 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,10 +27,6 @@
 
 #include <linux/cache.h>
 
-static inline void rcu_init(void)
-{
-}
-
 static inline void rcu_barrier_bh(void)
 {
 	wait_rcu_gp(call_rcu_bh);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 952b79339304..3f1aa8f98666 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,7 +30,6 @@
 #ifndef __LINUX_RCUTREE_H
 #define __LINUX_RCUTREE_H
 
-extern void rcu_init(void);
 extern void rcu_note_context_switch(int cpu);
 extern int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies);
 extern void rcu_cpu_stall_reset(void);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7fc2339b0859..4adc9e26da34 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,7 +44,6 @@
 
 /* Forward declarations for rcutiny_plugin.h. */
 struct rcu_ctrlblk;
-static void invoke_rcu_callbacks(void);
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
 static void rcu_process_callbacks(struct softirq_action *unused);
 static void __call_rcu(struct rcu_head *head,
@@ -227,7 +226,7 @@ void rcu_sched_qs(int cpu)
 	local_irq_save(flags);
 	if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
 	    rcu_qsctr_help(&rcu_bh_ctrlblk))
-		invoke_rcu_callbacks();
+		raise_softirq(RCU_SOFTIRQ);
 	local_irq_restore(flags);
 }
 
@@ -240,7 +239,7 @@ void rcu_bh_qs(int cpu)
 
 	local_irq_save(flags);
 	if (rcu_qsctr_help(&rcu_bh_ctrlblk))
-		invoke_rcu_callbacks();
+		raise_softirq(RCU_SOFTIRQ);
 	local_irq_restore(flags);
 }
 
@@ -277,7 +276,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 					      ACCESS_ONCE(rcp->rcucblist),
 					      need_resched(),
 					      is_idle_task(current),
-					      rcu_is_callbacks_kthread()));
+					      false));
 		return;
 	}
 
@@ -307,7 +306,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 	RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
 	RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
 				      is_idle_task(current),
-				      rcu_is_callbacks_kthread()));
+				      false));
 }
 
 static void rcu_process_callbacks(struct softirq_action *unused)
@@ -379,3 +378,8 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 	__call_rcu(head, func, &rcu_bh_ctrlblk);
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+void rcu_init(void)
+{
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index bfe992407803..36fd83c544c8 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -102,39 +102,6 @@ static void check_cpu_stalls(void)
 	RCU_TRACE(check_cpu_stall_preempt());
 }
 
-/* Hold off callback invocation until early_initcall() time. */
-static int rcu_scheduler_fully_active __read_mostly;
-
-/*
- * Start up softirq processing of callbacks.
- */
-void invoke_rcu_callbacks(void)
-{
-	if (rcu_scheduler_fully_active)
-		raise_softirq(RCU_SOFTIRQ);
-}
-
-#ifdef CONFIG_RCU_TRACE
-
-/*
- * There is no callback kthread, so this thread is never it.
- */
-static bool rcu_is_callbacks_kthread(void)
-{
-	return false;
-}
-
-#endif /* #ifdef CONFIG_RCU_TRACE */
-
-static int __init rcu_scheduler_really_started(void)
-{
-	rcu_scheduler_fully_active = 1;
-	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
-	raise_softirq(RCU_SOFTIRQ);  /* Invoke any callbacks from early boot. */
-	return 0;
-}
-early_initcall(rcu_scheduler_really_started);
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #include <linux/kernel_stat.h>
 
-- 
cgit v1.3-7-g2ca7


From 57f1801a11d5a5313990ac56f5072e6be36994c3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 15 Apr 2013 08:25:27 -0700
Subject: rcu: Remove the CONFIG_TINY_RCU ifdefs in rcutiny.h

Now that CONFIG_TINY_PREEMPT_RCU is no more, this commit removes
the CONFIG_TINY_RCU ifdefs from include/linux/rcutiny.h in favor of
unconditionally compiling the CONFIG_TINY_RCU legs of those ifdefs.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Moved removal of #else to "Remove TINY_PREEMPT_RCU" as
  suggested by Josh Triplett. ]
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/rcutiny.h | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index e756251b39bc..07b5affb92fa 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -37,8 +37,6 @@ static inline void rcu_barrier_sched(void)
 	wait_rcu_gp(call_rcu_sched);
 }
 
-#ifdef CONFIG_TINY_RCU
-
 static inline void synchronize_rcu_expedited(void)
 {
 	synchronize_sched();	/* Only one CPU, so pretty fast anyway!!! */
@@ -49,8 +47,6 @@ static inline void rcu_barrier(void)
 	rcu_barrier_sched();  /* Only one CPU, so only one list of callbacks! */
 }
 
-#endif /* #ifdef CONFIG_TINY_RCU */
-
 static inline void synchronize_rcu_bh(void)
 {
 	synchronize_sched();
@@ -72,8 +68,6 @@ static inline void kfree_call_rcu(struct rcu_head *head,
 	call_rcu(head, func);
 }
 
-#ifdef CONFIG_TINY_RCU
-
 static inline void rcu_preempt_note_context_switch(void)
 {
 }
@@ -84,8 +78,6 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 	return 0;
 }
 
-#endif /* #ifdef CONFIG_TINY_RCU */
-
 static inline void rcu_note_context_switch(int cpu)
 {
 	rcu_sched_qs(cpu);
-- 
cgit v1.3-7-g2ca7


From fa2b3b0a50949537ac95a521b3546e9a87dc0565 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 27 Mar 2013 10:32:00 -0700
Subject: rcu: Remove rcu_preempt_note_context_switch()

With the removal of CONFIG_TINY_PREEMPT_RCU, rcu_preempt_note_context_switch()
is now an empty function.  This commit therefore eliminates it by inlining it.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/rcutiny.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 07b5affb92fa..51230b63be93 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -68,10 +68,6 @@ static inline void kfree_call_rcu(struct rcu_head *head,
 	call_rcu(head, func);
 }
 
-static inline void rcu_preempt_note_context_switch(void)
-{
-}
-
 static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 {
 	*delta_jiffies = ULONG_MAX;
@@ -81,7 +77,6 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
 static inline void rcu_note_context_switch(int cpu)
 {
 	rcu_sched_qs(cpu);
-	rcu_preempt_note_context_switch();
 }
 
 /*
-- 
cgit v1.3-7-g2ca7


From 2439b696cb5303f1eeb6aeebcee19e0056c3dd6e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 11 Apr 2013 10:15:52 -0700
Subject: rcu: Shrink TINY_RCU by moving exit_rcu()

Now that TINY_PREEMPT_RCU is no more, exit_rcu() is always an empty
function.  But if TINY_RCU is going to have an empty function, it should
be in include/linux/rcutiny.h, where it does not bloat the kernel.
This commit therefore moves exit_rcu() out of kernel/rcupdate.c to
kernel/rcutree_plugin.h, and places a static inline empty function in
include/linux/rcutiny.h in order to shrink TINY_RCU a bit.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
---
 include/linux/rcupdate.h |  2 --
 include/linux/rcutiny.h  |  4 ++++
 include/linux/rcutree.h  |  2 ++
 kernel/rcupdate.c        | 26 +-------------------------
 kernel/rcutree_plugin.h  | 26 ++++++++++++++++++++++++++
 5 files changed, 33 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 257b50f9d2dc..4b14bdc911d7 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -240,8 +240,6 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev,
 					 struct task_struct *next) { }
 #endif /* CONFIG_RCU_USER_QS */
 
-extern void exit_rcu(void);
-
 /**
  * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers
  * @a: Code that RCU needs to pay attention to.
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 51230b63be93..e31005ee339e 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -119,6 +119,10 @@ static inline void rcu_cpu_stall_reset(void)
 {
 }
 
+static inline void exit_rcu(void)
+{
+}
+
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern int rcu_scheduler_active __read_mostly;
 extern void rcu_scheduler_starting(void);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 3f1aa8f98666..226169d1bd2b 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -85,6 +85,8 @@ extern void rcu_force_quiescent_state(void);
 extern void rcu_bh_force_quiescent_state(void);
 extern void rcu_sched_force_quiescent_state(void);
 
+extern void exit_rcu(void);
+
 extern void rcu_scheduler_starting(void);
 extern int rcu_scheduler_active __read_mostly;
 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 48ab70384a4c..0be1fa2ea521 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -104,31 +104,7 @@ void __rcu_read_unlock(void)
 }
 EXPORT_SYMBOL_GPL(__rcu_read_unlock);
 
-/*
- * Check for a task exiting while in a preemptible-RCU read-side
- * critical section, clean up if so.  No need to issue warnings,
- * as debug_check_no_locks_held() already does this if lockdep
- * is enabled.
- */
-void exit_rcu(void)
-{
-	struct task_struct *t = current;
-
-	if (likely(list_empty(&current->rcu_node_entry)))
-		return;
-	t->rcu_read_lock_nesting = 1;
-	barrier();
-	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
-	__rcu_read_unlock();
-}
-
-#else /* #ifdef CONFIG_PREEMPT_RCU */
-
-void exit_rcu(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 207844ea0226..de701bbdb624 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -932,6 +932,24 @@ static void __init __rcu_init_preempt(void)
 	rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
 }
 
+/*
+ * Check for a task exiting while in a preemptible-RCU read-side
+ * critical section, clean up if so.  No need to issue warnings,
+ * as debug_check_no_locks_held() already does this if lockdep
+ * is enabled.
+ */
+void exit_rcu(void)
+{
+	struct task_struct *t = current;
+
+	if (likely(list_empty(&current->rcu_node_entry)))
+		return;
+	t->rcu_read_lock_nesting = 1;
+	barrier();
+	t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED;
+	__rcu_read_unlock();
+}
+
 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 
 static struct rcu_state *rcu_state = &rcu_sched_state;
@@ -1100,6 +1118,14 @@ static void __init __rcu_init_preempt(void)
 {
 }
 
+/*
+ * Because preemptible RCU does not exist, tasks cannot possibly exit
+ * while in preemptible RCU read-side critical sections.
+ */
+void exit_rcu(void)
+{
+}
+
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
 
 #ifdef CONFIG_RCU_BOOST
-- 
cgit v1.3-7-g2ca7


From 43ab0476a648053e5998bf081f47f215375a4502 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sun, 2 Jun 2013 14:56:07 +0200
Subject: efi: Convert runtime services function ptrs

... to void * like the boot services and lose all the void * casts. No
functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/include/asm/efi.h | 28 ++++++++++++++--------------
 include/linux/efi.h        | 28 ++++++++++++++--------------
 2 files changed, 28 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 2fb5d5884e23..5b33686b6995 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -52,40 +52,40 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
 		     u64 arg4, u64 arg5, u64 arg6);
 
 #define efi_call_phys0(f)			\
-	efi_call0((void *)(f))
+	efi_call0((f))
 #define efi_call_phys1(f, a1)			\
-	efi_call1((void *)(f), (u64)(a1))
+	efi_call1((f), (u64)(a1))
 #define efi_call_phys2(f, a1, a2)			\
-	efi_call2((void *)(f), (u64)(a1), (u64)(a2))
+	efi_call2((f), (u64)(a1), (u64)(a2))
 #define efi_call_phys3(f, a1, a2, a3)				\
-	efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3))
+	efi_call3((f), (u64)(a1), (u64)(a2), (u64)(a3))
 #define efi_call_phys4(f, a1, a2, a3, a4)				\
-	efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3),		\
+	efi_call4((f), (u64)(a1), (u64)(a2), (u64)(a3),		\
 		  (u64)(a4))
 #define efi_call_phys5(f, a1, a2, a3, a4, a5)				\
-	efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3),		\
+	efi_call5((f), (u64)(a1), (u64)(a2), (u64)(a3),		\
 		  (u64)(a4), (u64)(a5))
 #define efi_call_phys6(f, a1, a2, a3, a4, a5, a6)			\
-	efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3),		\
+	efi_call6((f), (u64)(a1), (u64)(a2), (u64)(a3),		\
 		  (u64)(a4), (u64)(a5), (u64)(a6))
 
 #define efi_call_virt0(f)				\
-	efi_call0((void *)(efi.systab->runtime->f))
+	efi_call0((efi.systab->runtime->f))
 #define efi_call_virt1(f, a1)					\
-	efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
+	efi_call1((efi.systab->runtime->f), (u64)(a1))
 #define efi_call_virt2(f, a1, a2)					\
-	efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
+	efi_call2((efi.systab->runtime->f), (u64)(a1), (u64)(a2))
 #define efi_call_virt3(f, a1, a2, a3)					\
-	efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+	efi_call3((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
 		  (u64)(a3))
 #define efi_call_virt4(f, a1, a2, a3, a4)				\
-	efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+	efi_call4((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
 		  (u64)(a3), (u64)(a4))
 #define efi_call_virt5(f, a1, a2, a3, a4, a5)				\
-	efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+	efi_call5((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
 		  (u64)(a3), (u64)(a4), (u64)(a5))
 #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6)			\
-	efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
+	efi_call6((efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
 		  (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
 
 extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 2bc0ad78d058..21ae6b3c0359 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -287,20 +287,20 @@ typedef struct {
 
 typedef struct {
 	efi_table_hdr_t hdr;
-	unsigned long get_time;
-	unsigned long set_time;
-	unsigned long get_wakeup_time;
-	unsigned long set_wakeup_time;
-	unsigned long set_virtual_address_map;
-	unsigned long convert_pointer;
-	unsigned long get_variable;
-	unsigned long get_next_variable;
-	unsigned long set_variable;
-	unsigned long get_next_high_mono_count;
-	unsigned long reset_system;
-	unsigned long update_capsule;
-	unsigned long query_capsule_caps;
-	unsigned long query_variable_info;
+	void *get_time;
+	void *set_time;
+	void *get_wakeup_time;
+	void *set_wakeup_time;
+	void *set_virtual_address_map;
+	void *convert_pointer;
+	void *get_variable;
+	void *get_next_variable;
+	void *set_variable;
+	void *get_next_high_mono_count;
+	void *reset_system;
+	void *update_capsule;
+	void *query_capsule_caps;
+	void *query_variable_info;
 } efi_runtime_services_t;
 
 typedef efi_status_t efi_get_time_t (efi_time_t *tm, efi_time_cap_t *tc);
-- 
cgit v1.3-7-g2ca7


From 82467a5a885ddd9f80309682159da8db510e7832 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Fri, 7 Jun 2013 21:53:09 +0000
Subject: cpuidle: simplify multiple driver support

Commit bf4d1b5 (cpuidle: support multiple drivers) introduced support
for using multiple cpuidle drivers at the same time.  It added a
couple of new APIs to register the driver per CPU, but that led to
some unnecessary code complexity related to the kernel config options
deciding whether or not the multiple driver support is enabled.  The
code has to work as it did before when the multiple driver support is
not enabled and the multiple driver support has to be compatible with
the previously existing API.

Remove the new API, not used by any driver in the tree yet (but
needed for the HMP cpuidle drivers that will be submitted soon), and
add a new cpumask pointer to the cpuidle driver structure that will
point to the mask of CPUs handled by the given driver.  That will
allow the cpuidle_[un]register_driver() API to be used for the
multiple driver support along with the cpuidle_[un]register()
functions added recently.

[rjw: Changelog]
Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle.c |   4 +-
 drivers/cpuidle/driver.c  | 192 +++++++++++++++++++---------------------------
 include/linux/cpuidle.h   |   6 +-
 3 files changed, 84 insertions(+), 118 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index c3a93fece819..fdc432f18022 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -466,7 +466,7 @@ void cpuidle_unregister(struct cpuidle_driver *drv)
 	int cpu;
 	struct cpuidle_device *device;
 
-	for_each_possible_cpu(cpu) {
+	for_each_cpu(cpu, drv->cpumask) {
 		device = &per_cpu(cpuidle_dev, cpu);
 		cpuidle_unregister_device(device);
 	}
@@ -498,7 +498,7 @@ int cpuidle_register(struct cpuidle_driver *drv,
 		return ret;
 	}
 
-	for_each_possible_cpu(cpu) {
+	for_each_cpu(cpu, drv->cpumask) {
 		device = &per_cpu(cpuidle_dev, cpu);
 		device->cpu = cpu;
 
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index 8dfaaae94444..e75fa5472a91 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -18,167 +18,140 @@
 
 DEFINE_SPINLOCK(cpuidle_driver_lock);
 
-static void __cpuidle_set_cpu_driver(struct cpuidle_driver *drv, int cpu);
-static struct cpuidle_driver * __cpuidle_get_cpu_driver(int cpu);
+#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
 
-static void cpuidle_setup_broadcast_timer(void *arg)
+static DEFINE_PER_CPU(struct cpuidle_driver *, cpuidle_drivers);
+
+static struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)
 {
-	int cpu = smp_processor_id();
-	clockevents_notify((long)(arg), &cpu);
+	return per_cpu(cpuidle_drivers, cpu);
 }
 
-static void __cpuidle_driver_init(struct cpuidle_driver *drv, int cpu)
+static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv)
 {
-	int i;
-
-	drv->refcnt = 0;
+	int cpu;
 
-	for (i = drv->state_count - 1; i >= 0 ; i--) {
+	for_each_cpu(cpu, drv->cpumask) {
 
-		if (!(drv->states[i].flags & CPUIDLE_FLAG_TIMER_STOP))
+		if (drv != __cpuidle_get_cpu_driver(cpu))
 			continue;
 
-		drv->bctimer = 1;
-		on_each_cpu_mask(get_cpu_mask(cpu), cpuidle_setup_broadcast_timer,
-				 (void *)CLOCK_EVT_NOTIFY_BROADCAST_ON, 1);
-		break;
+		per_cpu(cpuidle_drivers, cpu) = NULL;
 	}
 }
 
-static int __cpuidle_register_driver(struct cpuidle_driver *drv, int cpu)
+static inline int __cpuidle_set_driver(struct cpuidle_driver *drv)
 {
-	if (!drv || !drv->state_count)
-		return -EINVAL;
-
-	if (cpuidle_disabled())
-		return -ENODEV;
+	int cpu;
 
-	if (__cpuidle_get_cpu_driver(cpu))
-		return -EBUSY;
+	for_each_cpu(cpu, drv->cpumask) {
 
-	__cpuidle_driver_init(drv, cpu);
+		if (__cpuidle_get_cpu_driver(cpu)) {
+			__cpuidle_unset_driver(drv);
+			return -EBUSY;
+		}
 
-	__cpuidle_set_cpu_driver(drv, cpu);
+		per_cpu(cpuidle_drivers, cpu) = drv;
+	}
 
 	return 0;
 }
 
-static void __cpuidle_unregister_driver(struct cpuidle_driver *drv, int cpu)
-{
-	if (drv != __cpuidle_get_cpu_driver(cpu))
-		return;
+#else
 
-	if (!WARN_ON(drv->refcnt > 0))
-		__cpuidle_set_cpu_driver(NULL, cpu);
+static struct cpuidle_driver *cpuidle_curr_driver;
 
-	if (drv->bctimer) {
-		drv->bctimer = 0;
-		on_each_cpu_mask(get_cpu_mask(cpu), cpuidle_setup_broadcast_timer,
-				 (void *)CLOCK_EVT_NOTIFY_BROADCAST_OFF, 1);
-	}
+static inline struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)
+{
+	return cpuidle_curr_driver;
 }
 
-#ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS
+static inline int __cpuidle_set_driver(struct cpuidle_driver *drv)
+{
+	if (cpuidle_curr_driver)
+		return -EBUSY;
 
-static DEFINE_PER_CPU(struct cpuidle_driver *, cpuidle_drivers);
+	cpuidle_curr_driver = drv;
 
-static void __cpuidle_set_cpu_driver(struct cpuidle_driver *drv, int cpu)
-{
-	per_cpu(cpuidle_drivers, cpu) = drv;
+	return 0;
 }
 
-static struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)
+static inline void __cpuidle_unset_driver(struct cpuidle_driver *drv)
 {
-	return per_cpu(cpuidle_drivers, cpu);
+	if (drv == cpuidle_curr_driver)
+		cpuidle_curr_driver = NULL;
 }
 
-static void __cpuidle_unregister_all_cpu_driver(struct cpuidle_driver *drv)
+#endif
+
+static void cpuidle_setup_broadcast_timer(void *arg)
 {
-	int cpu;
-	for_each_present_cpu(cpu)
-		__cpuidle_unregister_driver(drv, cpu);
+	int cpu = smp_processor_id();
+	clockevents_notify((long)(arg), &cpu);
 }
 
-static int __cpuidle_register_all_cpu_driver(struct cpuidle_driver *drv)
+static int __cpuidle_driver_init(struct cpuidle_driver *drv)
 {
-	int ret = 0;
-	int i, cpu;
+	int i;
 
-	for_each_present_cpu(cpu) {
-		ret = __cpuidle_register_driver(drv, cpu);
-		if (ret)
-			break;
-	}
+	drv->refcnt = 0;
 
-	if (ret)
-		for_each_present_cpu(i) {
-			if (i == cpu)
-				break;
-			__cpuidle_unregister_driver(drv, i);
-		}
+	if (!drv->cpumask)
+		drv->cpumask = (struct cpumask *)cpu_possible_mask;
 
+	for (i = drv->state_count - 1; i >= 0 ; i--) {
 
-	return ret;
+		if (!(drv->states[i].flags & CPUIDLE_FLAG_TIMER_STOP))
+			continue;
+
+		drv->bctimer = 1;
+		break;
+	}
+
+	return 0;
 }
 
-int cpuidle_register_cpu_driver(struct cpuidle_driver *drv, int cpu)
+static int __cpuidle_register_driver(struct cpuidle_driver *drv)
 {
 	int ret;
 
-	spin_lock(&cpuidle_driver_lock);
-	ret = __cpuidle_register_driver(drv, cpu);
-	spin_unlock(&cpuidle_driver_lock);
+	if (!drv || !drv->state_count)
+		return -EINVAL;
 
-	return ret;
-}
+	if (cpuidle_disabled())
+		return -ENODEV;
 
-void cpuidle_unregister_cpu_driver(struct cpuidle_driver *drv, int cpu)
-{
-	spin_lock(&cpuidle_driver_lock);
-	__cpuidle_unregister_driver(drv, cpu);
-	spin_unlock(&cpuidle_driver_lock);
-}
+	ret = __cpuidle_driver_init(drv);
+	if (ret)
+		return ret;
 
-/**
- * cpuidle_register_driver - registers a driver
- * @drv: the driver
- */
-int cpuidle_register_driver(struct cpuidle_driver *drv)
-{
-	int ret;
+	ret = __cpuidle_set_driver(drv);
+	if (ret)
+		return ret;
 
-	spin_lock(&cpuidle_driver_lock);
-	ret = __cpuidle_register_all_cpu_driver(drv);
-	spin_unlock(&cpuidle_driver_lock);
+	if (drv->bctimer)
+		on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,
+				 (void *)CLOCK_EVT_NOTIFY_BROADCAST_ON, 1);
 
-	return ret;
+	return 0;
 }
-EXPORT_SYMBOL_GPL(cpuidle_register_driver);
 
 /**
  * cpuidle_unregister_driver - unregisters a driver
  * @drv: the driver
  */
-void cpuidle_unregister_driver(struct cpuidle_driver *drv)
+static void __cpuidle_unregister_driver(struct cpuidle_driver *drv)
 {
-	spin_lock(&cpuidle_driver_lock);
-	__cpuidle_unregister_all_cpu_driver(drv);
-	spin_unlock(&cpuidle_driver_lock);
-}
-EXPORT_SYMBOL_GPL(cpuidle_unregister_driver);
-
-#else
-
-static struct cpuidle_driver *cpuidle_curr_driver;
+	if (WARN_ON(drv->refcnt > 0))
+		return;
 
-static inline void __cpuidle_set_cpu_driver(struct cpuidle_driver *drv, int cpu)
-{
-	cpuidle_curr_driver = drv;
-}
+	if (drv->bctimer) {
+		drv->bctimer = 0;
+		on_each_cpu_mask(drv->cpumask, cpuidle_setup_broadcast_timer,
+				 (void *)CLOCK_EVT_NOTIFY_BROADCAST_OFF, 1);
+	}
 
-static inline struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)
-{
-	return cpuidle_curr_driver;
+	__cpuidle_unset_driver(drv);
 }
 
 /**
@@ -187,13 +160,11 @@ static inline struct cpuidle_driver *__cpuidle_get_cpu_driver(int cpu)
  */
 int cpuidle_register_driver(struct cpuidle_driver *drv)
 {
-	int ret, cpu;
+	int ret;
 
-	cpu = get_cpu();
 	spin_lock(&cpuidle_driver_lock);
-	ret = __cpuidle_register_driver(drv, cpu);
+	ret = __cpuidle_register_driver(drv);
 	spin_unlock(&cpuidle_driver_lock);
-	put_cpu();
 
 	return ret;
 }
@@ -205,16 +176,11 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver);
  */
 void cpuidle_unregister_driver(struct cpuidle_driver *drv)
 {
-	int cpu;
-
-	cpu = get_cpu();
 	spin_lock(&cpuidle_driver_lock);
-	__cpuidle_unregister_driver(drv, cpu);
+	__cpuidle_unregister_driver(drv);
 	spin_unlock(&cpuidle_driver_lock);
-	put_cpu();
 }
 EXPORT_SYMBOL_GPL(cpuidle_unregister_driver);
-#endif
 
 /**
  * cpuidle_get_driver - return the current driver
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 8f0406230a0a..0bc4b74668e9 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -111,6 +111,9 @@ struct cpuidle_driver {
 	struct cpuidle_state	states[CPUIDLE_STATE_MAX];
 	int			state_count;
 	int			safe_state_index;
+
+	/* the driver handles the cpus in cpumask */
+	struct cpumask       *cpumask;
 };
 
 #ifdef CONFIG_CPU_IDLE
@@ -135,9 +138,6 @@ extern void cpuidle_disable_device(struct cpuidle_device *dev);
 extern int cpuidle_play_dead(void);
 
 extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
-extern int cpuidle_register_cpu_driver(struct cpuidle_driver *drv, int cpu);
-extern void cpuidle_unregister_cpu_driver(struct cpuidle_driver *drv, int cpu);
-
 #else
 static inline void disable_cpuidle(void) { }
 static inline int cpuidle_idle_call(void) { return -ENODEV; }
-- 
cgit v1.3-7-g2ca7


From 5a49b4a527e5b72ae3a4f64f078759f83fbd98b5 Mon Sep 17 00:00:00 2001
From: Lee Jones <lee.jones@linaro.org>
Date: Fri, 7 Jun 2013 17:11:26 +0100
Subject: regulator: ab8500-ext: Register as a device in its own right

Some platforms don't support the AB8500 external regulators, so instead
of having a list of is_<platform>() calls prior to calling
ab8500_ext_regulator_init() from ab8500_regulator_probe(), we can only
register as a platform device on platforms which require them. It means
we also have more control over them when booting with Device Tree.

Signed-off-by: Lee Jones <lee.jones@linaro.org>
Signed-off-by: Mark Brown <broonielinaro.org>
---
 drivers/regulator/ab8500-ext.c   | 33 +++++++++++++++++++++++++++++++--
 drivers/regulator/ab8500.c       | 17 +----------------
 include/linux/regulator/ab8500.h |  4 ----
 3 files changed, 32 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/regulator/ab8500-ext.c b/drivers/regulator/ab8500-ext.c
index e4975bc61e81..95f495f51d00 100644
--- a/drivers/regulator/ab8500-ext.c
+++ b/drivers/regulator/ab8500-ext.c
@@ -333,7 +333,7 @@ static struct ab8500_ext_regulator_info
 	},
 };
 
-int ab8500_ext_regulator_init(struct platform_device *pdev)
+int ab8500_ext_regulator_probe(struct platform_device *pdev)
 {
 	struct ab8500 *ab8500 = dev_get_drvdata(pdev->dev.parent);
 	struct ab8500_platform_data *ppdata;
@@ -409,7 +409,7 @@ int ab8500_ext_regulator_init(struct platform_device *pdev)
 	return 0;
 }
 
-void ab8500_ext_regulator_exit(struct platform_device *pdev)
+int ab8500_ext_regulator_remove(struct platform_device *pdev)
 {
 	int i;
 
@@ -422,7 +422,36 @@ void ab8500_ext_regulator_exit(struct platform_device *pdev)
 
 		regulator_unregister(info->rdev);
 	}
+
+	return 0;
+}
+
+static struct platform_driver ab8500_ext_regulator_driver = {
+	.probe = ab8500_ext_regulator_probe,
+	.remove = ab8500_ext_regulator_remove,
+	.driver         = {
+		.name   = "ab8500-ext-regulator",
+		.owner  = THIS_MODULE,
+	},
+};
+
+static int __init ab8500_ext_regulator_init(void)
+{
+	int ret;
+
+	ret = platform_driver_register(&ab8500_ext_regulator_driver);
+	if (ret)
+		pr_err("Failed to register ab8500 ext regulator: %d\n", ret);
+
+	return ret;
+}
+subsys_initcall(ab8500_ext_regulator_init);
+
+static void __exit ab8500_ext_regulator_exit(void)
+{
+	platform_driver_unregister(&ab8500_ext_regulator_driver);
 }
+module_exit(ab8500_ext_regulator_exit);
 
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Bengt Jonsson <bengt.g.jonsson@stericsson.com>");
diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c
index f6656b8c28b6..dfc790196b34 100644
--- a/drivers/regulator/ab8500.c
+++ b/drivers/regulator/ab8500.c
@@ -3156,22 +3156,12 @@ static int ab8500_regulator_probe(struct platform_device *pdev)
 			return err;
 	}
 
-	if (!is_ab8505(ab8500)) {
-		/* register external regulators (before Vaux1, 2 and 3) */
-		err = ab8500_ext_regulator_init(pdev);
-		if (err)
-			return err;
-	}
-
 	/* register all regulators */
 	for (i = 0; i < abx500_regulator.info_size; i++) {
 		err = ab8500_regulator_register(pdev, &pdata->regulator[i],
 						i, NULL);
-		if (err < 0) {
-			if (!is_ab8505(ab8500))
-				ab8500_ext_regulator_exit(pdev);
+		if (err < 0)
 			return err;
-		}
 	}
 
 	return 0;
@@ -3180,7 +3170,6 @@ static int ab8500_regulator_probe(struct platform_device *pdev)
 static int ab8500_regulator_remove(struct platform_device *pdev)
 {
 	int i, err;
-	struct ab8500 *ab8500 = dev_get_drvdata(pdev->dev.parent);
 
 	for (i = 0; i < abx500_regulator.info_size; i++) {
 		struct ab8500_regulator_info *info = NULL;
@@ -3192,10 +3181,6 @@ static int ab8500_regulator_remove(struct platform_device *pdev)
 		regulator_unregister(info->regulator);
 	}
 
-	/* remove external regulators (after Vaux1, 2 and 3) */
-	if (!is_ab8505(ab8500))
-		ab8500_ext_regulator_exit(pdev);
-
 	/* remove regulator debug */
 	err = ab8500_regulator_debug_exit(pdev);
 	if (err)
diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h
index 7c5ff0c55773..75307447cef9 100644
--- a/include/linux/regulator/ab8500.h
+++ b/include/linux/regulator/ab8500.h
@@ -336,8 +336,4 @@ static inline int ab8500_regulator_debug_exit(struct platform_device *pdev)
 }
 #endif
 
-/* AB8500 external regulator functions. */
-int ab8500_ext_regulator_init(struct platform_device *pdev);
-void ab8500_ext_regulator_exit(struct platform_device *pdev);
-
 #endif
-- 
cgit v1.3-7-g2ca7


From 67252287871113deba96adf7e4df1752f3f08688 Mon Sep 17 00:00:00 2001
From: Srinivas Kandagatla <srinivas.kandagatla@st.com>
Date: Tue, 11 Jun 2013 13:18:15 +0100
Subject: regmap: Add regmap_field APIs

It is common to access regmap registers at bit level, using
regmap_update_bits or regmap_read functions, however the end user has to
take care of a mask or shifting. This becomes overhead when such use
cases are high. Having a common function to do this is much convenient
and less error prone.

The idea of regmap_field is simple, regmap_field gives a logical
structure to bits of the regmap register, and the driver can use this
logical entity without the knowledge of the bit positions and masks all
over the code. This way code looks much neat and it need not handle the
masks, shifts every time it access the those entities.

With this new regmap_field_read/write apis the end user can setup a
regmap field using regmap_field_init and use the return regmap_field to
read write the register field without worrying about the masks or
shifts.

Also this apis will be useful for drivers which are based on regmaps,
like some clocks or pinctrls which can work on the regmap_fields
directly without having to worry about bit positions.

Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@st.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 drivers/base/regmap/internal.h |   8 +++
 drivers/base/regmap/regmap.c   | 130 +++++++++++++++++++++++++++++++++++++++++
 include/linux/regmap.h         |  31 ++++++++++
 3 files changed, 169 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index c130536e0ab0..c5f6ebd0466d 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -174,6 +174,14 @@ struct regmap_range_node {
 	unsigned int window_len;
 };
 
+struct regmap_field {
+	struct regmap *regmap;
+	unsigned int mask;
+	/* lsb */
+	unsigned int shift;
+	unsigned int reg;
+};
+
 #ifdef CONFIG_DEBUG_FS
 extern void regmap_debugfs_initcall(void);
 extern void regmap_debugfs_init(struct regmap *map, const char *name);
diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c
index a941dcfe7590..fef6f13b96bb 100644
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -801,6 +801,95 @@ struct regmap *devm_regmap_init(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(devm_regmap_init);
 
+static void regmap_field_init(struct regmap_field *rm_field,
+	struct regmap *regmap, struct reg_field reg_field)
+{
+	int field_bits = reg_field.msb - reg_field.lsb + 1;
+	rm_field->regmap = regmap;
+	rm_field->reg = reg_field.reg;
+	rm_field->shift = reg_field.lsb;
+	rm_field->mask = ((BIT(field_bits) - 1) << reg_field.lsb);
+}
+
+/**
+ * devm_regmap_field_alloc(): Allocate and initialise a register field
+ * in a register map.
+ *
+ * @dev: Device that will be interacted with
+ * @regmap: regmap bank in which this register field is located.
+ * @reg_field: Register field with in the bank.
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap_field. The regmap_field will be automatically freed
+ * by the device management code.
+ */
+struct regmap_field *devm_regmap_field_alloc(struct device *dev,
+		struct regmap *regmap, struct reg_field reg_field)
+{
+	struct regmap_field *rm_field = devm_kzalloc(dev,
+					sizeof(*rm_field), GFP_KERNEL);
+	if (!rm_field)
+		return ERR_PTR(-ENOMEM);
+
+	regmap_field_init(rm_field, regmap, reg_field);
+
+	return rm_field;
+
+}
+EXPORT_SYMBOL_GPL(devm_regmap_field_alloc);
+
+/**
+ * devm_regmap_field_free(): Free register field allocated using
+ * devm_regmap_field_alloc. Usally drivers need not call this function,
+ * as the memory allocated via devm will be freed as per device-driver
+ * life-cyle.
+ *
+ * @dev: Device that will be interacted with
+ * @field: regmap field which should be freed.
+ */
+void devm_regmap_field_free(struct device *dev,
+	struct regmap_field *field)
+{
+	devm_kfree(dev, field);
+}
+EXPORT_SYMBOL_GPL(devm_regmap_field_free);
+
+/**
+ * regmap_field_alloc(): Allocate and initialise a register field
+ * in a register map.
+ *
+ * @regmap: regmap bank in which this register field is located.
+ * @reg_field: Register field with in the bank.
+ *
+ * The return value will be an ERR_PTR() on error or a valid pointer
+ * to a struct regmap_field. The regmap_field should be freed by the
+ * user once its finished working with it using regmap_field_free().
+ */
+struct regmap_field *regmap_field_alloc(struct regmap *regmap,
+		struct reg_field reg_field)
+{
+	struct regmap_field *rm_field = kzalloc(sizeof(*rm_field), GFP_KERNEL);
+
+	if (!rm_field)
+		return ERR_PTR(-ENOMEM);
+
+	regmap_field_init(rm_field, regmap, reg_field);
+
+	return rm_field;
+}
+EXPORT_SYMBOL_GPL(regmap_field_alloc);
+
+/**
+ * regmap_field_free(): Free register field allocated using regmap_field_alloc
+ *
+ * @field: regmap field which should be freed.
+ */
+void regmap_field_free(struct regmap_field *field)
+{
+	kfree(field);
+}
+EXPORT_SYMBOL_GPL(regmap_field_free);
+
 /**
  * regmap_reinit_cache(): Reinitialise the current register cache
  *
@@ -1249,6 +1338,22 @@ int regmap_raw_write(struct regmap *map, unsigned int reg,
 }
 EXPORT_SYMBOL_GPL(regmap_raw_write);
 
+/**
+ * regmap_field_write(): Write a value to a single register field
+ *
+ * @field: Register field to write to
+ * @val: Value to be written
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_field_write(struct regmap_field *field, unsigned int val)
+{
+	return regmap_update_bits(field->regmap, field->reg,
+				field->mask, val << field->shift);
+}
+EXPORT_SYMBOL_GPL(regmap_field_write);
+
 /*
  * regmap_bulk_write(): Write multiple registers to the device
  *
@@ -1531,6 +1636,31 @@ int regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
 }
 EXPORT_SYMBOL_GPL(regmap_raw_read);
 
+/**
+ * regmap_field_read(): Read a value to a single register field
+ *
+ * @field: Register field to read from
+ * @val: Pointer to store read value
+ *
+ * A value of zero will be returned on success, a negative errno will
+ * be returned in error cases.
+ */
+int regmap_field_read(struct regmap_field *field, unsigned int *val)
+{
+	int ret;
+	unsigned int reg_val;
+	ret = regmap_read(field->regmap, field->reg, &reg_val);
+	if (ret != 0)
+		return ret;
+
+	reg_val &= field->mask;
+	reg_val >>= field->shift;
+	*val = reg_val;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(regmap_field_read);
+
 /**
  * regmap_bulk_read(): Read multiple registers from the device
  *
diff --git a/include/linux/regmap.h b/include/linux/regmap.h
index 02d84e24b7c2..eebea9b5f43e 100644
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -23,6 +23,7 @@ struct irq_domain;
 struct spi_device;
 struct regmap;
 struct regmap_range_cfg;
+struct regmap_field;
 
 /* An enum of all the supported cache types */
 enum regcache_type {
@@ -411,6 +412,36 @@ bool regmap_reg_in_ranges(unsigned int reg,
 			  const struct regmap_range *ranges,
 			  unsigned int nranges);
 
+/**
+ * Description of an register field
+ *
+ * @reg: Offset of the register within the regmap bank
+ * @lsb: lsb of the register field.
+ * @reg: msb of the register field.
+ */
+struct reg_field {
+	unsigned int reg;
+	unsigned int lsb;
+	unsigned int msb;
+};
+
+#define REG_FIELD(_reg, _lsb, _msb) {		\
+				.reg = _reg,	\
+				.lsb = _lsb,	\
+				.msb = _msb,	\
+				}
+
+struct regmap_field *regmap_field_alloc(struct regmap *regmap,
+		struct reg_field reg_field);
+void regmap_field_free(struct regmap_field *field);
+
+struct regmap_field *devm_regmap_field_alloc(struct device *dev,
+		struct regmap *regmap, struct reg_field reg_field);
+void devm_regmap_field_free(struct device *dev,	struct regmap_field *field);
+
+int regmap_field_read(struct regmap_field *field, unsigned int *val);
+int regmap_field_write(struct regmap_field *field, unsigned int val);
+
 /**
  * Description of an IRQ for the generic regmap irq_chip.
  *
-- 
cgit v1.3-7-g2ca7


From 1b4d7d9787beb45a51c1ccf2f0a7fcee9213fb38 Mon Sep 17 00:00:00 2001
From: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Date: Wed, 12 Jun 2013 17:44:07 +0100
Subject: mfd: wm5102: Expose DRE control registers

Certain use cases may require specific DRE settings so expose the
necessary registers.

Signed-off-by: Charles Keepax <ckeepax@opensource.wolfsonmicro.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 drivers/mfd/wm5102-tables.c           |  9 ++++++-
 include/linux/mfd/arizona/registers.h | 44 +++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/mfd/wm5102-tables.c b/drivers/mfd/wm5102-tables.c
index 155c4a1a6a99..802dd3cb18cf 100644
--- a/drivers/mfd/wm5102-tables.c
+++ b/drivers/mfd/wm5102-tables.c
@@ -65,7 +65,8 @@ static const struct reg_default wm5102_revb_patch[] = {
 	{ 0x418, 0xa080 },
 	{ 0x420, 0xa080 },
 	{ 0x428, 0xe000 },
-	{ 0x443, 0xDC1A },
+	{ 0x442, 0x3F0A },
+	{ 0x443, 0xDC1F },
 	{ 0x4B0, 0x0066 },
 	{ 0x458, 0x000b },
 	{ 0x212, 0x0000 },
@@ -424,6 +425,9 @@ static const struct reg_default wm5102_reg_default[] = {
 	{ 0x00000435, 0x0180 },   /* R1077  - DAC Digital Volume 5R */ 
 	{ 0x00000436, 0x0081 },   /* R1078  - DAC Volume Limit 5R */
 	{ 0x00000437, 0x0200 },   /* R1079  - Noise Gate Select 5R */
+	{ 0x00000440, 0x8FFF },   /* R1088  - DRE Enable */
+	{ 0x00000442, 0x3F0A },   /* R1090  - DRE Control 2 */
+	{ 0x00000443, 0xDC1F },   /* R1090  - DRE Control 3 */
 	{ 0x00000450, 0x0000 },   /* R1104  - DAC AEC Control 1 */ 
 	{ 0x00000458, 0x000B },   /* R1112  - Noise Gate Control */
 	{ 0x00000490, 0x0069 },   /* R1168  - PDM SPK1 CTRL 1 */ 
@@ -1197,6 +1201,9 @@ static bool wm5102_readable_register(struct device *dev, unsigned int reg)
 	case ARIZONA_DAC_DIGITAL_VOLUME_5R:
 	case ARIZONA_DAC_VOLUME_LIMIT_5R:
 	case ARIZONA_NOISE_GATE_SELECT_5R:
+	case ARIZONA_DRE_ENABLE:
+	case ARIZONA_DRE_CONTROL_2:
+	case ARIZONA_DRE_CONTROL_3:
 	case ARIZONA_DAC_AEC_CONTROL_1:
 	case ARIZONA_NOISE_GATE_CONTROL:
 	case ARIZONA_PDM_SPK1_CTRL_1:
diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h
index 4730b5c576e2..4706d3d46e56 100644
--- a/include/linux/mfd/arizona/registers.h
+++ b/include/linux/mfd/arizona/registers.h
@@ -215,6 +215,9 @@
 #define ARIZONA_DAC_DIGITAL_VOLUME_6R            0x43D
 #define ARIZONA_DAC_VOLUME_LIMIT_6R              0x43E
 #define ARIZONA_NOISE_GATE_SELECT_6R             0x43F
+#define ARIZONA_DRE_ENABLE                       0x440
+#define ARIZONA_DRE_CONTROL_2                    0x442
+#define ARIZONA_DRE_CONTROL_3                    0x443
 #define ARIZONA_DAC_AEC_CONTROL_1                0x450
 #define ARIZONA_NOISE_GATE_CONTROL               0x458
 #define ARIZONA_PDM_SPK1_CTRL_1                  0x490
@@ -3132,6 +3135,47 @@
 #define ARIZONA_OUT6R_NGATE_SRC_SHIFT                 0  /* OUT6R_NGATE_SRC - [11:0] */
 #define ARIZONA_OUT6R_NGATE_SRC_WIDTH                12  /* OUT6R_NGATE_SRC - [11:0] */
 
+/*
+ * R1088 (0x440) - DRE Enable
+ */
+#define ARIZONA_DRE3L_ENA                        0x0010  /* DRE3L_ENA */
+#define ARIZONA_DRE3L_ENA_MASK                   0x0010  /* DRE3L_ENA */
+#define ARIZONA_DRE3L_ENA_SHIFT                       4  /* DRE3L_ENA */
+#define ARIZONA_DRE3L_ENA_WIDTH                       1  /* DRE3L_ENA */
+#define ARIZONA_DRE2R_ENA                        0x0008  /* DRE2R_ENA */
+#define ARIZONA_DRE2R_ENA_MASK                   0x0008  /* DRE2R_ENA */
+#define ARIZONA_DRE2R_ENA_SHIFT                       3  /* DRE2R_ENA */
+#define ARIZONA_DRE2R_ENA_WIDTH                       1  /* DRE2R_ENA */
+#define ARIZONA_DRE2L_ENA                        0x0004  /* DRE2L_ENA */
+#define ARIZONA_DRE2L_ENA_MASK                   0x0004  /* DRE2L_ENA */
+#define ARIZONA_DRE2L_ENA_SHIFT                       2  /* DRE2L_ENA */
+#define ARIZONA_DRE2L_ENA_WIDTH                       1  /* DRE2L_ENA */
+#define ARIZONA_DRE1R_ENA                        0x0002  /* DRE1R_ENA */
+#define ARIZONA_DRE1R_ENA_MASK                   0x0002  /* DRE1R_ENA */
+#define ARIZONA_DRE1R_ENA_SHIFT                       1  /* DRE1R_ENA */
+#define ARIZONA_DRE1R_ENA_WIDTH                       1  /* DRE1R_ENA */
+#define ARIZONA_DRE1L_ENA                        0x0001  /* DRE1L_ENA */
+#define ARIZONA_DRE1L_ENA_MASK                   0x0001  /* DRE1L_ENA */
+#define ARIZONA_DRE1L_ENA_SHIFT                       0  /* DRE1L_ENA */
+#define ARIZONA_DRE1L_ENA_WIDTH                       1  /* DRE1L_ENA */
+
+/*
+ * R1090 (0x442) - DRE Control 2
+ */
+#define ARIZONA_DRE_T_LOW_MASK                   0x3F00  /* DRE_T_LOW - [13:8] */
+#define ARIZONA_DRE_T_LOW_SHIFT                       8  /* DRE_T_LOW - [13:8] */
+#define ARIZONA_DRE_T_LOW_WIDTH                       6  /* DRE_T_LOW - [13:8] */
+
+/*
+ * R1091 (0x443) - DRE Control 3
+ */
+#define ARIZONA_DRE_GAIN_SHIFT_MASK              0xC000  /* DRE_GAIN_SHIFT - [15:14] */
+#define ARIZONA_DRE_GAIN_SHIFT_SHIFT                 14  /* DRE_GAIN_SHIFT - [15:14] */
+#define ARIZONA_DRE_GAIN_SHIFT_WIDTH                  2  /* DRE_GAIN_SHIFT - [15:14] */
+#define ARIZONA_DRE_LOW_LEVEL_ABS_MASK           0x000F  /* LOW_LEVEL_ABS - [3:0] */
+#define ARIZONA_DRE_LOW_LEVEL_ABS_SHIFT               0  /* LOW_LEVEL_ABS - [3:0] */
+#define ARIZONA_DRE_LOW_LEVEL_ABS_WIDTH               4  /* LOW_LEVEL_ABS - [3:0] */
+
 /*
  * R1104 (0x450) - DAC AEC Control 1
  */
-- 
cgit v1.3-7-g2ca7


From 6a24474da83ea7c8b7d32f05f858b1259994067a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 20:43:06 -0700
Subject: percpu-refcount: consistently use plain (non-sched) RCU

percpu_ref_get/put() are using preempt_disable/enable() while
percpu_ref_kill() is using plain call_rcu() instead of
call_rcu_sched().  This is buggy as grace periods of the two may not
match.  Fix it by using plain RCU in percpu_ref_get/put().

(I suggested using sched RCU in the first place but there's no actual
 benefit in doing so unless we're gonna introduce different variants
 of get/put to be called while preemption is alredy disabled, which we
 definitely shouldn't.)

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Kent Overstreet <koverstreet@google.com>
---
 include/linux/percpu-refcount.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 24b31ef15932..abe141172d96 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -85,7 +85,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 {
 	unsigned __percpu *pcpu_count;
 
-	preempt_disable();
+	rcu_read_lock();
 
 	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
 
@@ -94,7 +94,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 	else
 		atomic_inc(&ref->count);
 
-	preempt_enable();
+	rcu_read_unlock();
 }
 
 /**
@@ -107,7 +107,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref)
 {
 	unsigned __percpu *pcpu_count;
 
-	preempt_disable();
+	rcu_read_lock();
 
 	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
 
@@ -116,7 +116,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref)
 	else if (unlikely(atomic_dec_and_test(&ref->count)))
 		ref->release(ref);
 
-	preempt_enable();
+	rcu_read_unlock();
 }
 
 #endif
-- 
cgit v1.3-7-g2ca7


From ac899061a93250c28562f05ad94d5c74603415bc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 20:43:06 -0700
Subject: percpu-refcount: cosmetic updates

* s/percpu_ref_release/percpu_ref_func_t/ as it's customary to have _t
  postfix for types and the type is gonna be used for a different type
  of callback too.

* Add @ARG to function comments.

* Drop unnecessary and unaligned indentation from percpu_ref_init()
  function comment.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Kent Overstreet <koverstreet@google.com>
---
 include/linux/percpu-refcount.h | 8 +++++---
 lib/percpu-refcount.c           | 7 ++++---
 2 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index abe141172d96..b61bd6f23985 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -51,7 +51,7 @@
 #include <linux/rcupdate.h>
 
 struct percpu_ref;
-typedef void (percpu_ref_release)(struct percpu_ref *);
+typedef void (percpu_ref_func_t)(struct percpu_ref *);
 
 struct percpu_ref {
 	atomic_t		count;
@@ -62,11 +62,11 @@ struct percpu_ref {
 	 * percpu_ref_kill_rcu())
 	 */
 	unsigned __percpu	*pcpu_count;
-	percpu_ref_release	*release;
+	percpu_ref_func_t	*release;
 	struct rcu_head		rcu;
 };
 
-int percpu_ref_init(struct percpu_ref *, percpu_ref_release *);
+int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release);
 void percpu_ref_kill(struct percpu_ref *ref);
 
 #define PCPU_STATUS_BITS	2
@@ -78,6 +78,7 @@ void percpu_ref_kill(struct percpu_ref *ref);
 
 /**
  * percpu_ref_get - increment a percpu refcount
+ * @ref: percpu_ref to get
  *
  * Analagous to atomic_inc().
   */
@@ -99,6 +100,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 
 /**
  * percpu_ref_put - decrement a percpu refcount
+ * @ref: percpu_ref to put
  *
  * Decrement the refcount, and if 0, call the release function (which was passed
  * to percpu_ref_init())
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 1a17399fc7db..9a78e55fa48f 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -33,8 +33,8 @@
 
 /**
  * percpu_ref_init - initialize a percpu refcount
- * @ref:	ref to initialize
- * @release:	function which will be called when refcount hits 0
+ * @ref: percpu_ref to initialize
+ * @release: function which will be called when refcount hits 0
  *
  * Initializes the refcount in single atomic counter mode with a refcount of 1;
  * analagous to atomic_set(ref, 1).
@@ -42,7 +42,7 @@
  * Note that @release must not sleep - it may potentially be called from RCU
  * callback context by percpu_ref_kill().
  */
-int percpu_ref_init(struct percpu_ref *ref, percpu_ref_release *release)
+int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
 {
 	atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS);
 
@@ -98,6 +98,7 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 
 /**
  * percpu_ref_kill - safely drop initial ref
+ * @ref: percpu_ref to kill
  *
  * Must be used to drop the initial ref on a percpu refcount; must be called
  * precisely once before shutdown.
-- 
cgit v1.3-7-g2ca7


From 5c5cc62321d9df7a9a608346fc649c4528380c8f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Sun, 9 Jun 2013 17:16:29 +0800
Subject: cpuset: allow to keep tasks in empty cpusets

To achieve this:

- We call update_tasks_cpumask/nodemask() for empty cpusets when
hotplug happens, instead of moving tasks out of them.

- When a cpuset's masks are changed by writing cpuset.cpus/mems,
we also update tasks in child cpusets which are empty.

v3:
- do propagation work in one place for both hotplug and unplug

v2:
- drop rcu_read_lock before calling update_task_nodemask() and
  update_task_cpumask(), instead of using workqueue.
- add documentation in include/linux/cgroup.h

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |   4 ++
 kernel/cpuset.c        | 141 ++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 114 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d0ad3794b947..53e81a61be57 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -277,6 +277,10 @@ enum {
 	 *
 	 * - Remount is disallowed.
 	 *
+	 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
+	 *   and take masks of ancestors with non-empty cpus/mems, instead of
+	 *   being moved to an ancestor.
+	 *
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 *
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 82ac1f862cbc..3473dd2580d1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -874,6 +874,45 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 	cgroup_scan_tasks(&scan);
 }
 
+/*
+ * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
+ * @root_cs: the root cpuset of the hierarchy
+ * @update_root: update root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update cpumasks of tasks in @root_cs and all other empty cpusets
+ * which take on cpumask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_cpumask_hier(struct cpuset *root_cs,
+				      bool update_root, struct ptr_heap *heap)
+{
+	struct cpuset *cp;
+	struct cgroup *pos_cgrp;
+
+	if (update_root)
+		update_tasks_cpumask(root_cs, heap);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+		/* skip the whole subtree if @cp have some CPU */
+		if (!cpumask_empty(cp->cpus_allowed)) {
+			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+			continue;
+		}
+		if (!css_tryget(&cp->css))
+			continue;
+		rcu_read_unlock();
+
+		update_tasks_cpumask(cp, heap);
+
+		rcu_read_lock();
+		css_put(&cp->css);
+	}
+	rcu_read_unlock();
+}
+
 /**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
@@ -925,11 +964,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
 	mutex_unlock(&callback_mutex);
 
-	/*
-	 * Scan tasks in the cpuset, and update the cpumasks of any
-	 * that need an update.
-	 */
-	update_tasks_cpumask(cs, &heap);
+	update_tasks_cpumask_hier(cs, true, &heap);
 
 	heap_free(&heap);
 
@@ -1096,6 +1131,45 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap)
 	cpuset_being_rebound = NULL;
 }
 
+/*
+ * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
+ * @cs: the root cpuset of the hierarchy
+ * @update_root: update the root cpuset or not?
+ * @heap: the heap used by cgroup_scan_tasks()
+ *
+ * This will update nodemasks of tasks in @root_cs and all other empty cpusets
+ * which take on nodemask of @root_cs.
+ *
+ * Called with cpuset_mutex held
+ */
+static void update_tasks_nodemask_hier(struct cpuset *root_cs,
+				       bool update_root, struct ptr_heap *heap)
+{
+	struct cpuset *cp;
+	struct cgroup *pos_cgrp;
+
+	if (update_root)
+		update_tasks_nodemask(root_cs, heap);
+
+	rcu_read_lock();
+	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
+		/* skip the whole subtree if @cp have some CPU */
+		if (!nodes_empty(cp->mems_allowed)) {
+			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
+			continue;
+		}
+		if (!css_tryget(&cp->css))
+			continue;
+		rcu_read_unlock();
+
+		update_tasks_nodemask(cp, heap);
+
+		rcu_read_lock();
+		css_put(&cp->css);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset.  Needs to validate the request, update the
@@ -1160,7 +1234,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	cs->mems_allowed = trialcs->mems_allowed;
 	mutex_unlock(&callback_mutex);
 
-	update_tasks_nodemask(cs, &heap);
+	update_tasks_nodemask_hier(cs, true, &heap);
 
 	heap_free(&heap);
 done:
@@ -2048,6 +2122,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 	static cpumask_t off_cpus;
 	static nodemask_t off_mems;
 	bool is_empty;
+	bool sane = cgroup_sane_behavior(cs->css.cgroup);
 
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2066,21 +2141,29 @@ retry:
 	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
 	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
 
-	/* remove offline cpus from @cs */
-	if (!cpumask_empty(&off_cpus)) {
-		mutex_lock(&callback_mutex);
-		cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-		mutex_unlock(&callback_mutex);
+	mutex_lock(&callback_mutex);
+	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
+	mutex_unlock(&callback_mutex);
+
+	/*
+	 * If sane_behavior flag is set, we need to update tasks' cpumask
+	 * for empty cpuset to take on ancestor's cpumask.
+	 */
+	if ((sane && cpumask_empty(cs->cpus_allowed)) ||
+	    !cpumask_empty(&off_cpus))
 		update_tasks_cpumask(cs, NULL);
-	}
 
-	/* remove offline mems from @cs */
-	if (!nodes_empty(off_mems)) {
-		mutex_lock(&callback_mutex);
-		nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-		mutex_unlock(&callback_mutex);
+	mutex_lock(&callback_mutex);
+	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
+	mutex_unlock(&callback_mutex);
+
+	/*
+	 * If sane_behavior flag is set, we need to update tasks' nodemask
+	 * for empty cpuset to take on ancestor's nodemask.
+	 */
+	if ((sane && nodes_empty(cs->mems_allowed)) ||
+	    !nodes_empty(off_mems))
 		update_tasks_nodemask(cs, NULL);
-	}
 
 	is_empty = cpumask_empty(cs->cpus_allowed) ||
 		nodes_empty(cs->mems_allowed);
@@ -2088,11 +2171,13 @@ retry:
 	mutex_unlock(&cpuset_mutex);
 
 	/*
-	 * If @cs became empty, move tasks to the nearest ancestor with
-	 * execution resources.  This is full cgroup operation which will
+	 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
+	 *
+	 * Otherwise move tasks to the nearest ancestor with execution
+	 * resources.  This is full cgroup operation which will
 	 * also call back into cpuset.  Should be done outside any lock.
 	 */
-	if (is_empty)
+	if (!sane && is_empty)
 		remove_tasks_in_empty_cpuset(cs);
 }
 
@@ -2114,10 +2199,9 @@ retry:
  */
 static void cpuset_hotplug_workfn(struct work_struct *work)
 {
-	static cpumask_t new_cpus, tmp_cpus;
-	static nodemask_t new_mems, tmp_mems;
+	static cpumask_t new_cpus;
+	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
-	bool cpus_offlined, mems_offlined;
 
 	mutex_lock(&cpuset_mutex);
 
@@ -2126,12 +2210,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	new_mems = node_states[N_MEMORY];
 
 	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-	cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
-				       &new_cpus);
-
 	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
-	nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
-	mems_offlined = !nodes_empty(tmp_mems);
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
@@ -2151,8 +2230,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 
 	mutex_unlock(&cpuset_mutex);
 
-	/* if cpus or mems went down, we need to propagate to descendants */
-	if (cpus_offlined || mems_offlined) {
+	/* if cpus or mems changed, we need to propagate to descendants */
+	if (cpus_updated || mems_updated) {
 		struct cpuset *cs;
 		struct cgroup *pos_cgrp;
 
-- 
cgit v1.3-7-g2ca7


From 88fa523bff295f1d60244a54833480b02f775152 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Sun, 9 Jun 2013 17:16:46 +0800
Subject: cpuset: allow to move tasks to empty cpusets

Currently some cpuset behaviors are not friendly when cpuset is co-mounted
with other cgroup controllers.

Now with this patchset if cpuset is mounted with sane_behavior option,
it behaves differently:

- Tasks will be kept in empty cpusets when hotplug happens and take
  masks of ancestors with non-empty cpus/mems, instead of being moved to
  an ancestor.

- A task can be moved into an empty cpuset, and again it takes masks of
  ancestors, so the user can drop a task into a newly created cgroup without
  having to do anything for it.

As tasks can reside in empy cpusets, here're some rules:

- They can be moved to another cpuset, regardless it's empty or not.

- Though it takes masks from ancestors, it takes other configs from the
  empty cpuset.

- If the ancestors' masks are changed, those tasks will also be updated
  to take new masks.

v2: add documentation in include/linux/cgroup.h

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 3 +++
 kernel/cpuset.c        | 9 +++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 53e81a61be57..74e8b8e4cd7f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -281,6 +281,9 @@ enum {
 	 *   and take masks of ancestors with non-empty cpus/mems, instead of
 	 *   being moved to an ancestor.
 	 *
+	 * - cpuset: a task can be moved into an empty cpuset, and again it
+	 *   takes masks of ancestors.
+	 *
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 *
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3473dd2580d1..3b3fdfdd4d78 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -479,7 +479,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	 */
 	ret = -ENOSPC;
 	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
-	    (cpumask_empty(trial->cpus_allowed) ||
+	    (cpumask_empty(trial->cpus_allowed) &&
 	     nodes_empty(trial->mems_allowed)))
 		goto out;
 
@@ -1466,8 +1466,13 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 
 	mutex_lock(&cpuset_mutex);
 
+	/*
+	 * We allow to move tasks into an empty cpuset if sane_behavior
+	 * flag is set.
+	 */
 	ret = -ENOSPC;
-	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+	if (!cgroup_sane_behavior(cgrp) &&
+	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
 		goto out_unlock;
 
 	cgroup_taskset_for_each(task, cgrp, tset) {
-- 
cgit v1.3-7-g2ca7


From 3fc3db9a3ae0ce108badf31a4a00e41b4236f5fc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:48 -0700
Subject: cgroup: remove now unused css_depth()

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h |  1 -
 kernel/cgroup.c        | 12 ------------
 2 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index d0ad3794b947..5830592258dc 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -848,7 +848,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
-unsigned short css_depth(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index bc53d5014b28..d4a329f5874c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5227,18 +5227,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 }
 EXPORT_SYMBOL_GPL(css_id);
 
-unsigned short css_depth(struct cgroup_subsys_state *css)
-{
-	struct css_id *cssid;
-
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
-
-	if (cssid)
-		return cssid->depth;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(css_depth);
-
 /**
  *  css_is_ancestor - test "root" css is an ancestor of "child"
  * @child: the css to be tested.
-- 
cgit v1.3-7-g2ca7


From 69d0206c793a17431eacee2694ee7a4b25df76b7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:50 -0700
Subject: cgroup: bring some sanity to naming around cg_cgroup_link

cgroups and css_sets are mapped M:N and this M:N mapping is
represented by struct cg_cgroup_link which forms linked lists on both
sides.  The naming around this mapping is already confusing and struct
cg_cgroup_link exacerbates the situation quite a bit.

>From cgroup side, it starts off ->css_sets and runs through
->cgrp_link_list.  From css_set side, it starts off ->cg_links and
runs through ->cg_link_list.  This is rather reversed as
cgrp_link_list is used to iterate css_sets and cg_link_list cgroups.
Also, this is the only place which is still using the confusing "cg"
for css_sets.  This patch cleans it up a bit.

* s/cgroup->css_sets/cgroup->cset_links/
  s/css_set->cg_links/css_set->cgrp_links/
  s/cgroup_iter->cg_link/cgroup_iter->cset_link/

* s/cg_cgroup_link/cgrp_cset_link/

* s/cgrp_cset_link->cg/cgrp_cset_link->cset/
  s/cgrp_cset_link->cgrp_link_list/cgrp_cset_link->cset_link/
  s/cgrp_cset_link->cg_link_list/cgrp_cset_link->cgrp_link/

* s/init_css_set_link/init_cgrp_cset_link/
  s/free_cg_links/free_cgrp_cset_links/
  s/allocate_cg_links/allocate_cgrp_cset_links/

* s/cgl[12]/link[12]/ in compare_css_sets()

* s/saved_link/tmp_link/ s/tmp/tmp_links/ and a couple similar
  adustments.

* Comment and whiteline adjustments.

After the changes, we have

	list_for_each_entry(link, &cont->cset_links, cset_link) {
		struct css_set *cset = link->cset;

instead of

	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
		struct css_set *cset = link->cg;

This patch is purely cosmetic.

v2: Fix broken sentences in the patch description.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h |  15 ++--
 kernel/cgroup.c        | 226 ++++++++++++++++++++++++-------------------------
 2 files changed, 120 insertions(+), 121 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5830592258dc..0e32855edc92 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -215,10 +215,10 @@ struct cgroup {
 	struct cgroupfs_root *root;
 
 	/*
-	 * List of cg_cgroup_links pointing at css_sets with
-	 * tasks in this cgroup. Protected by css_set_lock
+	 * List of cgrp_cset_links pointing at css_sets with tasks in this
+	 * cgroup.  Protected by css_set_lock.
 	 */
-	struct list_head css_sets;
+	struct list_head cset_links;
 
 	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
 	struct list_head cft_q_node;	/* used during cftype add/rm */
@@ -365,11 +365,10 @@ struct css_set {
 	struct list_head tasks;
 
 	/*
-	 * List of cg_cgroup_link objects on link chains from
-	 * cgroups referenced from this css_set. Protected by
-	 * css_set_lock
+	 * List of cgrp_cset_links pointing at cgroups referenced from this
+	 * css_set.  Protected by css_set_lock.
 	 */
-	struct list_head cg_links;
+	struct list_head cgrp_links;
 
 	/*
 	 * Set of subsystem states, one for each subsystem. This array
@@ -792,7 +791,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
 
 /* A cgroup_iter should be treated as an opaque object */
 struct cgroup_iter {
-	struct list_head *cg_link;
+	struct list_head *cset_link;
 	struct list_head *task;
 };
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1f5a4e101ed1..ef97bd0cd546 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -315,20 +315,24 @@ static void cgroup_release_agent(struct work_struct *work);
 static DECLARE_WORK(release_agent_work, cgroup_release_agent);
 static void check_for_release(struct cgroup *cgrp);
 
-/* Link structure for associating css_set objects with cgroups */
-struct cg_cgroup_link {
-	/*
-	 * List running through cg_cgroup_links associated with a
-	 * cgroup, anchored on cgroup->css_sets
-	 */
-	struct list_head cgrp_link_list;
-	struct cgroup *cgrp;
-	/*
-	 * List running through cg_cgroup_links pointing at a
-	 * single css_set object, anchored on css_set->cg_links
-	 */
-	struct list_head cg_link_list;
-	struct css_set *cg;
+/*
+ * A cgroup can be associated with multiple css_sets as different tasks may
+ * belong to different cgroups on different hierarchies.  In the other
+ * direction, a css_set is naturally associated with multiple cgroups.
+ * This M:N relationship is represented by the following link structure
+ * which exists for each association and allows traversing the associations
+ * from both sides.
+ */
+struct cgrp_cset_link {
+	/* the cgroup and css_set this link associates */
+	struct cgroup		*cgrp;
+	struct css_set		*cset;
+
+	/* list of cgrp_cset_links anchored at cgrp->cset_links */
+	struct list_head	cset_link;
+
+	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
+	struct list_head	cgrp_link;
 };
 
 /* The default css_set - used by init and its children prior to any
@@ -339,7 +343,7 @@ struct cg_cgroup_link {
  */
 
 static struct css_set init_css_set;
-static struct cg_cgroup_link init_css_set_link;
+static struct cgrp_cset_link init_cgrp_cset_link;
 
 static int cgroup_init_idr(struct cgroup_subsys *ss,
 			   struct cgroup_subsys_state *css);
@@ -378,8 +382,7 @@ static int use_task_css_set_links __read_mostly;
 
 static void __put_css_set(struct css_set *cset, int taskexit)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
 
 	/*
 	 * Ensure that the refcount doesn't hit zero while any readers
@@ -398,12 +401,11 @@ static void __put_css_set(struct css_set *cset, int taskexit)
 	hash_del(&cset->hlist);
 	css_set_count--;
 
-	list_for_each_entry_safe(link, saved_link, &cset->cg_links,
-				 cg_link_list) {
+	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *cgrp = link->cgrp;
 
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
 
 		/*
 		 * We may not be holding cgroup_mutex, and if cgrp->count is
@@ -475,26 +477,26 @@ static bool compare_css_sets(struct css_set *cset,
 	 * candidates.
 	 */
 
-	l1 = &cset->cg_links;
-	l2 = &old_cset->cg_links;
+	l1 = &cset->cgrp_links;
+	l2 = &old_cset->cgrp_links;
 	while (1) {
-		struct cg_cgroup_link *cgl1, *cgl2;
+		struct cgrp_cset_link *link1, *link2;
 		struct cgroup *cgrp1, *cgrp2;
 
 		l1 = l1->next;
 		l2 = l2->next;
 		/* See if we reached the end - both lists are equal length. */
-		if (l1 == &cset->cg_links) {
-			BUG_ON(l2 != &old_cset->cg_links);
+		if (l1 == &cset->cgrp_links) {
+			BUG_ON(l2 != &old_cset->cgrp_links);
 			break;
 		} else {
-			BUG_ON(l2 == &old_cset->cg_links);
+			BUG_ON(l2 == &old_cset->cgrp_links);
 		}
 		/* Locate the cgroups associated with these links. */
-		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
-		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
-		cgrp1 = cgl1->cgrp;
-		cgrp2 = cgl2->cgrp;
+		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
+		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
+		cgrp1 = link1->cgrp;
+		cgrp2 = link2->cgrp;
 		/* Hierarchies should be linked in the same order. */
 		BUG_ON(cgrp1->root != cgrp2->root);
 
@@ -569,61 +571,64 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 	return NULL;
 }
 
-static void free_cg_links(struct list_head *tmp)
+static void free_cgrp_cset_links(struct list_head *links_to_free)
 {
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
+	struct cgrp_cset_link *link, *tmp_link;
 
-	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
-		list_del(&link->cgrp_link_list);
+	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
+		list_del(&link->cset_link);
 		kfree(link);
 	}
 }
 
-/*
- * allocate_cg_links() allocates "count" cg_cgroup_link structures
- * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
- * success or a negative error
+/**
+ * allocate_cgrp_cset_links - allocate cgrp_cset_links
+ * @count: the number of links to allocate
+ * @tmp_links: list_head the allocated links are put on
+ *
+ * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
+ * through ->cset_link.  Returns 0 on success or -errno.
  */
-static int allocate_cg_links(int count, struct list_head *tmp)
+static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 	int i;
-	INIT_LIST_HEAD(tmp);
+
+	INIT_LIST_HEAD(tmp_links);
+
 	for (i = 0; i < count; i++) {
 		link = kmalloc(sizeof(*link), GFP_KERNEL);
 		if (!link) {
-			free_cg_links(tmp);
+			free_cgrp_cset_links(tmp_links);
 			return -ENOMEM;
 		}
-		list_add(&link->cgrp_link_list, tmp);
+		list_add(&link->cset_link, tmp_links);
 	}
 	return 0;
 }
 
 /**
  * link_css_set - a helper function to link a css_set to a cgroup
- * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
+ * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
  * @cset: the css_set to be linked
  * @cgrp: the destination cgroup
  */
-static void link_css_set(struct list_head *tmp_cg_links,
-			 struct css_set *cset, struct cgroup *cgrp)
+static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
+			 struct cgroup *cgrp)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
-	BUG_ON(list_empty(tmp_cg_links));
-	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
-				cgrp_link_list);
-	link->cg = cset;
+	BUG_ON(list_empty(tmp_links));
+	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
+	link->cset = cset;
 	link->cgrp = cgrp;
 	atomic_inc(&cgrp->count);
-	list_move(&link->cgrp_link_list, &cgrp->css_sets);
+	list_move(&link->cset_link, &cgrp->cset_links);
 	/*
 	 * Always add links to the tail of the list so that the list
 	 * is sorted by order of hierarchy creation
 	 */
-	list_add_tail(&link->cg_link_list, &cset->cg_links);
+	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 }
 
 /*
@@ -638,10 +643,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 {
 	struct css_set *cset;
 	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	struct list_head tmp_cg_links;
-
-	struct cg_cgroup_link *link;
+	struct list_head tmp_links;
+	struct cgrp_cset_link *link;
 	unsigned long key;
 
 	/* First see if we already have a cgroup group that matches
@@ -659,14 +662,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	if (!cset)
 		return NULL;
 
-	/* Allocate all the cg_cgroup_link objects that we'll need */
-	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
+	/* Allocate all the cgrp_cset_link objects that we'll need */
+	if (allocate_cgrp_cset_links(root_count, &tmp_links) < 0) {
 		kfree(cset);
 		return NULL;
 	}
 
 	atomic_set(&cset->refcount, 1);
-	INIT_LIST_HEAD(&cset->cg_links);
+	INIT_LIST_HEAD(&cset->cgrp_links);
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_HLIST_NODE(&cset->hlist);
 
@@ -676,14 +679,15 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	write_lock(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
-	list_for_each_entry(link, &old_cset->cg_links, cg_link_list) {
+	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
+
 		if (c->root == cgrp->root)
 			c = cgrp;
-		link_css_set(&tmp_cg_links, cset, c);
+		link_css_set(&tmp_links, cset, c);
 	}
 
-	BUG_ON(!list_empty(&tmp_cg_links));
+	BUG_ON(!list_empty(&tmp_links));
 
 	css_set_count++;
 
@@ -717,9 +721,11 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 	if (cset == &init_css_set) {
 		res = &root->top_cgroup;
 	} else {
-		struct cg_cgroup_link *link;
-		list_for_each_entry(link, &cset->cg_links, cg_link_list) {
+		struct cgrp_cset_link *link;
+
+		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 			struct cgroup *c = link->cgrp;
+
 			if (c->root == root) {
 				res = c;
 				break;
@@ -1405,7 +1411,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->files);
-	INIT_LIST_HEAD(&cgrp->css_sets);
+	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
@@ -1604,7 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	BUG_ON(!root);
 	if (root == opts.new_root) {
 		/* We used the new root structure, so this is a new hierarchy */
-		struct list_head tmp_cg_links;
+		struct list_head tmp_links;
 		struct cgroup *root_cgrp = &root->top_cgroup;
 		struct cgroupfs_root *existing_root;
 		const struct cred *cred;
@@ -1636,7 +1642,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * that's us. The worst that can happen is that we
 		 * have some link structures left over
 		 */
-		ret = allocate_cg_links(css_set_count, &tmp_cg_links);
+		ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
 		if (ret)
 			goto unlock_drop;
 
@@ -1646,7 +1652,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 
 		ret = rebind_subsystems(root, root->subsys_mask);
 		if (ret == -EBUSY) {
-			free_cg_links(&tmp_cg_links);
+			free_cgrp_cset_links(&tmp_links);
 			goto unlock_drop;
 		}
 		/*
@@ -1668,10 +1674,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		 * the css_set objects */
 		write_lock(&css_set_lock);
 		hash_for_each(css_set_table, i, cset, hlist)
-			link_css_set(&tmp_cg_links, cset, root_cgrp);
+			link_css_set(&tmp_links, cset, root_cgrp);
 		write_unlock(&css_set_lock);
 
-		free_cg_links(&tmp_cg_links);
+		free_cgrp_cset_links(&tmp_links);
 
 		BUG_ON(!list_empty(&root_cgrp->children));
 		BUG_ON(root->number_of_cgroups != 1);
@@ -1722,9 +1728,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 static void cgroup_kill_sb(struct super_block *sb) {
 	struct cgroupfs_root *root = sb->s_fs_info;
 	struct cgroup *cgrp = &root->top_cgroup;
+	struct cgrp_cset_link *link, *tmp_link;
 	int ret;
-	struct cg_cgroup_link *link;
-	struct cg_cgroup_link *saved_link;
 
 	BUG_ON(!root);
 
@@ -1740,15 +1745,14 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	BUG_ON(ret);
 
 	/*
-	 * Release all the links from css_sets to this hierarchy's
+	 * Release all the links from cset_links to this hierarchy's
 	 * root cgroup
 	 */
 	write_lock(&css_set_lock);
 
-	list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
-				 cgrp_link_list) {
-		list_del(&link->cg_link_list);
-		list_del(&link->cgrp_link_list);
+	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
+		list_del(&link->cset_link);
+		list_del(&link->cgrp_link);
 		kfree(link);
 	}
 	write_unlock(&css_set_lock);
@@ -2908,12 +2912,11 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 int cgroup_task_count(const struct cgroup *cgrp)
 {
 	int count = 0;
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
 	read_lock(&css_set_lock);
-	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-		count += atomic_read(&link->cg->refcount);
-	}
+	list_for_each_entry(link, &cgrp->cset_links, cset_link)
+		count += atomic_read(&link->cset->refcount);
 	read_unlock(&css_set_lock);
 	return count;
 }
@@ -2922,24 +2925,23 @@ int cgroup_task_count(const struct cgroup *cgrp)
  * Advance a list_head iterator.  The iterator should be positioned at
  * the start of a css_set
  */
-static void cgroup_advance_iter(struct cgroup *cgrp,
-				struct cgroup_iter *it)
+static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it)
 {
-	struct list_head *l = it->cg_link;
-	struct cg_cgroup_link *link;
+	struct list_head *l = it->cset_link;
+	struct cgrp_cset_link *link;
 	struct css_set *cset;
 
 	/* Advance to the next non-empty css_set */
 	do {
 		l = l->next;
-		if (l == &cgrp->css_sets) {
-			it->cg_link = NULL;
+		if (l == &cgrp->cset_links) {
+			it->cset_link = NULL;
 			return;
 		}
-		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
-		cset = link->cg;
+		link = list_entry(l, struct cgrp_cset_link, cset_link);
+		cset = link->cset;
 	} while (list_empty(&cset->tasks));
-	it->cg_link = l;
+	it->cset_link = l;
 	it->task = cset->tasks.next;
 }
 
@@ -3160,7 +3162,7 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
 		cgroup_enable_task_cg_lists();
 
 	read_lock(&css_set_lock);
-	it->cg_link = &cgrp->css_sets;
+	it->cset_link = &cgrp->cset_links;
 	cgroup_advance_iter(cgrp, it);
 }
 
@@ -3169,16 +3171,16 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 {
 	struct task_struct *res;
 	struct list_head *l = it->task;
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
 	/* If the iterator cg is NULL, we have no tasks */
-	if (!it->cg_link)
+	if (!it->cset_link)
 		return NULL;
 	res = list_entry(l, struct task_struct, cg_list);
 	/* Advance iterator to find next entry */
 	l = l->next;
-	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
-	if (l == &link->cg->tasks) {
+	link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
+	if (l == &link->cset->tasks) {
 		/* We reached the end of this task list - move on to
 		 * the next cg_cgroup_link */
 		cgroup_advance_iter(cgrp, it);
@@ -4625,7 +4627,7 @@ EXPORT_SYMBOL_GPL(cgroup_load_subsys);
  */
 void cgroup_unload_subsys(struct cgroup_subsys *ss)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
 	BUG_ON(ss->module == NULL);
 
@@ -4654,8 +4656,8 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	 * in loading, we need to pay our respects to the hashtable gods.
 	 */
 	write_lock(&css_set_lock);
-	list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
-		struct css_set *cset = link->cg;
+	list_for_each_entry(link, &dummytop->cset_links, cset_link) {
+		struct css_set *cset = link->cset;
 		unsigned long key;
 
 		hash_del(&cset->hlist);
@@ -4688,7 +4690,7 @@ int __init cgroup_init_early(void)
 {
 	int i;
 	atomic_set(&init_css_set.refcount, 1);
-	INIT_LIST_HEAD(&init_css_set.cg_links);
+	INIT_LIST_HEAD(&init_css_set.cgrp_links);
 	INIT_LIST_HEAD(&init_css_set.tasks);
 	INIT_HLIST_NODE(&init_css_set.hlist);
 	css_set_count = 1;
@@ -4696,12 +4698,10 @@ int __init cgroup_init_early(void)
 	root_count = 1;
 	init_task.cgroups = &init_css_set;
 
-	init_css_set_link.cg = &init_css_set;
-	init_css_set_link.cgrp = dummytop;
-	list_add(&init_css_set_link.cgrp_link_list,
-		 &rootnode.top_cgroup.css_sets);
-	list_add(&init_css_set_link.cg_link_list,
-		 &init_css_set.cg_links);
+	init_cgrp_cset_link.cset = &init_css_set;
+	init_cgrp_cset_link.cgrp = dummytop;
+	list_add(&init_cgrp_cset_link.cset_link, &rootnode.top_cgroup.cset_links);
+	list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);
 
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
@@ -5454,13 +5454,13 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
 					 struct cftype *cft,
 					 struct seq_file *seq)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 	struct css_set *cset;
 
 	read_lock(&css_set_lock);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
-	list_for_each_entry(link, &cset->cg_links, cg_link_list) {
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
 		const char *name;
 
@@ -5481,11 +5481,11 @@ static int cgroup_css_links_read(struct cgroup *cont,
 				 struct cftype *cft,
 				 struct seq_file *seq)
 {
-	struct cg_cgroup_link *link;
+	struct cgrp_cset_link *link;
 
 	read_lock(&css_set_lock);
-	list_for_each_entry(link, &cont->css_sets, cgrp_link_list) {
-		struct css_set *cset = link->cg;
+	list_for_each_entry(link, &cont->cset_links, cset_link) {
+		struct css_set *cset = link->cset;
 		struct task_struct *task;
 		int count = 0;
 		seq_printf(seq, "css_set %p\n", cset);
-- 
cgit v1.3-7-g2ca7


From 5de0107e634ce862f16360139709d9d3a656463e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:52 -0700
Subject: cgroup: clean up css_[try]get() and css_put()

* __css_get() isn't used by anyone.  Fold it into css_get().

* Add proper function comments to all css reference functions.

This patch is purely cosmetic.

v2: Typo fix as per Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 48 ++++++++++++++++++++++++------------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0e32855edc92..a494636a34da 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -94,33 +94,31 @@ enum {
 	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
 };
 
-/* Caller must verify that the css is not for root cgroup */
-static inline void __css_get(struct cgroup_subsys_state *css, int count)
-{
-	atomic_add(count, &css->refcnt);
-}
-
-/*
- * Call css_get() to hold a reference on the css; it can be used
- * for a reference obtained via:
- * - an existing ref-counted reference to the css
- * - task->cgroups for a locked task
+/**
+ * css_get - obtain a reference on the specified css
+ * @css: target css
+ *
+ * The caller must already have a reference.
  */
-
 static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		__css_get(css, 1);
+		atomic_inc(&css->refcnt);
 }
 
-/*
- * Call css_tryget() to take a reference on a css if your existing
- * (known-valid) reference isn't already ref-counted. Returns false if
- * the css has been destroyed.
- */
-
 extern bool __css_tryget(struct cgroup_subsys_state *css);
+
+/**
+ * css_tryget - try to obtain a reference on the specified css
+ * @css: target css
+ *
+ * Obtain a reference on @css if it's alive.  The caller naturally needs to
+ * ensure that @css is accessible but doesn't have to be holding a
+ * reference on it - IOW, RCU protected access is good enough for this
+ * function.  Returns %true if a reference count was successfully obtained;
+ * %false otherwise.
+ */
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
@@ -128,12 +126,14 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
 	return __css_tryget(css);
 }
 
-/*
- * css_put() should be called to release a reference taken by
- * css_get() or css_tryget()
- */
-
 extern void __css_put(struct cgroup_subsys_state *css);
+
+/**
+ * css_put - put a css reference
+ * @css: target css
+ *
+ * Put a reference obtained via css_get() and css_tryget().
+ */
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-- 
cgit v1.3-7-g2ca7


From 54766d4a1d3d6f84ff8fa475cd8f165c0a0000eb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:53 -0700
Subject: cgroup: rename CGRP_REMOVED to CGRP_DEAD

We will add another flag indicating that the cgroup is in the process
of being killed.  REMOVING / REMOVED is more difficult to distinguish
and cgroup_is_removing()/cgroup_is_removed() are a bit awkward.  Also,
later percpu_ref usage will involve "kill"ing the refcnt.

 s/CGRP_REMOVED/CGRP_DEAD/
 s/cgroup_is_removed()/cgroup_is_dead()

This patch is purely cosmetic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h |  2 +-
 kernel/cgroup.c        | 30 ++++++++++++++----------------
 2 files changed, 15 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a494636a34da..c86a93abe83d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -143,7 +143,7 @@ static inline void css_put(struct cgroup_subsys_state *css)
 /* bits in struct cgroup flags field */
 enum {
 	/* Control Group is dead */
-	CGRP_REMOVED,
+	CGRP_DEAD,
 	/*
 	 * Control Group has previously had a child cgroup or a task,
 	 * but no longer (only if CGRP_NOTIFY_ON_RELEASE is set)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d86a8477d56a..84efb344fdf6 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -226,9 +226,9 @@ static int css_refcnt(struct cgroup_subsys_state *css)
 }
 
 /* convenient tests for these bits */
-static inline bool cgroup_is_removed(const struct cgroup *cgrp)
+static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
-	return test_bit(CGRP_REMOVED, &cgrp->flags);
+	return test_bit(CGRP_DEAD, &cgrp->flags);
 }
 
 /**
@@ -300,7 +300,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 static bool cgroup_lock_live_group(struct cgroup *cgrp)
 {
 	mutex_lock(&cgroup_mutex);
-	if (cgroup_is_removed(cgrp)) {
+	if (cgroup_is_dead(cgrp)) {
 		mutex_unlock(&cgroup_mutex);
 		return false;
 	}
@@ -892,7 +892,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	if (S_ISDIR(inode->i_mode)) {
 		struct cgroup *cgrp = dentry->d_fsdata;
 
-		BUG_ON(!(cgroup_is_removed(cgrp)));
+		BUG_ON(!(cgroup_is_dead(cgrp)));
 		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
 	} else {
 		struct cfent *cfe = __d_cfe(dentry);
@@ -2363,7 +2363,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-	if (cgroup_is_removed(cgrp))
+	if (cgroup_is_dead(cgrp))
 		return -ENODEV;
 	if (cft->write)
 		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -2408,7 +2408,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-	if (cgroup_is_removed(cgrp))
+	if (cgroup_is_dead(cgrp))
 		return -ENODEV;
 
 	if (cft->read)
@@ -2831,7 +2831,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
-		if (!cgroup_is_removed(cgrp))
+		if (!cgroup_is_dead(cgrp))
 			cgroup_addrm_files(cgrp, ss, cfts, is_add);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
@@ -2999,14 +2999,14 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
 	/*
 	 * @pos could already have been removed.  Once a cgroup is removed,
 	 * its ->sibling.next is no longer updated when its next sibling
-	 * changes.  As CGRP_REMOVED is set on removal which is fully
+	 * changes.  As CGRP_DEAD is set on removal which is fully
 	 * serialized, if we see it unasserted, it's guaranteed that the
 	 * next sibling hasn't finished its grace period even if it's
 	 * already removed, and thus safe to dereference from this RCU
 	 * critical section.  If ->sibling.next is inaccessible,
-	 * cgroup_is_removed() is guaranteed to be visible as %true here.
+	 * cgroup_is_dead() is guaranteed to be visible as %true here.
 	 */
-	if (likely(!cgroup_is_removed(pos))) {
+	if (likely(!cgroup_is_dead(pos))) {
 		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
 		if (&next->sibling != &pos->parent->children)
 			return next;
@@ -4383,7 +4383,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	 * attempts fail thus maintaining the removal conditions verified
 	 * above.
 	 *
-	 * Note that CGRP_REMVOED clearing is depended upon by
+	 * Note that CGRP_DEAD assertion is depended upon by
 	 * cgroup_next_sibling() to resume iteration after dropping RCU
 	 * read lock.  See cgroup_next_sibling() for details.
 	 */
@@ -4393,7 +4393,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
 	}
-	set_bit(CGRP_REMOVED, &cgrp->flags);
+	set_bit(CGRP_DEAD, &cgrp->flags);
 
 	/* tell subsystems to initate destruction */
 	for_each_subsys(cgrp->root, ss)
@@ -5063,7 +5063,7 @@ static void check_for_release(struct cgroup *cgrp)
 		int need_schedule_work = 0;
 
 		raw_spin_lock(&release_list_lock);
-		if (!cgroup_is_removed(cgrp) &&
+		if (!cgroup_is_dead(cgrp) &&
 		    list_empty(&cgrp->release_list)) {
 			list_add(&cgrp->release_list, &release_list);
 			need_schedule_work = 1;
@@ -5209,9 +5209,7 @@ __setup("cgroup_disable=", cgroup_disable);
  * Functons for CSS ID.
  */
 
-/*
- *To get ID other than 0, this should be called when !cgroup_is_removed().
- */
+/* to get ID other than 0, this should be called when !cgroup_is_dead() */
 unsigned short css_id(struct cgroup_subsys_state *css)
 {
 	struct css_id *cssid;
-- 
cgit v1.3-7-g2ca7


From 6f3d828f0fb7fdaffc6f32cb8a1cb7fcf8824598 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 21:04:55 -0700
Subject: cgroup: remove cgroup->count and use

cgroup->count tracks the number of css_sets associated with the cgroup
and used only to verify that no css_set is associated when the cgroup
is being destroyed.  It's superflous as the destruction path can
simply check whether cgroup->cset_links is empty instead.

Drop cgroup->count and check ->cset_links directly from
cgroup_destroy_locked().

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h |  6 ------
 kernel/cgroup.c        | 21 +++++----------------
 2 files changed, 5 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c86a93abe83d..81bfd0268e93 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -169,12 +169,6 @@ struct cgroup_name {
 struct cgroup {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 
-	/*
-	 * count users of this cgroup. >0 means busy, but doesn't
-	 * necessarily indicate the number of tasks in the cgroup
-	 */
-	atomic_t count;
-
 	int id;				/* ida allocated in-hierarchy ID */
 
 	/*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1a68241ca835..49bfd7b0bbda 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -408,8 +408,7 @@ static void __put_css_set(struct css_set *cset, int taskexit)
 		list_del(&link->cgrp_link);
 
 		/* @cgrp can't go away while we're holding css_set_lock */
-		if (atomic_dec_and_test(&cgrp->count) &&
-		    notify_on_release(cgrp)) {
+		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
 			if (taskexit)
 				set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
@@ -616,7 +615,6 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
 	link->cset = cset;
 	link->cgrp = cgrp;
-	atomic_inc(&cgrp->count);
 	list_move(&link->cset_link, &cgrp->cset_links);
 	/*
 	 * Always add links to the tail of the list so that the list
@@ -4370,11 +4368,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_mutex);
 
 	/*
-	 * css_set_lock prevents @cgrp from being removed while
-	 * __put_css_set() is in progress.
+	 * css_set_lock synchronizes access to ->cset_links and prevents
+	 * @cgrp from being removed while __put_css_set() is in progress.
 	 */
 	read_lock(&css_set_lock);
-	empty = !atomic_read(&cgrp->count) && list_empty(&cgrp->children);
+	empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
 	read_unlock(&css_set_lock);
 	if (!empty)
 		return -EBUSY;
@@ -5054,7 +5052,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 static void check_for_release(struct cgroup *cgrp)
 {
 	if (cgroup_is_releasable(cgrp) &&
-	    !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) {
+	    list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
 		/*
 		 * Control Group is currently removeable. If it's not
 		 * already queued for a userspace notification, queue
@@ -5422,11 +5420,6 @@ static void debug_css_free(struct cgroup *cont)
 	kfree(cont->subsys[debug_subsys_id]);
 }
 
-static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft)
-{
-	return atomic_read(&cont->count);
-}
-
 static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
 {
 	return cgroup_task_count(cont);
@@ -5507,10 +5500,6 @@ static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
 }
 
 static struct cftype debug_files[] =  {
-	{
-		.name = "cgroup_refcount",
-		.read_u64 = cgroup_refcount_read,
-	},
 	{
 		.name = "taskcount",
 		.read_u64 = debug_taskcount_read,
-- 
cgit v1.3-7-g2ca7


From acac7883ee7bcc32476963bce7baf73d44574dd1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 20:52:01 -0700
Subject: percpu-refcount: add __must_check to percpu_ref_init() and don't use
 ACCESS_ONCE() in percpu_ref_kill_rcu()

Two small changes.

* Unlike most init functions, percpu_ref_init() allocates memory and
  may fail.  Let's mark it with __must_check in case the caller
  forgets.

* percpu_ref_kill_rcu() is unnecessarily using ACCESS_ONCE() to
  dereference @ref->pcpu_count, which can be misleading.  The pointer
  is guaranteed to be valid and visible and can't change underneath
  the function.  Drop ACCESS_ONCE().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/percpu-refcount.h | 3 ++-
 lib/percpu-refcount.c           | 4 +---
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index b61bd6f23985..8146aa9cd89e 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -66,7 +66,8 @@ struct percpu_ref {
 	struct rcu_head		rcu;
 };
 
-int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release);
+int __must_check percpu_ref_init(struct percpu_ref *ref,
+				 percpu_ref_func_t *release);
 void percpu_ref_kill(struct percpu_ref *ref);
 
 #define PCPU_STATUS_BITS	2
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 9a78e55fa48f..b35eaac2954f 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -57,12 +57,10 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
 static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 {
 	struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
-	unsigned __percpu *pcpu_count;
+	unsigned __percpu *pcpu_count = ref->pcpu_count;
 	unsigned count = 0;
 	int cpu;
 
-	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
-
 	/* Mask out PCPU_REF_DEAD */
 	pcpu_count = (unsigned __percpu *)
 		(((unsigned long) pcpu_count) & ~PCPU_STATUS_MASK);
-- 
cgit v1.3-7-g2ca7


From bc497bd33b2d6a6f07bc8574b4764edbd7fdffa8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 12 Jun 2013 20:52:35 -0700
Subject: percpu-refcount: implement percpu_ref_cancel_init()

Normally, percpu_ref_init() initializes and percpu_ref_kill()
initiates destruction which completes asynchronously.  The
asynchronous destruction can be problematic in init failure path where
the caller wants to destroy half-constructed object - distinguishing
half-constructed objects from the usual release method can be painful
for complex objects.

This patch implements percpu_ref_cancel_init() which synchronously
destroys the percpu_ref without invoking release.  To avoid
unintentional misuses, the function requires the ref to have finished
percpu_ref_init() but never used and triggers WARN otherwise.

v2: Explain the weird name and usage restriction in the function
    comment.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Kent Overstreet <koverstreet@google.com>
---
 include/linux/percpu-refcount.h |  1 +
 lib/percpu-refcount.c           | 31 +++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 8146aa9cd89e..6d843d60690d 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -68,6 +68,7 @@ struct percpu_ref {
 
 int __must_check percpu_ref_init(struct percpu_ref *ref,
 				 percpu_ref_func_t *release);
+void percpu_ref_cancel_init(struct percpu_ref *ref);
 void percpu_ref_kill(struct percpu_ref *ref);
 
 #define PCPU_STATUS_BITS	2
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index b35eaac2954f..ebeaac274cb9 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -54,6 +54,37 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
 	return 0;
 }
 
+/**
+ * percpu_ref_cancel_init - cancel percpu_ref_init()
+ * @ref: percpu_ref to cancel init for
+ *
+ * Once a percpu_ref is initialized, its destruction is initiated by
+ * percpu_ref_kill() and completes asynchronously, which can be painful to
+ * do when destroying a half-constructed object in init failure path.
+ *
+ * This function destroys @ref without invoking @ref->release and the
+ * memory area containing it can be freed immediately on return.  To
+ * prevent accidental misuse, it's required that @ref has finished
+ * percpu_ref_init(), whether successful or not, but never used.
+ *
+ * The weird name and usage restriction are to prevent people from using
+ * this function by mistake for normal shutdown instead of
+ * percpu_ref_kill().
+ */
+void percpu_ref_cancel_init(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count = ref->pcpu_count;
+	int cpu;
+
+	WARN_ON_ONCE(atomic_read(&ref->count) != 1 + PCPU_COUNT_BIAS);
+
+	if (pcpu_count) {
+		for_each_possible_cpu(cpu)
+			WARN_ON_ONCE(*per_cpu_ptr(pcpu_count, cpu));
+		free_percpu(ref->pcpu_count);
+	}
+}
+
 static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 {
 	struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
-- 
cgit v1.3-7-g2ca7


From dbece3a0f1ef0b19aff1cc6ed0942fec9ab98de1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Jun 2013 19:23:53 -0700
Subject: percpu-refcount: implement percpu_tryget() along with
 percpu_ref_kill_and_confirm()

Implement percpu_tryget() which stops giving out references once the
percpu_ref is visible as killed.  Because the refcnt is per-cpu,
different CPUs will start to see a refcnt as killed at different
points in time and tryget() may continue to succeed on subset of cpus
for a while after percpu_ref_kill() returns.

For use cases where it's necessary to know when all CPUs start to see
the refcnt as dead, percpu_ref_kill_and_confirm() is added.  The new
function takes an extra argument @confirm_kill which is invoked when
the refcnt is guaranteed to be viewed as killed on all CPUs.

While this isn't the prettiest interface, it doesn't force synchronous
wait and is much safer than requiring the caller to do its own
call_rcu().

v2: Patch description rephrased to emphasize that tryget() may
    continue to succeed on some CPUs after kill() returns as suggested
    by Kent.

v3: Function comment in percpu_ref_kill_and_confirm() updated warning
    people to not depend on the implied RCU grace period from the
    confirm callback as it's an implementation detail.

Signed-off-by: Tejun Heo <tj@kernel.org>
Slightly-Grumpily-Acked-by: Kent Overstreet <koverstreet@google.com>
---
 include/linux/percpu-refcount.h | 50 ++++++++++++++++++++++++++++++++++++++++-
 lib/percpu-refcount.c           | 23 ++++++++++++++-----
 2 files changed, 66 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 6d843d60690d..dd2a08600453 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -63,13 +63,30 @@ struct percpu_ref {
 	 */
 	unsigned __percpu	*pcpu_count;
 	percpu_ref_func_t	*release;
+	percpu_ref_func_t	*confirm_kill;
 	struct rcu_head		rcu;
 };
 
 int __must_check percpu_ref_init(struct percpu_ref *ref,
 				 percpu_ref_func_t *release);
 void percpu_ref_cancel_init(struct percpu_ref *ref);
-void percpu_ref_kill(struct percpu_ref *ref);
+void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
+				 percpu_ref_func_t *confirm_kill);
+
+/**
+ * percpu_ref_kill - drop the initial ref
+ * @ref: percpu_ref to kill
+ *
+ * Must be used to drop the initial ref on a percpu refcount; must be called
+ * precisely once before shutdown.
+ *
+ * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
+ * percpu counters and dropping the initial ref.
+ */
+static inline void percpu_ref_kill(struct percpu_ref *ref)
+{
+	return percpu_ref_kill_and_confirm(ref, NULL);
+}
 
 #define PCPU_STATUS_BITS	2
 #define PCPU_STATUS_MASK	((1 << PCPU_STATUS_BITS) - 1)
@@ -100,6 +117,37 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 	rcu_read_unlock();
 }
 
+/**
+ * percpu_ref_tryget - try to increment a percpu refcount
+ * @ref: percpu_ref to try-get
+ *
+ * Increment a percpu refcount unless it has already been killed.  Returns
+ * %true on success; %false on failure.
+ *
+ * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget
+ * will fail.  For such guarantee, percpu_ref_kill_and_confirm() should be
+ * used.  After the confirm_kill callback is invoked, it's guaranteed that
+ * no new reference will be given out by percpu_ref_tryget().
+ */
+static inline bool percpu_ref_tryget(struct percpu_ref *ref)
+{
+	unsigned __percpu *pcpu_count;
+	int ret = false;
+
+	rcu_read_lock();
+
+	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
+
+	if (likely(REF_STATUS(pcpu_count) == PCPU_REF_PTR)) {
+		__this_cpu_inc(*pcpu_count);
+		ret = true;
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+
 /**
  * percpu_ref_put - decrement a percpu refcount
  * @ref: percpu_ref to put
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index ebeaac274cb9..8bf9e719cca0 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -118,6 +118,10 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 
 	atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count);
 
+	/* @ref is viewed as dead on all CPUs, send out kill confirmation */
+	if (ref->confirm_kill)
+		ref->confirm_kill(ref);
+
 	/*
 	 * Now we're in single atomic_t mode with a consistent refcount, so it's
 	 * safe to drop our initial ref:
@@ -126,22 +130,29 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 }
 
 /**
- * percpu_ref_kill - safely drop initial ref
+ * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation
  * @ref: percpu_ref to kill
+ * @confirm_kill: optional confirmation callback
  *
- * Must be used to drop the initial ref on a percpu refcount; must be called
- * precisely once before shutdown.
+ * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
+ * @confirm_kill is not NULL.  @confirm_kill, which may not block, will be
+ * called after @ref is seen as dead from all CPUs - all further
+ * invocations of percpu_ref_tryget() will fail.  See percpu_ref_tryget()
+ * for more details.
  *
- * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
- * percpu counters and dropping the initial ref.
+ * Due to the way percpu_ref is implemented, @confirm_kill will be called
+ * after at least one full RCU grace period has passed but this is an
+ * implementation detail and callers must not depend on it.
  */
-void percpu_ref_kill(struct percpu_ref *ref)
+void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
+				 percpu_ref_func_t *confirm_kill)
 {
 	WARN_ONCE(REF_STATUS(ref->pcpu_count) == PCPU_REF_DEAD,
 		  "percpu_ref_kill() called more than once!\n");
 
 	ref->pcpu_count = (unsigned __percpu *)
 		(((unsigned long) ref->pcpu_count)|PCPU_REF_DEAD);
+	ref->confirm_kill = confirm_kill;
 
 	call_rcu(&ref->rcu, percpu_ref_kill_rcu);
 }
-- 
cgit v1.3-7-g2ca7


From ea15f8ccdb430af1e8bc9b4e19a230eb4c356777 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Jun 2013 19:27:42 -0700
Subject: cgroup: split cgroup destruction into two steps

Split cgroup_destroy_locked() into two steps and put the latter half
into cgroup_offline_fn() which is executed from a work item.  The
latter half is responsible for offlining the css's, removing the
cgroup from internal lists, and propagating release notification to
the parent.  The separation is to allow using percpu refcnt for css.

Note that this allows for other cgroup operations to happen between
the first and second halves of destruction, including creating a new
cgroup with the same name.  As the target cgroup is marked DEAD in the
first half and cgroup internals don't care about the names of cgroups,
this should be fine.  A comment explaining this will be added by the
next patch which implements the actual percpu refcnting.

As RCU freeing is guaranteed to happen after the second step of
destruction, we can use the same work item for both.  This patch
renames cgroup->free_work to ->destroy_work and uses it for both
purposes.  INIT_WORK() is now performed right before queueing the work
item.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h |  2 +-
 kernel/cgroup.c        | 38 +++++++++++++++++++++++++++-----------
 2 files changed, 28 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 81bfd0268e93..e345d8b90046 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -233,7 +233,7 @@ struct cgroup {
 
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
-	struct work_struct free_work;
+	struct work_struct destroy_work;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5a1ddecc3cfa..df6814706cca 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -208,6 +208,7 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
@@ -830,7 +831,7 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
@@ -875,7 +876,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	schedule_work(&cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	schedule_work(&cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -1407,7 +1409,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
-	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -2991,12 +2992,13 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos)
 	/*
 	 * @pos could already have been removed.  Once a cgroup is removed,
 	 * its ->sibling.next is no longer updated when its next sibling
-	 * changes.  As CGRP_DEAD is set on removal which is fully
-	 * serialized, if we see it unasserted, it's guaranteed that the
-	 * next sibling hasn't finished its grace period even if it's
-	 * already removed, and thus safe to dereference from this RCU
-	 * critical section.  If ->sibling.next is inaccessible,
-	 * cgroup_is_dead() is guaranteed to be visible as %true here.
+	 * changes.  As CGRP_DEAD assertion is serialized and happens
+	 * before the cgroup is taken off the ->sibling list, if we see it
+	 * unasserted, it's guaranteed that the next sibling hasn't
+	 * finished its grace period even if it's already removed, and thus
+	 * safe to dereference from this RCU critical section.  If
+	 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
+	 * to be visible as %true here.
 	 */
 	if (likely(!cgroup_is_dead(pos))) {
 		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
@@ -4359,7 +4361,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct dentry *d = cgrp->dentry;
-	struct cgroup *parent = cgrp->parent;
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
 	bool empty;
@@ -4423,6 +4424,21 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	}
 	spin_unlock(&cgrp->event_list_lock);
 
+	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+	schedule_work(&cgrp->destroy_work);
+
+	return 0;
+};
+
+static void cgroup_offline_fn(struct work_struct *work)
+{
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
+	struct cgroup *parent = cgrp->parent;
+	struct dentry *d = cgrp->dentry;
+	struct cgroup_subsys *ss;
+
+	mutex_lock(&cgroup_mutex);
+
 	/* tell subsystems to initate destruction */
 	for_each_subsys(cgrp->root, ss)
 		offline_css(ss, cgrp);
@@ -4446,7 +4462,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	set_bit(CGRP_RELEASABLE, &parent->flags);
 	check_for_release(parent);
 
-	return 0;
+	mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
-- 
cgit v1.3-7-g2ca7


From d3daf28da16a30af95bfb303189a634a87606725 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Jun 2013 19:39:16 -0700
Subject: cgroup: use percpu refcnt for cgroup_subsys_states

A css (cgroup_subsys_state) is how each cgroup is represented to a
controller.  As such, it can be used in hot paths across the various
subsystems different controllers are associated with.

One of the common operations is reference counting, which up until now
has been implemented using a global atomic counter and can have
significant adverse impact on scalability.  For example, css refcnt
can be gotten and put multiple times by blkcg for each IO request.
For highops configurations which try to do as much per-cpu as
possible, the global frequent refcnting can be very expensive.

In general, given the various and hugely diverse paths css's end up
being used from, we need to make it cheap and highly scalable.  In its
usage, css refcnting isn't very different from module refcnting.

This patch converts css refcnting to use the recently added
percpu_ref.  css_get/tryget/put() directly maps to the matching
percpu_ref operations and the deactivation logic is no longer
necessary as percpu_ref already has refcnt killing.

The only complication is that as the refcnt is per-cpu,
percpu_ref_kill() in itself doesn't ensure that further tryget
operations will fail, which we need to guarantee before invoking
->css_offline()'s.  This is resolved collecting kill confirmation
using percpu_ref_kill_and_confirm() and initiating the offline phase
of destruction after all css refcnt's are confirmed to be seen as
killed on all CPUs.  The previous patches already splitted destruction
into two phases, so percpu_ref_kill_and_confirm() can be hooked up
easily.

This patch removes css_refcnt() which is used for rcu dereference
sanity check in css_id().  While we can add a percpu refcnt API to ask
the same question, css_id() itself is scheduled to be removed fairly
soon, so let's not bother with it.  Just drop the sanity check and use
rcu_dereference_raw() instead.

v2: - init_cgroup_css() was calling percpu_ref_init() without checking
      the return value.  This causes two problems - the obvious lack
      of error handling and percpu_ref_init() being called from
      cgroup_init_subsys() before the allocators are up, which
      triggers warnings but doesn't cause actual problems as the
      refcnt isn't used for roots anyway.  Fix both by moving
      percpu_ref_init() to cgroup_create().

    - The base references were put too early by
      percpu_ref_kill_and_confirm() and cgroup_offline_fn() put the
      refs one extra time.  This wasn't noticeable because css's go
      through another RCU grace period before being freed.  Update
      cgroup_destroy_locked() to grab an extra reference before
      killing the refcnts.  This problem was noticed by Kent.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Kent Overstreet <koverstreet@google.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: "Alasdair G. Kergon" <agk@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mikulas Patocka <mpatocka@redhat.com>
Cc: Glauber Costa <glommer@gmail.com>
---
 include/linux/cgroup.h |  23 +++----
 kernel/cgroup.c        | 165 +++++++++++++++++++++++++++++++------------------
 2 files changed, 112 insertions(+), 76 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e345d8b90046..b7bd4beae294 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
 #include <linux/fs.h>
+#include <linux/percpu-refcount.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
 	 */
 	struct cgroup *cgroup;
 
-	/*
-	 * State maintained by the cgroup system to allow subsystems
-	 * to be "busy". Should be accessed via css_get(),
-	 * css_tryget() and css_put().
-	 */
-
-	atomic_t refcnt;
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
 
 	unsigned long flags;
 	/* ID for this css, if possible */
@@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		atomic_inc(&css->refcnt);
+		percpu_ref_get(&css->refcnt);
 }
 
-extern bool __css_tryget(struct cgroup_subsys_state *css);
-
 /**
  * css_tryget - try to obtain a reference on the specified css
  * @css: target css
@@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
 		return true;
-	return __css_tryget(css);
+	return percpu_ref_tryget(&css->refcnt);
 }
 
-extern void __css_put(struct cgroup_subsys_state *css);
-
 /**
  * css_put - put a css reference
  * @css: target css
@@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-		__css_put(css);
+		percpu_ref_put(&css->refcnt);
 }
 
 /* bits in struct cgroup flags field */
@@ -231,9 +223,10 @@ struct cgroup {
 	struct list_head pidlists;
 	struct mutex pidlist_mutex;
 
-	/* For RCU-protected deletion */
+	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
+	atomic_t css_kill_cnt;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ebbfc043153f..2e9da7bf25cb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,9 +63,6 @@
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS		INT_MIN
-
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -213,19 +210,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
@@ -4139,12 +4123,19 @@ static void css_dput_fn(struct work_struct *work)
 	deactivate_super(sb);
 }
 
+static void css_release(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	schedule_work(&css->dput_work);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			       struct cgroup_subsys *ss,
 			       struct cgroup *cgrp)
 {
 	css->cgroup = cgrp;
-	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
 	css->id = NULL;
 	if (cgrp == dummytop)
@@ -4266,7 +4257,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			err = PTR_ERR(css);
 			goto err_free_all;
 		}
+
+		err = percpu_ref_init(&css->refcnt, css_release);
+		if (err)
+			goto err_free_all;
+
 		init_cgroup_css(css, ss, cgrp);
+
 		if (ss->use_id) {
 			err = alloc_css_id(ss, parent, cgrp);
 			if (err)
@@ -4331,8 +4328,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 err_free_all:
 	for_each_subsys(root, ss) {
-		if (cgrp->subsys[ss->subsys_id])
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+		if (css) {
+			percpu_ref_cancel_init(&css->refcnt);
 			ss->css_free(cgrp);
+		}
 	}
 	mutex_unlock(&cgroup_mutex);
 	/* Release the reference count that we took on the superblock */
@@ -4360,6 +4361,48 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static void cgroup_css_killed(struct cgroup *cgrp)
+{
+	if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
+		return;
+
+	/* percpu ref's of all css's are killed, kick off the next step */
+	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+	schedule_work(&cgrp->destroy_work);
+}
+
+static void css_ref_killed_fn(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	cgroup_css_killed(css->cgroup);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected.  Also, cgroup core needs to
+ * guarantee that css_tryget() won't succeed by the time ->css_offline() is
+ * invoked.  To satisfy all the requirements, destruction is implemented in
+ * the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
+ *     userland visible parts and start killing the percpu refcnts of
+ *     css's.  Set up so that the next stage will be kicked off once all
+ *     the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ *     rest of destruction.  Once all cgroup references are gone, the
+ *     cgroup is RCU-freed.
+ *
+ * This function implements s1.  After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created.  As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
@@ -4382,16 +4425,34 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 		return -EBUSY;
 
 	/*
-	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
-	 * removed.  This makes future css_tryget() attempts fail which we
-	 * guarantee to ->css_offline() callbacks.
+	 * Block new css_tryget() by killing css refcnts.  cgroup core
+	 * guarantees that, by the time ->css_offline() is invoked, no new
+	 * css reference will be given out via css_tryget().  We can't
+	 * simply call percpu_ref_kill() and proceed to offlining css's
+	 * because percpu_ref_kill() doesn't guarantee that the ref is seen
+	 * as killed on all CPUs on return.
+	 *
+	 * Use percpu_ref_kill_and_confirm() to get notifications as each
+	 * css is confirmed to be seen as killed on all CPUs.  The
+	 * notification callback keeps track of the number of css's to be
+	 * killed and schedules cgroup_offline_fn() to perform the rest of
+	 * destruction once the percpu refs of all css's are confirmed to
+	 * be killed.
 	 */
+	atomic_set(&cgrp->css_kill_cnt, 1);
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
-		WARN_ON(atomic_read(&css->refcnt) < 0);
-		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+		/*
+		 * Killing would put the base ref, but we need to keep it
+		 * alive until after ->css_offline.
+		 */
+		percpu_ref_get(&css->refcnt);
+
+		atomic_inc(&cgrp->css_kill_cnt);
+		percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
 	}
+	cgroup_css_killed(cgrp);
 
 	/*
 	 * Mark @cgrp dead.  This prevents further task migration and child
@@ -4427,12 +4488,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	}
 	spin_unlock(&cgrp->event_list_lock);
 
-	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
-	schedule_work(&cgrp->destroy_work);
-
 	return 0;
 };
 
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_free_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction.  This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
 static void cgroup_offline_fn(struct work_struct *work)
 {
 	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
@@ -4442,16 +4510,19 @@ static void cgroup_offline_fn(struct work_struct *work)
 
 	mutex_lock(&cgroup_mutex);
 
-	/* tell subsystems to initate destruction */
+	/*
+	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
+	 * initate destruction.
+	 */
 	for_each_subsys(cgrp->root, ss)
 		offline_css(ss, cgrp);
 
 	/*
-	 * Put all the base refs.  Each css holds an extra reference to the
-	 * cgroup's dentry and cgroup removal proceeds regardless of css
-	 * refs.  On the last put of each css, whenever that may be, the
-	 * extra dentry ref is put so that dentry destruction happens only
-	 * after all css's are released.
+	 * Put the css refs from cgroup_destroy_locked().  Each css holds
+	 * an extra reference to the cgroup's dentry and cgroup removal
+	 * proceeds regardless of css refs.  On the last put of each css,
+	 * whenever that may be, the extra dentry ref is put so that dentry
+	 * destruction happens only after all css's are released.
 	 */
 	for_each_subsys(cgrp->root, ss)
 		css_put(cgrp->subsys[ss->subsys_id]);
@@ -5100,34 +5171,6 @@ static void check_for_release(struct cgroup *cgrp)
 	}
 }
 
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-	while (true) {
-		int t, v;
-
-		v = css_refcnt(css);
-		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-		if (likely(t == v))
-			return true;
-		else if (t < 0)
-			return false;
-		cpu_relax();
-	}
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-	int v;
-
-	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-	if (v == 0)
-		schedule_work(&css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path
@@ -5245,7 +5288,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
+	cssid = rcu_dereference_raw(css->id);
 
 	if (cssid)
 		return cssid->id;
-- 
cgit v1.3-7-g2ca7


From f63674fd0d6afa1ba24309aee1f8c60195d39041 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 13 Jun 2013 19:58:38 -0700
Subject: cgroup: update sane_behavior documentation

f12dc02014 ("cgroup: mark "tasks" cgroup file as insane") and
cc5943a781 ("cgroup: mark "notify_on_release" and "release_agent"
cgroup files insane") forgot to update the changed behavior
documentation in cgroup.h.  Update it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b7bd4beae294..17604767adfd 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -264,13 +264,14 @@ enum {
 	 *
 	 * - Remount is disallowed.
 	 *
-	 * - memcg: use_hierarchy is on by default and the cgroup file for
-	 *   the flag is not created.
+	 * - "tasks" is removed.  Everything should be at process
+	 *   granularity.  Use "cgroup.procs" instead.
 	 *
-	 * The followings are planned changes.
+	 * - "release_agent" and "notify_on_release" are removed.
+	 *   Replacement notification mechanism will be implemented.
 	 *
-	 * - release_agent will be disallowed once replacement notification
-	 *   mechanism is implemented.
+	 * - memcg: use_hierarchy is on by default and the cgroup file for
+	 *   the flag is not created.
 	 */
 	CGRP_ROOT_SANE_BEHAVIOR	= (1 << 0),
 
-- 
cgit v1.3-7-g2ca7


From 3212b535f200c85b5a67cbfaea18431da71b5c72 Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@linaro.org>
Date: Tue, 23 Apr 2013 12:35:02 +0100
Subject: mm: hugetlb: Copy huge_pmd_share from x86 to mm.

Under x86, multiple puds can be made to reference the same bank of
huge pmds provided that they represent a full PUD_SIZE of shared
huge memory that is aligned to a PUD_SIZE boundary.

The code to share pmds does not require any architecture specific
knowledge other than the fact that pmds can be indexed, thus can
be beneficial to some other architectures.

This patch copies the huge pmd sharing (and unsharing) logic from
x86/ to mm/ and introduces a new config option to activate it:
CONFIG_ARCH_WANTS_HUGE_PMD_SHARE

Signed-off-by: Steve Capper <steve.capper@linaro.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/hugetlb.h |   4 ++
 mm/hugetlb.c            | 122 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 126 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b4890fa57e7..981546ad231c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -69,6 +69,10 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 int dequeue_hwpoisoned_huge_page(struct page *page);
 void copy_huge_page(struct page *dst, struct page *src);
 
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud);
+#endif
+
 extern unsigned long hugepages_treat_as_movable;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
 extern int sysctl_hugetlb_shm_group;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f8feeeca6686..b0bfb292350e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3169,6 +3169,128 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+				struct vm_area_struct *vma,
+				unsigned long addr, pgoff_t idx)
+{
+	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+				svma->vm_start;
+	unsigned long sbase = saddr & PUD_MASK;
+	unsigned long s_end = sbase + PUD_SIZE;
+
+	/* Allow segments to share if only one is marked locked */
+	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
+	/*
+	 * match the virtual addresses, permission and the alignment of the
+	 * page table page.
+	 */
+	if (pmd_index(addr) != pmd_index(saddr) ||
+	    vm_flags != svm_flags ||
+	    sbase < svma->vm_start || svma->vm_end < s_end)
+		return 0;
+
+	return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long base = addr & PUD_MASK;
+	unsigned long end = base + PUD_SIZE;
+
+	/*
+	 * check on proper vm_flags and page table alignment
+	 */
+	if (vma->vm_flags & VM_MAYSHARE &&
+	    vma->vm_start <= base && end <= vma->vm_end)
+		return 1;
+	return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+	struct vm_area_struct *vma = find_vma(mm, addr);
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+			vma->vm_pgoff;
+	struct vm_area_struct *svma;
+	unsigned long saddr;
+	pte_t *spte = NULL;
+	pte_t *pte;
+
+	if (!vma_shareable(vma, addr))
+		return (pte_t *)pmd_alloc(mm, pud, addr);
+
+	mutex_lock(&mapping->i_mmap_mutex);
+	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+		if (svma == vma)
+			continue;
+
+		saddr = page_table_shareable(svma, vma, addr, idx);
+		if (saddr) {
+			spte = huge_pte_offset(svma->vm_mm, saddr);
+			if (spte) {
+				get_page(virt_to_page(spte));
+				break;
+			}
+		}
+	}
+
+	if (!spte)
+		goto out;
+
+	spin_lock(&mm->page_table_lock);
+	if (pud_none(*pud))
+		pud_populate(mm, pud,
+				(pmd_t *)((unsigned long)spte & PAGE_MASK));
+	else
+		put_page(virt_to_page(spte));
+	spin_unlock(&mm->page_table_lock);
+out:
+	pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	mutex_unlock(&mapping->i_mmap_mutex);
+	return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *	    0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	pgd_t *pgd = pgd_offset(mm, *addr);
+	pud_t *pud = pud_offset(pgd, *addr);
+
+	BUG_ON(page_count(virt_to_page(ptep)) == 0);
+	if (page_count(virt_to_page(ptep)) == 1)
+		return 0;
+
+	pud_clear(pud);
+	put_page(virt_to_page(ptep));
+	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+	return 1;
+}
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
 #ifdef CONFIG_MEMORY_FAILURE
 
 /* Should be called in hugetlb_lock */
-- 
cgit v1.3-7-g2ca7


From dfa5e237e935bc6712c9ac2f52a5469a0df85bcf Mon Sep 17 00:00:00 2001
From: Steve Capper <steve.capper@linaro.org>
Date: Tue, 7 May 2013 14:46:03 +0100
Subject: mm: thp: Correct the HPAGE_PMD_ORDER check.

All Transparent Huge Pages are allocated by the buddy allocator.

A compile time check is in place that fails when the order of a
transparent huge page is too large to be allocated by the buddy
allocator. Unfortunately that compile time check passes when:
HPAGE_PMD_ORDER == MAX_ORDER
( which is incorrect as the buddy allocator can only allocate
memory of order strictly less than MAX_ORDER. )

This patch updates the compile time check to fail in the above
case.

Signed-off-by: Steve Capper <steve.capper@linaro.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/huge_mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 528454c2caa9..26ee56c80dc7 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -123,7 +123,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
 	} while (0)
 extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd);
-#if HPAGE_PMD_ORDER > MAX_ORDER
+#if HPAGE_PMD_ORDER >= MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
 extern int hugepage_madvise(struct vm_area_struct *vma,
-- 
cgit v1.3-7-g2ca7


From ba492e900704ba00d43c7af9d94b00da4df52587 Mon Sep 17 00:00:00 2001
From: Haojian Zhuang <haojian.zhuang@linaro.org>
Date: Sat, 8 Jun 2013 22:47:17 +0800
Subject: clk: mux: add CLK_MUX_HIWORD_MASK

In both Hisilicon & Rockchip Cortex-A9 based chips, they don't use the
paradigm of reading-changing-writing the register contents.
Instead they use a hiword mask to indicate the changed bits.

When b01 should be set as switching mux, it also needs to indicate
the change by setting hiword mask (b11 << 16).

The patch adds mux flag for this usage.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: Haojian Zhuang <haojian.zhuang@linaro.org>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 drivers/clk/clk-mux.c        | 17 +++++++++++++++--
 include/linux/clk-provider.h |  5 +++++
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c
index 25b1734560d0..614444ca40cd 100644
--- a/drivers/clk/clk-mux.c
+++ b/drivers/clk/clk-mux.c
@@ -86,8 +86,12 @@ static int clk_mux_set_parent(struct clk_hw *hw, u8 index)
 	if (mux->lock)
 		spin_lock_irqsave(mux->lock, flags);
 
-	val = readl(mux->reg);
-	val &= ~(mux->mask << mux->shift);
+	if (mux->flags & CLK_MUX_HIWORD_MASK) {
+		val = mux->mask << (mux->shift + 16);
+	} else {
+		val = readl(mux->reg);
+		val &= ~(mux->mask << mux->shift);
+	}
 	val |= index << mux->shift;
 	writel(val, mux->reg);
 
@@ -111,6 +115,15 @@ struct clk *clk_register_mux_table(struct device *dev, const char *name,
 	struct clk_mux *mux;
 	struct clk *clk;
 	struct clk_init_data init;
+	u8 width = 0;
+
+	if (clk_mux_flags & CLK_MUX_HIWORD_MASK) {
+		width = fls(mask) - ffs(mask) + 1;
+		if (width + shift > 16) {
+			pr_err("mux value exceeds LOWORD field\n");
+			return ERR_PTR(-EINVAL);
+		}
+	}
 
 	/* allocate the mux */
 	mux = kzalloc(sizeof(struct clk_mux), GFP_KERNEL);
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 265f384f1e01..37ad97961e5a 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -299,6 +299,10 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name,
  * Flags:
  * CLK_MUX_INDEX_ONE - register index starts at 1, not 0
  * CLK_MUX_INDEX_BIT - register index is a single bit (power of two)
+ * CLK_MUX_HIWORD_MASK - The mux settings are only in lower 16-bit of this
+ *   register, and mask of mux bits are in higher 16-bit of this register.
+ *   While setting the mux bits, higher 16-bit should also be updated to
+ *   indicate changing mux bits.
  */
 struct clk_mux {
 	struct clk_hw	hw;
@@ -312,6 +316,7 @@ struct clk_mux {
 
 #define CLK_MUX_INDEX_ONE		BIT(0)
 #define CLK_MUX_INDEX_BIT		BIT(1)
+#define CLK_MUX_HIWORD_MASK		BIT(2)
 
 extern const struct clk_ops clk_mux_ops;
 
-- 
cgit v1.3-7-g2ca7


From d57dfe7508af2b528e26d84792edec1e7d919682 Mon Sep 17 00:00:00 2001
From: Haojian Zhuang <haojian.zhuang@linaro.org>
Date: Sat, 8 Jun 2013 22:47:18 +0800
Subject: clk: divider: add CLK_DIVIDER_HIWORD_MASK flag

In both Hisilicon & Rockchip Cortex-A9 based chips, they don't use the
paradigm of reading-changing-writing the register contents.
Instead they use a hiword mask to indicate the changed bits.

When b01 should be set as setting divider, it also needs to indicate
the change by setting hiword mask (b11 << 16).

The patch adds divider flag for this usage.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: Haojian Zhuang <haojian.zhuang@linaro.org>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 drivers/clk/clk-divider.c    | 15 +++++++++++++--
 include/linux/clk-provider.h |  5 +++++
 2 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c
index 6024e60e49aa..6d55eb2cb959 100644
--- a/drivers/clk/clk-divider.c
+++ b/drivers/clk/clk-divider.c
@@ -227,8 +227,12 @@ static int clk_divider_set_rate(struct clk_hw *hw, unsigned long rate,
 	if (divider->lock)
 		spin_lock_irqsave(divider->lock, flags);
 
-	val = readl(divider->reg);
-	val &= ~(div_mask(divider) << divider->shift);
+	if (divider->flags & CLK_DIVIDER_HIWORD_MASK) {
+		val = div_mask(divider) << (divider->shift + 16);
+	} else {
+		val = readl(divider->reg);
+		val &= ~(div_mask(divider) << divider->shift);
+	}
 	val |= value << divider->shift;
 	writel(val, divider->reg);
 
@@ -255,6 +259,13 @@ static struct clk *_register_divider(struct device *dev, const char *name,
 	struct clk *clk;
 	struct clk_init_data init;
 
+	if (clk_divider_flags & CLK_DIVIDER_HIWORD_MASK) {
+		if (width + shift > 16) {
+			pr_warn("divider value exceeds LOWORD field\n");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
 	/* allocate the divider */
 	div = kzalloc(sizeof(struct clk_divider), GFP_KERNEL);
 	if (!div) {
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index 37ad97961e5a..d77f1267f419 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -257,6 +257,10 @@ struct clk_div_table {
  *	Some hardware implementations gracefully handle this case and allow a
  *	zero divisor by not modifying their input clock
  *	(divide by one / bypass).
+ * CLK_DIVIDER_HIWORD_MASK - The divider settings are only in lower 16-bit
+ *   of this register, and mask of divider bits are in higher 16-bit of this
+ *   register.  While setting the divider bits, higher 16-bit should also be
+ *   updated to indicate changing divider bits.
  */
 struct clk_divider {
 	struct clk_hw	hw;
@@ -271,6 +275,7 @@ struct clk_divider {
 #define CLK_DIVIDER_ONE_BASED		BIT(0)
 #define CLK_DIVIDER_POWER_OF_TWO	BIT(1)
 #define CLK_DIVIDER_ALLOW_ZERO		BIT(2)
+#define CLK_DIVIDER_HIWORD_MASK		BIT(3)
 
 extern const struct clk_ops clk_divider_ops;
 struct clk *clk_register_divider(struct device *dev, const char *name,
-- 
cgit v1.3-7-g2ca7


From 045779942c04646a222289989e6a5b617dfdedf7 Mon Sep 17 00:00:00 2001
From: Haojian Zhuang <haojian.zhuang@linaro.org>
Date: Sat, 8 Jun 2013 22:47:19 +0800
Subject: clk: gate: add CLK_GATE_HIWORD_MASK

In Rockchip Cortex-A9 based chips, they don't use paradigm of
reading-changing-writing the register contents.  Instead they
use a hiword mask to indicate the changed bits.

When b1 should be set as gate, it also needs to indicate the change
by setting hiword mask (b1 << 16).

The patch adds gate flag for this usage.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Signed-off-by: Haojian Zhuang <haojian.zhuang@linaro.org>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 drivers/clk/clk-gate.c       | 25 +++++++++++++++++++------
 include/linux/clk-provider.h |  5 +++++
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/clk/clk-gate.c b/drivers/clk/clk-gate.c
index 15114febfd92..790306e921c8 100644
--- a/drivers/clk/clk-gate.c
+++ b/drivers/clk/clk-gate.c
@@ -53,12 +53,18 @@ static void clk_gate_endisable(struct clk_hw *hw, int enable)
 	if (gate->lock)
 		spin_lock_irqsave(gate->lock, flags);
 
-	reg = readl(gate->reg);
-
-	if (set)
-		reg |= BIT(gate->bit_idx);
-	else
-		reg &= ~BIT(gate->bit_idx);
+	if (gate->flags & CLK_GATE_HIWORD_MASK) {
+		reg = BIT(gate->bit_idx + 16);
+		if (set)
+			reg |= BIT(gate->bit_idx);
+	} else {
+		reg = readl(gate->reg);
+
+		if (set)
+			reg |= BIT(gate->bit_idx);
+		else
+			reg &= ~BIT(gate->bit_idx);
+	}
 
 	writel(reg, gate->reg);
 
@@ -121,6 +127,13 @@ struct clk *clk_register_gate(struct device *dev, const char *name,
 	struct clk *clk;
 	struct clk_init_data init;
 
+	if (clk_gate_flags & CLK_GATE_HIWORD_MASK) {
+		if (bit_idx > 16) {
+			pr_err("gate bit exceeds LOWORD field\n");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
 	/* allocate the gate */
 	gate = kzalloc(sizeof(struct clk_gate), GFP_KERNEL);
 	if (!gate) {
diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h
index d77f1267f419..1ec14a732176 100644
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -210,6 +210,10 @@ void of_fixed_clk_setup(struct device_node *np);
  * CLK_GATE_SET_TO_DISABLE - by default this clock sets the bit at bit_idx to
  * 	enable the clock.  Setting this flag does the opposite: setting the bit
  * 	disable the clock and clearing it enables the clock
+ * CLK_GATE_HIWORD_MASK - The gate settings are only in lower 16-bit
+ *   of this register, and mask of gate bits are in higher 16-bit of this
+ *   register.  While setting the gate bits, higher 16-bit should also be
+ *   updated to indicate changing gate bits.
  */
 struct clk_gate {
 	struct clk_hw hw;
@@ -220,6 +224,7 @@ struct clk_gate {
 };
 
 #define CLK_GATE_SET_TO_DISABLE		BIT(0)
+#define CLK_GATE_HIWORD_MASK		BIT(1)
 
 extern const struct clk_ops clk_gate_ops;
 struct clk *clk_register_gate(struct device *dev, const char *name,
-- 
cgit v1.3-7-g2ca7


From a2df4269cad79635201587c5c5404f0b1cb0b05c Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Fri, 24 May 2013 17:21:12 +0100
Subject: pinconf-generic: add BIAS_BUS_HOLD pinconf

Add a new PIN_CONFIG_BIAS_BUS_HOLD pin configuration for a bus holder
pin mode (also known as bus keeper, or repeater). This is a weak latch
which drives the last value on a tristate bus. Another device on the bus
can drive the bus high or low before going tristate to change the value
driven by the pin.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinconf-generic.c       | 1 +
 include/linux/pinctrl/pinconf-generic.h | 6 ++++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index 33a5392cf40c..ba465b3e85ef 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -37,6 +37,7 @@ struct pin_config_item {
 static struct pin_config_item conf_items[] = {
 	PCONFDUMP(PIN_CONFIG_BIAS_DISABLE, "input bias disabled", NULL),
 	PCONFDUMP(PIN_CONFIG_BIAS_HIGH_IMPEDANCE, "input bias high impedance", NULL),
+	PCONFDUMP(PIN_CONFIG_BIAS_BUS_HOLD, "input bias bus hold", NULL),
 	PCONFDUMP(PIN_CONFIG_BIAS_PULL_UP, "input bias pull up", NULL),
 	PCONFDUMP(PIN_CONFIG_BIAS_PULL_DOWN, "input bias pull down", NULL),
 	PCONFDUMP(PIN_CONFIG_DRIVE_PUSH_PULL, "output drive push pull", NULL),
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 6aa238096622..ac05b3cfeacc 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -29,6 +29,11 @@
  *	if for example some other pin is going to drive the signal connected
  *	to it for a while. Pins used for input are usually always high
  *	impedance.
+ * @PIN_CONFIG_BIAS_BUS_HOLD: the pin will be set to weakly latch so that it
+ *	weakly drives the last value on a tristate bus, also known as a "bus
+ *	holder", "bus keeper" or "repeater". This allows another device on the
+ *	bus to change the value by driving the bus high or low and switching to
+ *	tristate. The argument is ignored.
  * @PIN_CONFIG_BIAS_PULL_UP: the pin will be pulled up (usually with high
  *	impedance to VDD). If the argument is != 0 pull-up is enabled,
  *	if it is 0, pull-up is disabled.
@@ -78,6 +83,7 @@
 enum pin_config_param {
 	PIN_CONFIG_BIAS_DISABLE,
 	PIN_CONFIG_BIAS_HIGH_IMPEDANCE,
+	PIN_CONFIG_BIAS_BUS_HOLD,
 	PIN_CONFIG_BIAS_PULL_UP,
 	PIN_CONFIG_BIAS_PULL_DOWN,
 	PIN_CONFIG_DRIVE_PUSH_PULL,
-- 
cgit v1.3-7-g2ca7


From 7970cb770dffa23cb20a36f46602e688e075f5d9 Mon Sep 17 00:00:00 2001
From: Heiko Stübner <heiko@sntech.de>
Date: Thu, 6 Jun 2013 16:44:25 +0200
Subject: pinctrl: add pinconf-generic define for a pin-default pull

There exist controllers that don't support to set the pull to up or down
separately but instead automatically set the pull direction based on
embedded knowledge inside the controller, for example depending on the
selected mux function of the pin.

Therefore this patch adds another config option to use this default
pull-state for a pin where it is not possible to know or decide if the
pin will be pulled up or down.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/pinconf-generic.c       | 2 ++
 include/linux/pinctrl/pinconf-generic.h | 5 +++++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index ba465b3e85ef..9a6812b50a32 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -40,6 +40,8 @@ static struct pin_config_item conf_items[] = {
 	PCONFDUMP(PIN_CONFIG_BIAS_BUS_HOLD, "input bias bus hold", NULL),
 	PCONFDUMP(PIN_CONFIG_BIAS_PULL_UP, "input bias pull up", NULL),
 	PCONFDUMP(PIN_CONFIG_BIAS_PULL_DOWN, "input bias pull down", NULL),
+	PCONFDUMP(PIN_CONFIG_BIAS_PULL_PIN_DEFAULT,
+				"input bias pull to pin specific state", NULL),
 	PCONFDUMP(PIN_CONFIG_DRIVE_PUSH_PULL, "output drive push pull", NULL),
 	PCONFDUMP(PIN_CONFIG_DRIVE_OPEN_DRAIN, "output drive open drain", NULL),
 	PCONFDUMP(PIN_CONFIG_DRIVE_OPEN_SOURCE, "output drive open source", NULL),
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index ac05b3cfeacc..d414a7729424 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -40,6 +40,10 @@
  * @PIN_CONFIG_BIAS_PULL_DOWN: the pin will be pulled down (usually with high
  *	impedance to GROUND). If the argument is != 0 pull-down is enabled,
  *	if it is 0, pull-down is disabled.
+ * @PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: the pin will be pulled up or down based
+ *	on embedded knowledge of the controller, like current mux function.
+ *	If the argument is != 0 pull up/down is enabled, if it is 0,
+ *	the pull is disabled.
  * @PIN_CONFIG_DRIVE_PUSH_PULL: the pin will be driven actively high and
  *	low, this is the most typical case and is typically achieved with two
  *	active transistors on the output. Setting this config will enable
@@ -86,6 +90,7 @@ enum pin_config_param {
 	PIN_CONFIG_BIAS_BUS_HOLD,
 	PIN_CONFIG_BIAS_PULL_UP,
 	PIN_CONFIG_BIAS_PULL_DOWN,
+	PIN_CONFIG_BIAS_PULL_PIN_DEFAULT,
 	PIN_CONFIG_DRIVE_PUSH_PULL,
 	PIN_CONFIG_DRIVE_OPEN_DRAIN,
 	PIN_CONFIG_DRIVE_OPEN_SOURCE,
-- 
cgit v1.3-7-g2ca7


From 14005ee270cad7078adbce6b7f3687b992a8334e Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 5 Jun 2013 15:30:33 +0200
Subject: drivers: pinctrl sleep and idle states in the core

If a device have sleep and idle states in addition to the
default state, look up these in the core and stash them in
the pinctrl state container.

Add accessor functions for pinctrl consumers to put the pins
into "default", "sleep" and "idle" states passing nothing but
the struct device * affected.

Solution suggested by Kevin Hilman, Mark Brown and Dmitry
Torokhov in response to a patch series from Hebbar
Gururaja.

Cc: Hebbar Gururaja <gururaja.hebbar@ti.com>
Cc: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Stephen Warren <swarren@wwwdotorg.org>
Acked-by: Wolfram Sang <wsa@the-dreams.de>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/base/pinctrl.c           | 19 +++++++++++++
 drivers/pinctrl/core.c           | 61 ++++++++++++++++++++++++++++++++++++++++
 include/linux/pinctrl/consumer.h | 34 ++++++++++++++++++++++
 include/linux/pinctrl/devinfo.h  |  4 +++
 4 files changed, 118 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/pinctrl.c b/drivers/base/pinctrl.c
index 67a274e86727..5fb74b43848e 100644
--- a/drivers/base/pinctrl.c
+++ b/drivers/base/pinctrl.c
@@ -48,6 +48,25 @@ int pinctrl_bind_pins(struct device *dev)
 		goto cleanup_get;
 	}
 
+#ifdef CONFIG_PM
+	/*
+	 * If power management is enabled, we also look for the optional
+	 * sleep and idle pin states, with semantics as defined in
+	 * <linux/pinctrl/pinctrl-state.h>
+	 */
+	dev->pins->sleep_state = pinctrl_lookup_state(dev->pins->p,
+					PINCTRL_STATE_SLEEP);
+	if (IS_ERR(dev->pins->sleep_state))
+		/* Not supplying this state is perfectly legal */
+		dev_dbg(dev, "no sleep pinctrl state\n");
+
+	dev->pins->idle_state = pinctrl_lookup_state(dev->pins->p,
+					PINCTRL_STATE_IDLE);
+	if (IS_ERR(dev->pins->idle_state))
+		/* Not supplying this state is perfectly legal */
+		dev_dbg(dev, "no idle pinctrl state\n");
+#endif
+
 	return 0;
 
 	/*
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index 1f9608bd237e..dca9208f5463 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -1196,6 +1196,67 @@ int pinctrl_force_default(struct pinctrl_dev *pctldev)
 }
 EXPORT_SYMBOL_GPL(pinctrl_force_default);
 
+#ifdef CONFIG_PM
+
+/**
+ * pinctrl_pm_select_default_state() - select default pinctrl state for PM
+ * @dev: device to select default state for
+ */
+int pinctrl_pm_select_default_state(struct device *dev)
+{
+	struct dev_pin_info *pins = dev->pins;
+	int ret;
+
+	if (!pins)
+		return 0;
+	if (IS_ERR(pins->default_state))
+		return 0; /* No default state */
+	ret = pinctrl_select_state(pins->p, pins->default_state);
+	if (ret)
+		dev_err(dev, "failed to activate default pinctrl state\n");
+	return ret;
+}
+
+/**
+ * pinctrl_pm_select_sleep_state() - select sleep pinctrl state for PM
+ * @dev: device to select sleep state for
+ */
+int pinctrl_pm_select_sleep_state(struct device *dev)
+{
+	struct dev_pin_info *pins = dev->pins;
+	int ret;
+
+	if (!pins)
+		return 0;
+	if (IS_ERR(pins->sleep_state))
+		return 0; /* No sleep state */
+	ret = pinctrl_select_state(pins->p, pins->sleep_state);
+	if (ret)
+		dev_err(dev, "failed to activate pinctrl sleep state\n");
+	return ret;
+}
+
+/**
+ * pinctrl_pm_select_idle_state() - select idle pinctrl state for PM
+ * @dev: device to select idle state for
+ */
+int pinctrl_pm_select_idle_state(struct device *dev)
+{
+	struct dev_pin_info *pins = dev->pins;
+	int ret;
+
+	if (!pins)
+		return 0;
+	if (IS_ERR(pins->idle_state))
+		return 0; /* No idle state */
+	ret = pinctrl_select_state(pins->p, pins->idle_state);
+	if (ret)
+		dev_err(dev, "failed to activate pinctrl idle state\n");
+	return ret;
+}
+
+#endif
+
 #ifdef CONFIG_DEBUG_FS
 
 static int pinctrl_pins_show(struct seq_file *s, void *what)
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 4aad3cea69ae..0f32f10e347d 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -40,6 +40,25 @@ extern int pinctrl_select_state(struct pinctrl *p, struct pinctrl_state *s);
 extern struct pinctrl * __must_check devm_pinctrl_get(struct device *dev);
 extern void devm_pinctrl_put(struct pinctrl *p);
 
+#ifdef CONFIG_PM
+extern int pinctrl_pm_select_default_state(struct device *dev);
+extern int pinctrl_pm_select_sleep_state(struct device *dev);
+extern int pinctrl_pm_select_idle_state(struct device *dev);
+#else
+static inline int pinctrl_pm_select_default_state(struct device *dev)
+{
+	return 0;
+}
+static inline int pinctrl_pm_select_sleep_state(struct device *dev)
+{
+	return 0;
+}
+static inline int pinctrl_pm_select_idle_state(struct device *dev)
+{
+	return 0;
+}
+#endif
+
 #else /* !CONFIG_PINCTRL */
 
 static inline int pinctrl_request_gpio(unsigned gpio)
@@ -199,6 +218,21 @@ static inline int pin_config_group_set(const char *dev_name,
 	return 0;
 }
 
+static inline int pinctrl_pm_select_default_state(struct device *dev)
+{
+	return 0;
+}
+
+static inline int pinctrl_pm_select_sleep_state(struct device *dev)
+{
+	return 0;
+}
+
+static inline int pinctrl_pm_select_idle_state(struct device *dev)
+{
+	return 0;
+}
+
 #endif
 
 #endif /* __LINUX_PINCTRL_CONSUMER_H */
diff --git a/include/linux/pinctrl/devinfo.h b/include/linux/pinctrl/devinfo.h
index 6e5f8a985ea7..281cb91ddcf5 100644
--- a/include/linux/pinctrl/devinfo.h
+++ b/include/linux/pinctrl/devinfo.h
@@ -28,6 +28,10 @@
 struct dev_pin_info {
 	struct pinctrl *p;
 	struct pinctrl_state *default_state;
+#ifdef CONFIG_PM
+	struct pinctrl_state *sleep_state;
+	struct pinctrl_state *idle_state;
+#endif
 };
 
 extern int pinctrl_bind_pins(struct device *dev);
-- 
cgit v1.3-7-g2ca7


From a4244454df1296e90cc961c1b636b1176ef0d9a0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 16 Jun 2013 16:12:26 -0700
Subject: percpu-refcount: use RCU-sched insted of normal RCU

percpu-refcount was incorrectly using preempt_disable/enable() for RCU
critical sections against call_rcu().  6a24474da8 ("percpu-refcount:
consistently use plain (non-sched) RCU") fixed it by converting the
preepmtion operations with rcu_read_[un]lock() citing that there isn't
any advantage in using sched-RCU over using the usual one; however,
rcu_read_[un]lock() for the preemptible RCU implementation -
CONFIG_TREE_PREEMPT_RCU, chosen when CONFIG_PREEMPT - are slightly
more expensive than preempt_disable/enable().

In a contrived microbench which repeats the followings,

 - percpu_ref_get()
 - copy 32 bytes of data into percpu buffer
 - percpu_put_get()
 - copy 32 bytes of data into percpu buffer

rcu_read_[un]lock() used in percpu_ref_get/put() makes it go slower by
about 15% when compared to using sched-RCU.

As the RCU critical sections are extremely short, using sched-RCU
shouldn't have any latency implications.  Convert to RCU-sched.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Kent Overstreet <koverstreet@google.com>
Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/percpu-refcount.h | 12 ++++++------
 lib/percpu-refcount.c           |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index dd2a08600453..95961f0bf62d 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -105,7 +105,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 {
 	unsigned __percpu *pcpu_count;
 
-	rcu_read_lock();
+	rcu_read_lock_sched();
 
 	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
 
@@ -114,7 +114,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
 	else
 		atomic_inc(&ref->count);
 
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 }
 
 /**
@@ -134,7 +134,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 	unsigned __percpu *pcpu_count;
 	int ret = false;
 
-	rcu_read_lock();
+	rcu_read_lock_sched();
 
 	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
 
@@ -143,7 +143,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
 		ret = true;
 	}
 
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 
 	return ret;
 }
@@ -159,7 +159,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref)
 {
 	unsigned __percpu *pcpu_count;
 
-	rcu_read_lock();
+	rcu_read_lock_sched();
 
 	pcpu_count = ACCESS_ONCE(ref->pcpu_count);
 
@@ -168,7 +168,7 @@ static inline void percpu_ref_put(struct percpu_ref *ref)
 	else if (unlikely(atomic_dec_and_test(&ref->count)))
 		ref->release(ref);
 
-	rcu_read_unlock();
+	rcu_read_unlock_sched();
 }
 
 #endif
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 8bf9e719cca0..7deeb6297a48 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -154,5 +154,5 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 		(((unsigned long) ref->pcpu_count)|PCPU_REF_DEAD);
 	ref->confirm_kill = confirm_kill;
 
-	call_rcu(&ref->rcu, percpu_ref_kill_rcu);
+	call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu);
 }
-- 
cgit v1.3-7-g2ca7


From c8587eeef8fc219e806e868c6f0c7170c769efab Mon Sep 17 00:00:00 2001
From: Christian Ruppert <christian.ruppert@abilis.com>
Date: Thu, 13 Jun 2013 14:55:31 +0200
Subject: pinctrl: add pin list based GPIO ranges

Traditionally, GPIO ranges are based on consecutive ranges of both GPIO
and pin numbers. This patch allows for GPIO ranges with arbitrary lists
of pin numbers.

Signed-off-by: Christian Ruppert <christian.ruppert@abilis.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/pinctrl/core.c          | 59 ++++++++++++++++++++++++++++++++++-------
 include/linux/pinctrl/pinctrl.h |  4 ++-
 2 files changed, 52 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index dca9208f5463..33710b5e7bb5 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -282,6 +282,29 @@ static int pinctrl_register_pins(struct pinctrl_dev *pctldev,
 	return 0;
 }
 
+/**
+ * gpio_to_pin() - GPIO range GPIO number to pin number translation
+ * @range: GPIO range used for the translation
+ * @gpio: gpio pin to translate to a pin number
+ *
+ * Finds the pin number for a given GPIO using the specified GPIO range
+ * as a base for translation. The distinction between linear GPIO ranges
+ * and pin list based GPIO ranges is managed correctly by this function.
+ *
+ * This function assumes the gpio is part of the specified GPIO range, use
+ * only after making sure this is the case (e.g. by calling it on the
+ * result of successful pinctrl_get_device_gpio_range calls)!
+ */
+static inline int gpio_to_pin(struct pinctrl_gpio_range *range,
+				unsigned int gpio)
+{
+	unsigned int offset = gpio - range->base;
+	if (range->pins)
+		return range->pins[offset];
+	else
+		return range->pin_base + offset;
+}
+
 /**
  * pinctrl_match_gpio_range() - check if a certain GPIO pin is in range
  * @pctldev: pin controller device to check
@@ -448,8 +471,14 @@ pinctrl_find_gpio_range_from_pin(struct pinctrl_dev *pctldev,
 	/* Loop over the ranges */
 	list_for_each_entry(range, &pctldev->gpio_ranges, node) {
 		/* Check if we're in the valid range */
-		if (pin >= range->pin_base &&
-		    pin < range->pin_base + range->npins) {
+		if (range->pins) {
+			int a;
+			for (a = 0; a < range->npins; a++) {
+				if (range->pins[a] == pin)
+					return range;
+			}
+		} else if (pin >= range->pin_base &&
+			   pin < range->pin_base + range->npins) {
 			mutex_unlock(&pctldev->mutex);
 			return range;
 		}
@@ -529,7 +558,7 @@ int pinctrl_request_gpio(unsigned gpio)
 	}
 
 	/* Convert to the pin controllers number space */
-	pin = gpio - range->base + range->pin_base;
+	pin = gpio_to_pin(range, gpio);
 
 	ret = pinmux_request_gpio(pctldev, range, pin, gpio);
 
@@ -559,7 +588,7 @@ void pinctrl_free_gpio(unsigned gpio)
 	mutex_lock(&pctldev->mutex);
 
 	/* Convert to the pin controllers number space */
-	pin = gpio - range->base + range->pin_base;
+	pin = gpio_to_pin(range, gpio);
 
 	pinmux_free_gpio(pctldev, pin, range);
 
@@ -582,7 +611,7 @@ static int pinctrl_gpio_direction(unsigned gpio, bool input)
 	mutex_lock(&pctldev->mutex);
 
 	/* Convert to the pin controllers number space */
-	pin = gpio - range->base + range->pin_base;
+	pin = gpio_to_pin(range, gpio);
 	ret = pinmux_gpio_direction(pctldev, range, pin, input);
 
 	mutex_unlock(&pctldev->mutex);
@@ -1349,11 +1378,21 @@ static int pinctrl_gpioranges_show(struct seq_file *s, void *what)
 
 	/* Loop over the ranges */
 	list_for_each_entry(range, &pctldev->gpio_ranges, node) {
-		seq_printf(s, "%u: %s GPIOS [%u - %u] PINS [%u - %u]\n",
-			   range->id, range->name,
-			   range->base, (range->base + range->npins - 1),
-			   range->pin_base,
-			   (range->pin_base + range->npins - 1));
+		if (range->pins) {
+			int a;
+			seq_printf(s, "%u: %s GPIOS [%u - %u] PINS {",
+				range->id, range->name,
+				range->base, (range->base + range->npins - 1));
+			for (a = 0; a < range->npins - 1; a++)
+				seq_printf(s, "%u, ", range->pins[a]);
+			seq_printf(s, "%u}\n", range->pins[a]);
+		}
+		else
+			seq_printf(s, "%u: %s GPIOS [%u - %u] PINS [%u - %u]\n",
+				range->id, range->name,
+				range->base, (range->base + range->npins - 1),
+				range->pin_base,
+				(range->pin_base + range->npins - 1));
 	}
 
 	mutex_unlock(&pctldev->mutex);
diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 2c2a9e8d8578..176a6c1b4e03 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -49,7 +49,8 @@ struct pinctrl_pin_desc {
  * @name: a name for the chip in this range
  * @id: an ID number for the chip in this range
  * @base: base offset of the GPIO range
- * @pin_base: base pin number of the GPIO range
+ * @pin_base: base pin number of the GPIO range if pins != NULL
+ * @pins: enumeration of pins in GPIO range or NULL
  * @npins: number of pins in the GPIO range, including the base number
  * @gc: an optional pointer to a gpio_chip
  */
@@ -59,6 +60,7 @@ struct pinctrl_gpio_range {
 	unsigned int id;
 	unsigned int base;
 	unsigned int pin_base;
+	unsigned const *pins;
 	unsigned int npins;
 	struct gpio_chip *gc;
 };
-- 
cgit v1.3-7-g2ca7


From ff73ceed0b5a046468de20d3b014a8c59051f90a Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 13 Jun 2013 22:27:59 +0200
Subject: pinctrl: move the pm state stubs

The stubs for the !PINCTRL case were placed in the wrong
part of the file, causing breakage in linux-next when compiling
SH without pinctrl. Fix it up by moving the stubs to the right
place.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/consumer.h | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index 0f32f10e347d..d071d850d1de 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -111,6 +111,21 @@ static inline void devm_pinctrl_put(struct pinctrl *p)
 {
 }
 
+static inline int pinctrl_pm_select_default_state(struct device *dev)
+{
+	return 0;
+}
+
+static inline int pinctrl_pm_select_sleep_state(struct device *dev)
+{
+	return 0;
+}
+
+static inline int pinctrl_pm_select_idle_state(struct device *dev)
+{
+	return 0;
+}
+
 #endif /* CONFIG_PINCTRL */
 
 static inline struct pinctrl * __must_check pinctrl_get_select(
@@ -218,21 +233,6 @@ static inline int pin_config_group_set(const char *dev_name,
 	return 0;
 }
 
-static inline int pinctrl_pm_select_default_state(struct device *dev)
-{
-	return 0;
-}
-
-static inline int pinctrl_pm_select_sleep_state(struct device *dev)
-{
-	return 0;
-}
-
-static inline int pinctrl_pm_select_idle_state(struct device *dev)
-{
-	return 0;
-}
-
 #endif
 
 #endif /* __LINUX_PINCTRL_CONSUMER_H */
-- 
cgit v1.3-7-g2ca7


From 56a59911e502fabbbefa5cfa0cddae22e3c94866 Mon Sep 17 00:00:00 2001
From: Christian Ruppert <christian.ruppert@abilis.com>
Date: Fri, 14 Jun 2013 10:24:48 +0200
Subject: Fix comment on pinctrl_gpio_range.pin_base

The comment introduced with the recently added pinctrl_gpio_range.pins
element was wrong. This corrects it.
Thanks to Patrice Chotard for pointing this out.

Signed-off-by: Christian Ruppert <christian.ruppert@abilis.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinctrl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h
index 176a6c1b4e03..5979147d2bda 100644
--- a/include/linux/pinctrl/pinctrl.h
+++ b/include/linux/pinctrl/pinctrl.h
@@ -49,7 +49,7 @@ struct pinctrl_pin_desc {
  * @name: a name for the chip in this range
  * @id: an ID number for the chip in this range
  * @base: base offset of the GPIO range
- * @pin_base: base pin number of the GPIO range if pins != NULL
+ * @pin_base: base pin number of the GPIO range if pins == NULL
  * @pins: enumeration of pins in GPIO range or NULL
  * @npins: number of pins in the GPIO range, including the base number
  * @gc: an optional pointer to a gpio_chip
-- 
cgit v1.3-7-g2ca7


From 5ca3353bcee929c3b7bbf39f79038842f443f01a Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Sun, 16 Jun 2013 12:43:06 +0200
Subject: pinctrl: establish pull-up/pull-down terminology

It is counter-intuitive to have "0" mean disable in a boolean
manner for electronic properties of pins such as pull-up and
pull-down. Therefore, define that a pull-up/pull-down argument
of 0 to such a generic option means that the pin is
short-circuited to VDD or GROUND. Pull disablement shall be
done using PIN_CONFIG_BIAS_DISABLE.

Cc: James Hogan <james.hogan@imgtec.com>
Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Acked-by Heiko Stuebner <heiko@sntech.de>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf-generic.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index d414a7729424..10ad996afee4 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -36,14 +36,15 @@
  *	tristate. The argument is ignored.
  * @PIN_CONFIG_BIAS_PULL_UP: the pin will be pulled up (usually with high
  *	impedance to VDD). If the argument is != 0 pull-up is enabled,
- *	if it is 0, pull-up is disabled.
+ *	if it is 0, pull-up is total, i.e. the pin is connected to VDD.
  * @PIN_CONFIG_BIAS_PULL_DOWN: the pin will be pulled down (usually with high
  *	impedance to GROUND). If the argument is != 0 pull-down is enabled,
- *	if it is 0, pull-down is disabled.
+ *	if it is 0, pull-down is total, i.e. the pin is connected to GROUND.
  * @PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: the pin will be pulled up or down based
  *	on embedded knowledge of the controller, like current mux function.
- *	If the argument is != 0 pull up/down is enabled, if it is 0,
- *	the pull is disabled.
+ *	If the argument is != 0 pull up/down is enabled, if it is 0, the
+ *	configuration is ignored. The proper way to disable it is to use
+ *	@PIN_CONFIG_BIAS_DISABLE.
  * @PIN_CONFIG_DRIVE_PUSH_PULL: the pin will be driven actively high and
  *	low, this is the most typical case and is typically achieved with two
  *	active transistors on the output. Setting this config will enable
@@ -72,8 +73,8 @@
  *	supplies, the argument to this parameter (on a custom format) tells
  *	the driver which alternative power source to use.
  * @PIN_CONFIG_SLEW_RATE: if the pin can select slew rate, the argument to
- * 	this parameter (on a custom format) tells the driver which alternative
- * 	slew rate to use.
+ *	this parameter (on a custom format) tells the driver which alternative
+ *	slew rate to use.
  * @PIN_CONFIG_LOW_POWER_MODE: this will configure the pin for low power
  *	operation, if several modes of operation are supported these can be
  *	passed in the argument on a custom form, else just use argument 1
-- 
cgit v1.3-7-g2ca7


From 6db8e85c5c1f89cd0183b76dab027c81009f129f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 14 Jun 2013 11:18:22 -0700
Subject: cgroup: disallow rename(2) if sane_behavior

cgroup's rename(2) isn't a proper migration implementation - it can't
move the cgroup to a different parent in the hierarchy.  All it can do
is swapping the name string for that cgroup.  This isn't useful and
can mislead users to think that cgroup supports proper cgroup-level
migration.  Disallow rename(2) if sane_behavior.

v2: Fail with -EPERM instead of -EINVAL so that it matches the vfs
    return value when ->rename is not implemented as suggested by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 2 ++
 kernel/cgroup.c        | 7 +++++++
 2 files changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 17604767adfd..f97522790682 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -270,6 +270,8 @@ enum {
 	 * - "release_agent" and "notify_on_release" are removed.
 	 *   Replacement notification mechanism will be implemented.
 	 *
+	 * - rename(2) is disallowed.
+	 *
 	 * - memcg: use_hierarchy is on by default and the cgroup file for
 	 *   the flag is not created.
 	 */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2e9da7bf25cb..c2c64005bbc2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2508,6 +2508,13 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	cgrp = __d_cgrp(old_dentry);
 
+	/*
+	 * This isn't a proper migration and its usefulness is very
+	 * limited.  Disallow if sane_behavior.
+	 */
+	if (cgroup_sane_behavior(cgrp))
+		return -EPERM;
+
 	name = cgroup_alloc_name(new_dentry);
 	if (!name)
 		return -ENOMEM;
-- 
cgit v1.3-7-g2ca7


From 1c851fb189152b6572688fda7fc487ade2a4cb8a Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 18 Jun 2013 10:49:33 +0100
Subject: pinctrl: fix pinconf_ops::pin_config_dbg_parse_modify kerneldoc

The kerneldoc comment for struct pinconf_ops was missing
pin_config_dbg_parse_modify, and instead described
pin_config_group_dbg_set (which is presumably an old name for the same
function). Rename it in the kerneldoc comment so they match.

Signed-off-by: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 include/linux/pinctrl/pinconf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pinctrl/pinconf.h b/include/linux/pinctrl/pinconf.h
index 1ad4f31ef6b8..f6998692bdc9 100644
--- a/include/linux/pinctrl/pinconf.h
+++ b/include/linux/pinctrl/pinconf.h
@@ -30,7 +30,7 @@ struct seq_file;
  * @pin_config_set: configure an individual pin
  * @pin_config_group_get: get configurations for an entire pin group
  * @pin_config_group_set: configure all pins in a group
- * @pin_config_group_dbg_set: optional debugfs to modify a pin configuration
+ * @pin_config_dbg_parse_modify: optional debugfs to modify a pin configuration
  * @pin_config_dbg_show: optional debugfs display hook that will provide
  *	per-device info for a certain pin in debugfs
  * @pin_config_group_dbg_show: optional debugfs display hook that will provide
-- 
cgit v1.3-7-g2ca7


From e8c82d20a9f729cf4b9f73043f7fd4e0872bebfd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Tue, 18 Jun 2013 18:48:37 +0800
Subject: cgroup: convert cgroup_cft_commit() to use
 cgroup_for_each_descendant_pre()

We used root->allcg_list to iterate cgroup hierarchy because at that time
cgroup_for_each_descendant_pre() hasn't been invented.

tj: In cgroup_cfts_commit(), s/@serial_nr/@update_upto/, move the
    assignment right above releasing cgroup_mutex and explain what's
    going on there.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |  6 ----
 kernel/cgroup.c        | 80 +++++++++++++++++++++++++++-----------------------
 2 files changed, 44 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f97522790682..b28365890646 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -206,9 +206,6 @@ struct cgroup {
 	 */
 	struct list_head cset_links;
 
-	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
-	struct list_head cft_q_node;	/* used during cftype add/rm */
-
 	/*
 	 * Linked list running through all cgroups that can
 	 * potentially be reaped by the release agent. Protected by
@@ -313,9 +310,6 @@ struct cgroupfs_root {
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
-	/* All cgroups on this root, cgroup_mutex protected */
-	struct list_head allcg_list;
-
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e6571ca822a0..0ed7d8db6508 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1399,7 +1399,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->children);
 	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->cset_links);
-	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
@@ -1414,12 +1413,10 @@ static void init_cgroup_root(struct cgroupfs_root *root)
 
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
-	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->name = &root_cgroup_name;
 	init_cgroup_housekeeping(cgrp);
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 }
 
 static int cgroup_init_root_id(struct cgroupfs_root *root)
@@ -2785,65 +2782,78 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 	return ret;
 }
 
-static DEFINE_MUTEX(cgroup_cft_mutex);
-
 static void cgroup_cfts_prepare(void)
-	__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+	__acquires(&cgroup_mutex)
 {
 	/*
 	 * Thanks to the entanglement with vfs inode locking, we can't walk
 	 * the existing cgroups under cgroup_mutex and create files.
-	 * Instead, we increment reference on all cgroups and build list of
-	 * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure
-	 * exclusive access to the field.
+	 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
+	 * read lock before calling cgroup_addrm_files().
 	 */
-	mutex_lock(&cgroup_cft_mutex);
 	mutex_lock(&cgroup_mutex);
 }
 
 static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 			       struct cftype *cfts, bool is_add)
-	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+	__releases(&cgroup_mutex)
 {
 	LIST_HEAD(pending);
-	struct cgroup *cgrp, *n;
+	struct cgroup *cgrp, *root = &ss->root->top_cgroup;
 	struct super_block *sb = ss->root->sb;
+	struct dentry *prev = NULL;
+	struct inode *inode;
+	u64 update_upto;
 
 	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
-	if (cfts && ss->root != &rootnode &&
-	    atomic_inc_not_zero(sb->s_active)) {
-		list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
-			dget(cgrp->dentry);
-			list_add_tail(&cgrp->cft_q_node, &pending);
-		}
-	} else {
-		sb = NULL;
+	if (!cfts || ss->root == &rootnode ||
+	    !atomic_inc_not_zero(&sb->s_active)) {
+		mutex_unlock(&cgroup_mutex);
+		return;
 	}
 
-	mutex_unlock(&cgroup_mutex);
-
 	/*
-	 * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm
-	 * files for all cgroups which were created before.
+	 * All cgroups which are created after we drop cgroup_mutex will
+	 * have the updated set of files, so we only need to update the
+	 * cgroups created before the current @cgroup_serial_nr_cursor.
 	 */
-	list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
-		struct inode *inode = cgrp->dentry->d_inode;
+	update_upto = atomic64_read(&cgroup_serial_nr_cursor);
+
+	mutex_unlock(&cgroup_mutex);
+
+	/* @root always needs to be updated */
+	inode = root->dentry->d_inode;
+	mutex_lock(&inode->i_mutex);
+	mutex_lock(&cgroup_mutex);
+	cgroup_addrm_files(root, ss, cfts, is_add);
+	mutex_unlock(&cgroup_mutex);
+	mutex_unlock(&inode->i_mutex);
+
+	/* add/rm files for all cgroups created before */
+	rcu_read_lock();
+	cgroup_for_each_descendant_pre(cgrp, root) {
+		if (cgroup_is_dead(cgrp))
+			continue;
+
+		inode = cgrp->dentry->d_inode;
+		dget(cgrp->dentry);
+		rcu_read_unlock();
+
+		dput(prev);
+		prev = cgrp->dentry;
 
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
-		if (!cgroup_is_dead(cgrp))
+		if (cgrp->serial_nr <= update_upto && !cgroup_is_dead(cgrp))
 			cgroup_addrm_files(cgrp, ss, cfts, is_add);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 
-		list_del_init(&cgrp->cft_q_node);
-		dput(cgrp->dentry);
+		rcu_read_lock();
 	}
-
-	if (sb)
-		deactivate_super(sb);
-
-	mutex_unlock(&cgroup_cft_mutex);
+	rcu_read_unlock();
+	dput(prev);
+	deactivate_super(sb);
 }
 
 /**
@@ -4320,7 +4330,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	cgrp->serial_nr = atomic64_inc_return(&cgroup_serial_nr_cursor);
 
 	/* allocation complete, commit to creation */
-	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
 	root->number_of_cgroups++;
 
@@ -4559,7 +4568,6 @@ static void cgroup_offline_fn(struct work_struct *work)
 
 	/* delete this cgroup from parent->children */
 	list_del_rcu(&cgrp->sibling);
-	list_del_init(&cgrp->allcg_node);
 
 	dput(d);
 
-- 
cgit v1.3-7-g2ca7


From 08f502c1c343031f0d126bd00e87dede38269d12 Mon Sep 17 00:00:00 2001
From: Toshi Kani <toshi.kani@hp.com>
Date: Tue, 18 Jun 2013 15:06:45 -0600
Subject: ACPI: Do not use CONFIG_ACPI_HOTPLUG_MEMORY_MODULE

CONFIG_ACPI_HOTPLUG_MEMORY has been changed to bool (y/n), and
its module option is no longer valid.  So, stop using
CONFIG_ACPI_HOTPLUG_MEMORY_MODULE.

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/acpi.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 17b5b5967641..353ba256f368 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -352,8 +352,7 @@ extern acpi_status acpi_pci_osc_control_set(acpi_handle handle,
 
 /* Enable _OST when all relevant hotplug operations are enabled */
 #if defined(CONFIG_ACPI_HOTPLUG_CPU) &&			\
-	(defined(CONFIG_ACPI_HOTPLUG_MEMORY) ||		\
-	 defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) &&	\
+	defined(CONFIG_ACPI_HOTPLUG_MEMORY) &&		\
 	defined(CONFIG_ACPI_CONTAINER)
 #define ACPI_HOTPLUG_OST
 #endif
-- 
cgit v1.3-7-g2ca7


From 03c78cbebb323fc97295ff97dc5e009d56371d57 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 14 Jun 2013 11:17:19 +0800
Subject: cgroup: rename cont to cgrp

Cont is short for container. control group was named process container
at first, but then people found container already has a meaning in
linux kernel.

Clean up the leftover variable name @cont.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |  4 ++--
 kernel/cgroup.c        | 22 +++++++++++-----------
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b28365890646..6c2ba52fc5d4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -433,13 +433,13 @@ struct cftype {
 	 * entry. The key/value pairs (and their ordering) should not
 	 * change between reboots.
 	 */
-	int (*read_map)(struct cgroup *cont, struct cftype *cft,
+	int (*read_map)(struct cgroup *cgrp, struct cftype *cft,
 			struct cgroup_map_cb *cb);
 	/*
 	 * read_seq_string() is used for outputting a simple sequence
 	 * using seqfile.
 	 */
-	int (*read_seq_string)(struct cgroup *cont, struct cftype *cft,
+	int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft,
 			       struct seq_file *m);
 
 	ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 65f333ebb572..1051c1f69674 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5515,7 +5515,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
 }
 
 #ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
+static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
 {
 	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
 
@@ -5525,23 +5525,23 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont)
 	return css;
 }
 
-static void debug_css_free(struct cgroup *cont)
+static void debug_css_free(struct cgroup *cgrp)
 {
-	kfree(cont->subsys[debug_subsys_id]);
+	kfree(cgrp->subsys[debug_subsys_id]);
 }
 
-static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft)
+static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
 {
-	return cgroup_task_count(cont);
+	return cgroup_task_count(cgrp);
 }
 
-static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft)
+static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
 {
 	return (u64)(unsigned long)current->cgroups;
 }
 
-static u64 current_css_set_refcount_read(struct cgroup *cont,
-					   struct cftype *cft)
+static u64 current_css_set_refcount_read(struct cgroup *cgrp,
+					 struct cftype *cft)
 {
 	u64 count;
 
@@ -5551,7 +5551,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
 	return count;
 }
 
-static int current_css_set_cg_links_read(struct cgroup *cont,
+static int current_css_set_cg_links_read(struct cgroup *cgrp,
 					 struct cftype *cft,
 					 struct seq_file *seq)
 {
@@ -5578,14 +5578,14 @@ static int current_css_set_cg_links_read(struct cgroup *cont,
 }
 
 #define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct cgroup *cont,
+static int cgroup_css_links_read(struct cgroup *cgrp,
 				 struct cftype *cft,
 				 struct seq_file *seq)
 {
 	struct cgrp_cset_link *link;
 
 	read_lock(&css_set_lock);
-	list_for_each_entry(link, &cont->cset_links, cset_link) {
+	list_for_each_entry(link, &cgrp->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
 		int count = 0;
-- 
cgit v1.3-7-g2ca7


From 03d8e80beb7db78a13c192431205b9c83f7e0cd1 Mon Sep 17 00:00:00 2001
From: Mischa Jonker <Mischa.Jonker@synopsys.com>
Date: Tue, 4 Jun 2013 11:45:48 +0200
Subject: perf: Add const qualifier to perf_pmu_register's 'name' arg

This allows us to use pdev->name for registering a PMU device.
IMO the name is not supposed to be changed anyway.

Signed-off-by: Mischa Jonker <mjonker@synopsys.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1370339148-5566-1-git-send-email-mjonker@synopsys.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/metag/kernel/perf/perf_event.c | 2 +-
 include/linux/perf_event.h          | 4 ++--
 kernel/events/core.c                | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/metag/kernel/perf/perf_event.c b/arch/metag/kernel/perf/perf_event.c
index 366569425c52..5b18888ee364 100644
--- a/arch/metag/kernel/perf/perf_event.c
+++ b/arch/metag/kernel/perf/perf_event.c
@@ -882,7 +882,7 @@ static int __init init_hw_perf_events(void)
 	}
 
 	register_cpu_notifier(&metag_pmu_notifier);
-	ret = perf_pmu_register(&pmu, (char *)metag_pmu->name, PERF_TYPE_RAW);
+	ret = perf_pmu_register(&pmu, metag_pmu->name, PERF_TYPE_RAW);
 out:
 	return ret;
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 74a4e14ab60b..4bc57d017fc8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -188,7 +188,7 @@ struct pmu {
 
 	struct device			*dev;
 	const struct attribute_group	**attr_groups;
-	char				*name;
+	const char			*name;
 	int				type;
 
 	int * __percpu			pmu_disable_count;
@@ -519,7 +519,7 @@ struct perf_output_handle {
 
 #ifdef CONFIG_PERF_EVENTS
 
-extern int perf_pmu_register(struct pmu *pmu, char *name, int type);
+extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
 extern void perf_pmu_unregister(struct pmu *pmu);
 
 extern int perf_num_counters(void);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index aca95bce34c8..9c8920783317 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6179,7 +6179,7 @@ free_dev:
 static struct lock_class_key cpuctx_mutex;
 static struct lock_class_key cpuctx_lock;
 
-int perf_pmu_register(struct pmu *pmu, char *name, int type)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 {
 	int cpu, ret;
 
-- 
cgit v1.3-7-g2ca7


From 43b4578071c0e6d87761e113e05d45776cc75437 Mon Sep 17 00:00:00 2001
From: Andrew Hunter <ahh@google.com>
Date: Thu, 23 May 2013 11:07:03 -0700
Subject: perf/x86: Reduce stack usage of x86_schedule_events()

x86_schedule_events() caches event constraints on the stack during
scheduling.  Given the number of possible events, this is 512 bytes of
stack; since it can be invoked under schedule() under god-knows-what,
this is causing stack blowouts.

Trade some space usage for stack safety: add a place to cache the
constraint pointer to struct perf_event.  For 8 bytes per event (1% of
its size) we can save the giant stack frame.

This shouldn't change any aspect of scheduling whatsoever and while in
theory the locality's a tiny bit worse, I doubt we'll see any
performance impact either.

Tested: `perf stat whatever` does not blow up and produces
results that aren't hugely obviously wrong.  I'm not sure how to run
particularly good tests of perf code, but this should not produce any
functional change whatsoever.

Signed-off-by: Andrew Hunter <ahh@google.com>
Reviewed-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1369332423-4400-1-git-send-email-ahh@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c              | 28 ++++++++++++++-------------
 arch/x86/kernel/cpu/perf_event.h              |  2 +-
 arch/x86/kernel/cpu/perf_event_intel_uncore.c | 10 ++++++----
 include/linux/perf_event.h                    |  4 ++++
 4 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 1025f3c99d20..e52a9e577783 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -568,7 +568,7 @@ struct sched_state {
 struct perf_sched {
 	int			max_weight;
 	int			max_events;
-	struct event_constraint	**constraints;
+	struct perf_event	**events;
 	struct sched_state	state;
 	int			saved_states;
 	struct sched_state	saved[SCHED_STATES_MAX];
@@ -577,7 +577,7 @@ struct perf_sched {
 /*
  * Initialize interator that runs through all events and counters.
  */
-static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
+static void perf_sched_init(struct perf_sched *sched, struct perf_event **events,
 			    int num, int wmin, int wmax)
 {
 	int idx;
@@ -585,10 +585,10 @@ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **
 	memset(sched, 0, sizeof(*sched));
 	sched->max_events	= num;
 	sched->max_weight	= wmax;
-	sched->constraints	= c;
+	sched->events		= events;
 
 	for (idx = 0; idx < num; idx++) {
-		if (c[idx]->weight == wmin)
+		if (events[idx]->hw.constraint->weight == wmin)
 			break;
 	}
 
@@ -635,8 +635,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched)
 	if (sched->state.event >= sched->max_events)
 		return false;
 
-	c = sched->constraints[sched->state.event];
-
+	c = sched->events[sched->state.event]->hw.constraint;
 	/* Prefer fixed purpose counters */
 	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
 		idx = INTEL_PMC_IDX_FIXED;
@@ -694,7 +693,7 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 			if (sched->state.weight > sched->max_weight)
 				return false;
 		}
-		c = sched->constraints[sched->state.event];
+		c = sched->events[sched->state.event]->hw.constraint;
 	} while (c->weight != sched->state.weight);
 
 	sched->state.counter = 0;	/* start with first counter */
@@ -705,12 +704,12 @@ static bool perf_sched_next_event(struct perf_sched *sched)
 /*
  * Assign a counter for each event.
  */
-int perf_assign_events(struct event_constraint **constraints, int n,
+int perf_assign_events(struct perf_event **events, int n,
 			int wmin, int wmax, int *assign)
 {
 	struct perf_sched sched;
 
-	perf_sched_init(&sched, constraints, n, wmin, wmax);
+	perf_sched_init(&sched, events, n, wmin, wmax);
 
 	do {
 		if (!perf_sched_find_counter(&sched))
@@ -724,7 +723,7 @@ int perf_assign_events(struct event_constraint **constraints, int n,
 
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
-	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
+	struct event_constraint *c;
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int i, wmin, wmax, num = 0;
 	struct hw_perf_event *hwc;
@@ -732,8 +731,10 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+		hwc = &cpuc->event_list[i]->hw;
 		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
-		constraints[i] = c;
+		hwc->constraint = c;
+
 		wmin = min(wmin, c->weight);
 		wmax = max(wmax, c->weight);
 	}
@@ -743,7 +744,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	 */
 	for (i = 0; i < n; i++) {
 		hwc = &cpuc->event_list[i]->hw;
-		c = constraints[i];
+		c = hwc->constraint;
 
 		/* never assigned */
 		if (hwc->idx == -1)
@@ -764,7 +765,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	/* slow path */
 	if (i != n)
-		num = perf_assign_events(constraints, n, wmin, wmax, assign);
+		num = perf_assign_events(cpuc->event_list, n, wmin,
+					 wmax, assign);
 
 	/*
 	 * scheduling failed or is just a simulation,
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index ba9aadfa683b..6a6ca01090f9 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -528,7 +528,7 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 
 void x86_pmu_enable_all(int added);
 
-int perf_assign_events(struct event_constraint **constraints, int n,
+int perf_assign_events(struct perf_event **events, int n,
 			int wmin, int wmax, int *assign);
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index c0e356da7408..adabe6f1bb6e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -2723,15 +2723,16 @@ static void uncore_put_event_constraint(struct intel_uncore_box *box, struct per
 static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
 {
 	unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
-	struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX];
+	struct event_constraint *c;
 	int i, wmin, wmax, ret = 0;
 	struct hw_perf_event *hwc;
 
 	bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
 
 	for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
+		hwc = &box->event_list[i]->hw;
 		c = uncore_get_event_constraint(box, box->event_list[i]);
-		constraints[i] = c;
+		hwc->constraint = c;
 		wmin = min(wmin, c->weight);
 		wmax = max(wmax, c->weight);
 	}
@@ -2739,7 +2740,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	/* fastpath, try to reuse previous register */
 	for (i = 0; i < n; i++) {
 		hwc = &box->event_list[i]->hw;
-		c = constraints[i];
+		c = hwc->constraint;
 
 		/* never assigned */
 		if (hwc->idx == -1)
@@ -2759,7 +2760,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int
 	}
 	/* slow path */
 	if (i != n)
-		ret = perf_assign_events(constraints, n, wmin, wmax, assign);
+		ret = perf_assign_events(box->event_list, n,
+					 wmin, wmax, assign);
 
 	if (!assign || ret) {
 		for (i = 0; i < n; i++)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 4bc57d017fc8..33e8d65836d6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -113,6 +113,8 @@ struct hw_perf_event_extra {
 	int		idx;	/* index in shared_regs->regs[] */
 };
 
+struct event_constraint;
+
 /**
  * struct hw_perf_event - performance event hardware details:
  */
@@ -131,6 +133,8 @@ struct hw_perf_event {
 
 			struct hw_perf_event_extra extra_reg;
 			struct hw_perf_event_extra branch_reg;
+
+			struct event_constraint *constraint;
 		};
 		struct { /* software */
 			struct hrtimer	hrtimer;
-- 
cgit v1.3-7-g2ca7


From 0a0fca9d832b704f116a25badd1ca8c16771dcac Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 4 Jun 2013 13:10:24 +0530
Subject: sched: Rename sched.c as sched/core.c in comments and Documentation

Most of the stuff from kernel/sched.c was moved to kernel/sched/core.c long time
back and the comments/Documentation never got updated.

I figured it out when I was going through sched-domains.txt and so thought of
fixing it globally.

I haven't crossed check if the stuff that is referenced in sched/core.c by all
these files is still present and hasn't changed as that wasn't the motive behind
this patch.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/cdff76a265326ab8d71922a1db5be599f20aad45.1370329560.git.viresh.kumar@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/cgroups/cpusets.txt                 | 2 +-
 Documentation/rt-mutex-design.txt                 | 2 +-
 Documentation/scheduler/sched-domains.txt         | 4 ++--
 Documentation/spinlocks.txt                       | 2 +-
 Documentation/virtual/uml/UserModeLinux-HOWTO.txt | 4 ++--
 arch/avr32/kernel/process.c                       | 2 +-
 arch/cris/include/arch-v10/arch/bitops.h          | 2 +-
 arch/ia64/kernel/head.S                           | 2 +-
 arch/mips/kernel/mips-mt-fpaff.c                  | 4 ++--
 arch/mips/kernel/scall32-o32.S                    | 5 +++--
 arch/powerpc/include/asm/mmu_context.h            | 2 +-
 arch/tile/include/asm/processor.h                 | 2 +-
 arch/tile/kernel/stack.c                          | 2 +-
 arch/um/kernel/sysrq.c                            | 2 +-
 include/linux/completion.h                        | 2 +-
 include/linux/perf_event.h                        | 2 +-
 include/linux/spinlock_up.h                       | 2 +-
 include/uapi/asm-generic/unistd.h                 | 2 +-
 kernel/cpuset.c                                   | 4 ++--
 kernel/time.c                                     | 2 +-
 kernel/workqueue_internal.h                       | 2 +-
 21 files changed, 27 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index 12e01d432bfe..7740038d82bc 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -373,7 +373,7 @@ can become very uneven.
 1.7 What is sched_load_balance ?
 --------------------------------
 
-The kernel scheduler (kernel/sched.c) automatically load balances
+The kernel scheduler (kernel/sched/core.c) automatically load balances
 tasks.  If one CPU is underutilized, kernel code running on that
 CPU will look for tasks on other more overloaded CPUs and move those
 tasks to itself, within the constraints of such placement mechanisms
diff --git a/Documentation/rt-mutex-design.txt b/Documentation/rt-mutex-design.txt
index 33ed8007a845..a5bcd7f5c33f 100644
--- a/Documentation/rt-mutex-design.txt
+++ b/Documentation/rt-mutex-design.txt
@@ -384,7 +384,7 @@ priority back.
 __rt_mutex_adjust_prio examines the result of rt_mutex_getprio, and if the
 result does not equal the task's current priority, then rt_mutex_setprio
 is called to adjust the priority of the task to the new priority.
-Note that rt_mutex_setprio is defined in kernel/sched.c to implement the
+Note that rt_mutex_setprio is defined in kernel/sched/core.c to implement the
 actual change in priority.
 
 It is interesting to note that __rt_mutex_adjust_prio can either increase
diff --git a/Documentation/scheduler/sched-domains.txt b/Documentation/scheduler/sched-domains.txt
index 443f0c76bab4..4af80b1c05aa 100644
--- a/Documentation/scheduler/sched-domains.txt
+++ b/Documentation/scheduler/sched-domains.txt
@@ -25,7 +25,7 @@ is treated as one entity. The load of a group is defined as the sum of the
 load of each of its member CPUs, and only when the load of a group becomes
 out of balance are tasks moved between groups.
 
-In kernel/sched.c, trigger_load_balance() is run periodically on each CPU
+In kernel/sched/core.c, trigger_load_balance() is run periodically on each CPU
 through scheduler_tick(). It raises a softirq after the next regularly scheduled
 rebalancing event for the current runqueue has arrived. The actual load
 balancing workhorse, run_rebalance_domains()->rebalance_domains(), is then run
@@ -62,7 +62,7 @@ struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
 the specifics and what to tune.
 
 Architectures may retain the regular override the default SD_*_INIT flags
-while using the generic domain builder in kernel/sched.c if they wish to
+while using the generic domain builder in kernel/sched/core.c if they wish to
 retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
 can be done by #define'ing ARCH_HASH_SCHED_TUNE.
 
diff --git a/Documentation/spinlocks.txt b/Documentation/spinlocks.txt
index 9dbe885ecd8d..97eaf5727178 100644
--- a/Documentation/spinlocks.txt
+++ b/Documentation/spinlocks.txt
@@ -137,7 +137,7 @@ don't block on each other (and thus there is no dead-lock wrt interrupts.
 But when you do the write-lock, you have to use the irq-safe version. 
 
 For an example of being clever with rw-locks, see the "waitqueue_lock" 
-handling in kernel/sched.c - nothing ever _changes_ a wait-queue from
+handling in kernel/sched/core.c - nothing ever _changes_ a wait-queue from
 within an interrupt, they only read the queue in order to know whom to
 wake up. So read-locks are safe (which is good: they are very common
 indeed), while write-locks need to protect themselves against interrupts.
diff --git a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt
index a5f8436753e7..f4099ca6b483 100644
--- a/Documentation/virtual/uml/UserModeLinux-HOWTO.txt
+++ b/Documentation/virtual/uml/UserModeLinux-HOWTO.txt
@@ -3127,7 +3127,7 @@
            at process_kern.c:156
        #3  0x1006a052 in switch_to (prev=0x50072000, next=0x507e8000, last=0x50072000)
            at process_kern.c:161
-       #4  0x10001d12 in schedule () at sched.c:777
+       #4  0x10001d12 in schedule () at core.c:777
        #5  0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71
        #6  0x1006aa10 in __down_failed () at semaphore.c:157
        #7  0x1006c5d8 in segv_handler (sc=0x5006e940) at trap_user.c:174
@@ -3191,7 +3191,7 @@
            at process_kern.c:161
        161       _switch_to(prev, next);
        (gdb)
-       #4  0x10001d12 in schedule () at sched.c:777
+       #4  0x10001d12 in schedule () at core.c:777
        777             switch_to(prev, next, prev);
        (gdb)
        #5  0x1006a744 in __down (sem=0x507d241c) at semaphore.c:71
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index e7b61494c312..c2731003edef 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -341,7 +341,7 @@ unsigned long get_wchan(struct task_struct *p)
 		 * is actually quite ugly. It might be possible to
 		 * determine the frame size automatically at build
 		 * time by doing this:
-		 *   - compile sched.c
+		 *   - compile sched/core.c
 		 *   - disassemble the resulting sched.o
 		 *   - look for 'sub sp,??' shortly after '<schedule>:'
 		 */
diff --git a/arch/cris/include/arch-v10/arch/bitops.h b/arch/cris/include/arch-v10/arch/bitops.h
index be85f6de25d3..03d9cfd92c8a 100644
--- a/arch/cris/include/arch-v10/arch/bitops.h
+++ b/arch/cris/include/arch-v10/arch/bitops.h
@@ -17,7 +17,7 @@ static inline unsigned long cris_swapnwbrlz(unsigned long w)
 	   in another register:
 	   !  __asm__ ("swapnwbr %2\n\tlz %2,%0"
 	   !	      : "=r,r" (res), "=r,X" (dummy) : "1,0" (w));
-	   confuses gcc (sched.c, gcc from cris-dist-1.14).  */
+	   confuses gcc (core.c, gcc from cris-dist-1.14).  */
 
 	unsigned long res;
 	__asm__ ("swapnwbr %0 \n\t"
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index 9be4e497f3d3..991ca336b8a2 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -1035,7 +1035,7 @@ END(ia64_delay_loop)
  * Return a CPU-local timestamp in nano-seconds.  This timestamp is
  * NOT synchronized across CPUs its return value must never be
  * compared against the values returned on another CPU.  The usage in
- * kernel/sched.c ensures that.
+ * kernel/sched/core.c ensures that.
  *
  * The return-value of sched_clock() is NOT supposed to wrap-around.
  * If it did, it would cause some scheduling hiccups (at the worst).
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index fd814e08c945..cb098628aee8 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -27,12 +27,12 @@ unsigned long mt_fpemul_threshold;
  * FPU affinity with the user's requested processor affinity.
  * This code is 98% identical with the sys_sched_setaffinity()
  * and sys_sched_getaffinity() system calls, and should be
- * updated when kernel/sched.c changes.
+ * updated when kernel/sched/core.c changes.
  */
 
 /*
  * find_process_by_pid - find a process with a matching PID value.
- * used in sys_sched_set/getaffinity() in kernel/sched.c, so
+ * used in sys_sched_set/getaffinity() in kernel/sched/core.c, so
  * cloned here.
  */
 static inline struct task_struct *find_process_by_pid(pid_t pid)
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index 9b36424b03c5..e9127ec612ef 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -476,8 +476,9 @@ einval: li	v0, -ENOSYS
 	/*
 	 * For FPU affinity scheduling on MIPS MT processors, we need to
 	 * intercept sys_sched_xxxaffinity() calls until we get a proper hook
-	 * in kernel/sched.c.  Considered only temporary we only support these
-	 * hooks for the 32-bit kernel - there is no MIPS64 MT processor atm.
+	 * in kernel/sched/core.c.  Considered only temporary we only support
+	 * these hooks for the 32-bit kernel - there is no MIPS64 MT processor
+	 * atm.
 	 */
 	sys	mipsmt_sys_sched_setaffinity	3
 	sys	mipsmt_sys_sched_getaffinity	3
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index a73668a5f30d..b467530e2485 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -38,7 +38,7 @@ extern void drop_cop(unsigned long acop, struct mm_struct *mm);
 
 /*
  * switch_mm is the entry point called from the architecture independent
- * code in kernel/sched.c
+ * code in kernel/sched/core.c
  */
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 2b70dfb1442e..b3f104953da2 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -225,7 +225,7 @@ extern int do_work_pending(struct pt_regs *regs, u32 flags);
 
 /*
  * Return saved (kernel) PC of a blocked thread.
- * Only used in a printk() in kernel/sched.c, so don't work too hard.
+ * Only used in a printk() in kernel/sched/core.c, so don't work too hard.
  */
 #define thread_saved_pc(t)   ((t)->thread.pc)
 
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index ed258b8ae320..af8dfc9665f6 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -442,7 +442,7 @@ void _KBacktraceIterator_init_current(struct KBacktraceIterator *kbt, ulong pc,
 				regs_to_pt_regs(&regs, pc, lr, sp, r52));
 }
 
-/* This is called only from kernel/sched.c, with esp == NULL */
+/* This is called only from kernel/sched/core.c, with esp == NULL */
 void show_stack(struct task_struct *task, unsigned long *esp)
 {
 	struct KBacktraceIterator kbt;
diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c
index 7d101a2a1541..0dc4d1c6f98a 100644
--- a/arch/um/kernel/sysrq.c
+++ b/arch/um/kernel/sysrq.c
@@ -39,7 +39,7 @@ void show_trace(struct task_struct *task, unsigned long * stack)
 static const int kstack_depth_to_print = 24;
 
 /* This recently started being used in arch-independent code too, as in
- * kernel/sched.c.*/
+ * kernel/sched/core.c.*/
 void show_stack(struct task_struct *task, unsigned long *esp)
 {
 	unsigned long *stack;
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 33f0280fd533..3cd574d5b19e 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -5,7 +5,7 @@
  * (C) Copyright 2001 Linus Torvalds
  *
  * Atomic wait-for-completion handler data structures.
- * See kernel/sched.c for details.
+ * See kernel/sched/core.c for details.
  */
 
 #include <linux/wait.h>
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index f463a46424e2..5ec99e5a50d2 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -803,7 +803,7 @@ static inline void perf_restore_debug_store(void)			{ }
 #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
 
 /*
- * This has to have a higher priority than migration_notifier in sched.c.
+ * This has to have a higher priority than migration_notifier in sched/core.c.
  */
 #define perf_cpu_notifier(fn)						\
 do {									\
diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index e2369c167dbd..8b3ac0d718eb 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -67,7 +67,7 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock)
 
 #else /* DEBUG_SPINLOCK */
 #define arch_spin_is_locked(lock)	((void)(lock), 0)
-/* for sched.c and kernel_lock.c: */
+/* for sched/core.c and kernel_lock.c: */
 # define arch_spin_lock(lock)		do { barrier(); (void)(lock); } while (0)
 # define arch_spin_lock_flags(lock, flags)	do { barrier(); (void)(lock); } while (0)
 # define arch_spin_unlock(lock)	do { barrier(); (void)(lock); } while (0)
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 0cc74c4403e4..a20a9b4d3871 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -361,7 +361,7 @@ __SYSCALL(__NR_syslog, sys_syslog)
 #define __NR_ptrace 117
 __SYSCALL(__NR_ptrace, sys_ptrace)
 
-/* kernel/sched.c */
+/* kernel/sched/core.c */
 #define __NR_sched_setparam 118
 __SYSCALL(__NR_sched_setparam, sys_sched_setparam)
 #define __NR_sched_setscheduler 119
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b3f791bbe5..902d13fc2b13 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -540,7 +540,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
  * This function builds a partial partition of the systems CPUs
  * A 'partial partition' is a set of non-overlapping subsets whose
  * union is a subset of that set.
- * The output of this function needs to be passed to kernel/sched.c
+ * The output of this function needs to be passed to kernel/sched/core.c
  * partition_sched_domains() routine, which will rebuild the scheduler's
  * load balancing domains (sched domains) as specified by that partial
  * partition.
@@ -569,7 +569,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
  *	   is a subset of one of these domains, while there are as
  *	   many such domains as possible, each as small as possible.
  * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
- *	   the kernel/sched.c routine partition_sched_domains() in a
+ *	   the kernel/sched/core.c routine partition_sched_domains() in a
  *	   convenient format, that can be easily compared to the prior
  *	   value to determine what partition elements (sched domains)
  *	   were changed (added or removed.)
diff --git a/kernel/time.c b/kernel/time.c
index d3617dbd3dca..7c7964c33ae7 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -11,7 +11,7 @@
  * Modification history kernel/time.c
  *
  * 1993-09-02    Philip Gladstone
- *      Created file with time related functions from sched.c and adjtimex()
+ *      Created file with time related functions from sched/core.c and adjtimex()
  * 1993-10-08    Torsten Duwe
  *      adjtime interface update and CMOS clock write code
  * 1995-08-13    Torsten Duwe
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index ad83c96b2ece..7e2204db0b1a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -64,7 +64,7 @@ static inline struct worker *current_wq_worker(void)
 
 /*
  * Scheduler hooks for concurrency managed workqueue.  Only to be used from
- * sched.c and workqueue.c.
+ * sched/core.c and workqueue.c.
  */
 void wq_worker_waking_up(struct task_struct *task, int cpu);
 struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
-- 
cgit v1.3-7-g2ca7


From 135c5612c460f89657c4698fe2ea753f6f667963 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 17 Jun 2013 17:36:51 -0700
Subject: perf/x86/intel: Support Haswell/v4 LBR format

Haswell has two additional LBR from flags for TSX: in_tx and
abort_tx, implemented as a new "v4" version of the LBR format.

Handle those in and adjust the sign extension code to still
correctly extend. The flags are exported similarly in the LBR
record to the existing misprediction flag

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Cc: Andi Kleen <ak@linux.jf.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Link: http://lkml.kernel.org/r/1371515812-9646-6-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 56 +++++++++++++++++++++++++++---
 include/linux/perf_event.h                 |  7 +++-
 include/uapi/linux/perf_event.h            |  5 ++-
 3 files changed, 61 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index de341d4ec92a..d5be06a5005e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -12,6 +12,16 @@ enum {
 	LBR_FORMAT_LIP		= 0x01,
 	LBR_FORMAT_EIP		= 0x02,
 	LBR_FORMAT_EIP_FLAGS	= 0x03,
+	LBR_FORMAT_EIP_FLAGS2	= 0x04,
+	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_EIP_FLAGS2,
+};
+
+static enum {
+	LBR_EIP_FLAGS		= 1,
+	LBR_TSX			= 2,
+} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
+	[LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
+	[LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
 };
 
 /*
@@ -56,6 +66,8 @@ enum {
 	 LBR_FAR)
 
 #define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
+#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
+#define LBR_FROM_FLAG_ABORT    (1ULL << 61)
 
 #define for_each_branch_sample_type(x) \
 	for ((x) = PERF_SAMPLE_BRANCH_USER; \
@@ -81,9 +93,13 @@ enum {
 	X86_BR_JMP      = 1 << 9, /* jump */
 	X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */
 	X86_BR_IND_CALL = 1 << 11,/* indirect calls */
+	X86_BR_ABORT    = 1 << 12,/* transaction abort */
+	X86_BR_IN_TX    = 1 << 13,/* in transaction */
+	X86_BR_NO_TX    = 1 << 14,/* not in transaction */
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
+#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)
 
 #define X86_BR_ANY       \
 	(X86_BR_CALL    |\
@@ -95,6 +111,7 @@ enum {
 	 X86_BR_JCC     |\
 	 X86_BR_JMP	 |\
 	 X86_BR_IRQ	 |\
+	 X86_BR_ABORT	 |\
 	 X86_BR_IND_CALL)
 
 #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
@@ -270,21 +287,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
-		u64 from, to, mis = 0, pred = 0;
+		u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
+		int skip = 0;
+		int lbr_flags = lbr_desc[lbr_format];
 
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
 		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);
 
-		if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
+		if (lbr_flags & LBR_EIP_FLAGS) {
 			mis = !!(from & LBR_FROM_FLAG_MISPRED);
 			pred = !mis;
-			from = (u64)((((s64)from) << 1) >> 1);
+			skip = 1;
+		}
+		if (lbr_flags & LBR_TSX) {
+			in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+			abort = !!(from & LBR_FROM_FLAG_ABORT);
+			skip = 3;
 		}
+		from = (u64)((((s64)from) << skip) >> skip);
 
 		cpuc->lbr_entries[i].from	= from;
 		cpuc->lbr_entries[i].to		= to;
 		cpuc->lbr_entries[i].mispred	= mis;
 		cpuc->lbr_entries[i].predicted	= pred;
+		cpuc->lbr_entries[i].in_tx	= in_tx;
+		cpuc->lbr_entries[i].abort	= abort;
 		cpuc->lbr_entries[i].reserved	= 0;
 	}
 	cpuc->lbr_stack.nr = i;
@@ -334,6 +361,16 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 
 	if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
 		mask |= X86_BR_IND_CALL;
+
+	if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
+		mask |= X86_BR_ABORT;
+
+	if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
+		mask |= X86_BR_IN_TX;
+
+	if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
+		mask |= X86_BR_NO_TX;
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
@@ -408,7 +445,7 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
  * decoded (e.g., text page not present), then X86_BR_NONE is
  * returned.
  */
-static int branch_type(unsigned long from, unsigned long to)
+static int branch_type(unsigned long from, unsigned long to, int abort)
 {
 	struct insn insn;
 	void *addr;
@@ -428,6 +465,9 @@ static int branch_type(unsigned long from, unsigned long to)
 	if (from == 0 || to == 0)
 		return X86_BR_NONE;
 
+	if (abort)
+		return X86_BR_ABORT | to_plm;
+
 	if (from_plm == X86_BR_USER) {
 		/*
 		 * can happen if measuring at the user level only
@@ -574,7 +614,13 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 		from = cpuc->lbr_entries[i].from;
 		to = cpuc->lbr_entries[i].to;
 
-		type = branch_type(from, to);
+		type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+		if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
+			if (cpuc->lbr_entries[i].in_tx)
+				type |= X86_BR_IN_TX;
+			else
+				type |= X86_BR_NO_TX;
+		}
 
 		/* if type does not correspond, then discard */
 		if (type == X86_BR_NONE || (br_sel & type) != type) {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 33e8d65836d6..056f93a7990f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -73,13 +73,18 @@ struct perf_raw_record {
  *
  * support for mispred, predicted is optional. In case it
  * is not supported mispred = predicted = 0.
+ *
+ *     in_tx: running in a hardware transaction
+ *     abort: aborting a hardware transaction
  */
 struct perf_branch_entry {
 	__u64	from;
 	__u64	to;
 	__u64	mispred:1,  /* target mispredicted */
 		predicted:1,/* target predicted */
-		reserved:62;
+		in_tx:1,    /* in transaction */
+		abort:1,    /* transaction abort */
+		reserved:60;
 };
 
 /*
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index fb104e51496e..0b1df41691e8 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -157,8 +157,11 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << 4, /* any call branch */
 	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << 5, /* any return branch */
 	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << 6, /* indirect calls */
+	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << 7, /* transaction aborts */
+	PERF_SAMPLE_BRANCH_IN_TX	= 1U << 8, /* in transaction */
+	PERF_SAMPLE_BRANCH_NO_TX	= 1U << 9, /* not in transaction */
 
-	PERF_SAMPLE_BRANCH_MAX		= 1U << 7, /* non-ABI */
+	PERF_SAMPLE_BRANCH_MAX		= 1U << 10, /* non-ABI */
 };
 
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
-- 
cgit v1.3-7-g2ca7


From bb177fedd348c92c2bea6adc9a2163ebff15272e Mon Sep 17 00:00:00 2001
From: Julius Werner <jwerner@chromium.org>
Date: Wed, 12 Jun 2013 12:55:22 -0700
Subject: PM / Sleep: Print last wakeup source on failed wakeup_count write

Commit a938da06 introduced a useful little log message to tell
users/debuggers which wakeup source aborted a suspend.  However,
this message is only printed if the abort happens during the
in-kernel suspend path (after writing /sys/power/state).

The full specification of the /sys/power/wakeup_count facility
allows user-space power managers to double-check if wakeups have
already happened before it actually tries to suspend (e.g. while it
was running user-space pre-suspend hooks), by writing the last known
wakeup_count value to /sys/power/wakeup_count.  This patch changes
the sysfs handler for that node to also print said log message if
that write fails, so that we can figure out the offending wakeup
source for both kinds of suspend aborts.

Signed-off-by: Julius Werner <jwerner@chromium.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c | 5 +++--
 include/linux/suspend.h     | 1 +
 kernel/power/main.c         | 2 ++
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 407a2efa10bb..2d56f4113ae7 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -659,7 +659,7 @@ void pm_wakeup_event(struct device *dev, unsigned int msec)
 }
 EXPORT_SYMBOL_GPL(pm_wakeup_event);
 
-static void print_active_wakeup_sources(void)
+void pm_print_active_wakeup_sources(void)
 {
 	struct wakeup_source *ws;
 	int active = 0;
@@ -683,6 +683,7 @@ static void print_active_wakeup_sources(void)
 			last_activity_ws->name);
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(pm_print_active_wakeup_sources);
 
 /**
  * pm_wakeup_pending - Check if power transition in progress should be aborted.
@@ -709,7 +710,7 @@ bool pm_wakeup_pending(void)
 
 	if (ret) {
 		pr_info("PM: Wakeup pending, aborting suspend\n");
-		print_active_wakeup_sources();
+		pm_print_active_wakeup_sources();
 	}
 
 	return ret;
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index d4e3f16d5e89..f73cabf59012 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -363,6 +363,7 @@ extern bool pm_wakeup_pending(void);
 extern bool pm_get_wakeup_count(unsigned int *count, bool block);
 extern bool pm_save_wakeup_count(unsigned int count);
 extern void pm_wakep_autosleep_enabled(bool set);
+extern void pm_print_active_wakeup_sources(void);
 
 static inline void lock_system_sleep(void)
 {
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d77663bfedeb..0828070d38b4 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -424,6 +424,8 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
 	if (sscanf(buf, "%u", &val) == 1) {
 		if (pm_save_wakeup_count(val))
 			error = n;
+		else
+			pm_print_active_wakeup_sources();
 	}
 
  out:
-- 
cgit v1.3-7-g2ca7


From 95731ebb114c5f0c028459388560fc2a72fe5049 Mon Sep 17 00:00:00 2001
From: Xiaoguang Chen <chenxg@marvell.com>
Date: Wed, 19 Jun 2013 15:00:07 +0800
Subject: cpufreq: Fix governor start/stop race condition

Cpufreq governors' stop and start operations should be carried out
in sequence.  Otherwise, there will be unexpected behavior, like in
the example below.

Suppose there are 4 CPUs and policy->cpu=CPU0, CPU1/2/3 are linked
to CPU0.  The normal sequence is:

 1) Current governor is userspace.  An application tries to set the
    governor to ondemand.  It will call __cpufreq_set_policy() in
    which it will stop the userspace governor and then start the
    ondemand governor.

 2) Current governor is userspace.  The online of CPU3 runs on CPU0.
    It will call cpufreq_add_policy_cpu() in which it will first
    stop the userspace governor, and then start it again.

If the sequence of the above two cases interleaves, it becomes:

 1) Application stops userspace governor
 2)                                  Hotplug stops userspace governor

which is a problem, because the governor shouldn't be stopped twice
in a row.  What happens next is:

 3) Application starts ondemand governor
 4)                                  Hotplug starts a governor

In step 4, the hotplug is supposed to start the userspace governor,
but now the governor has been changed by the application to ondemand,
so the ondemand governor is started once again, which is incorrect.

The solution is to prevent policy governors from being stopped
multiple times in a row.  A governor should only be stopped once for
one policy.  After it has been stopped, no more governor stop
operations should be executed.

Also add a mutex to serialize governor operations.

[rjw: Changelog.  And you owe me a beverage of my choice.]
Signed-off-by: Xiaoguang Chen <chenxg@marvell.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 24 ++++++++++++++++++++++++
 include/linux/cpufreq.h   |  1 +
 2 files changed, 25 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index f8c28607ccd6..43cf60832468 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -49,6 +49,7 @@ static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
 static DEFINE_PER_CPU(char[CPUFREQ_NAME_LEN], cpufreq_cpu_governor);
 #endif
 static DEFINE_RWLOCK(cpufreq_driver_lock);
+static DEFINE_MUTEX(cpufreq_governor_lock);
 
 /*
  * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
@@ -1635,6 +1636,21 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
 
 	pr_debug("__cpufreq_governor for CPU %u, event %u\n",
 						policy->cpu, event);
+
+	mutex_lock(&cpufreq_governor_lock);
+	if ((!policy->governor_enabled && (event == CPUFREQ_GOV_STOP)) ||
+	    (policy->governor_enabled && (event == CPUFREQ_GOV_START))) {
+		mutex_unlock(&cpufreq_governor_lock);
+		return -EBUSY;
+	}
+
+	if (event == CPUFREQ_GOV_STOP)
+		policy->governor_enabled = false;
+	else if (event == CPUFREQ_GOV_START)
+		policy->governor_enabled = true;
+
+	mutex_unlock(&cpufreq_governor_lock);
+
 	ret = policy->governor->governor(policy, event);
 
 	if (!ret) {
@@ -1642,6 +1658,14 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
 			policy->governor->initialized++;
 		else if (event == CPUFREQ_GOV_POLICY_EXIT)
 			policy->governor->initialized--;
+	} else {
+		/* Restore original values */
+		mutex_lock(&cpufreq_governor_lock);
+		if (event == CPUFREQ_GOV_STOP)
+			policy->governor_enabled = true;
+		else if (event == CPUFREQ_GOV_START)
+			policy->governor_enabled = false;
+		mutex_unlock(&cpufreq_governor_lock);
 	}
 
 	/* we keep one module reference alive for
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index d93905633dc7..125719d41285 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -111,6 +111,7 @@ struct cpufreq_policy {
 	unsigned int		policy; /* see above */
 	struct cpufreq_governor	*governor; /* see below */
 	void			*governor_data;
+	bool			governor_enabled; /* governor start/stop flag */
 
 	struct work_struct	update; /* if update_policy() needs to be
 					 * called, but you're in IRQ context */
-- 
cgit v1.3-7-g2ca7


From bb176f7d038fee4d46b3293e64e173bfb05ab7b5 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 19 Jun 2013 14:19:33 +0530
Subject: cpufreq: Fix minor formatting issues

There were a few noticeable formatting issues in core cpufreq code.
This cleans them up to make code look better.  The changes include:
 - Whitespace cleanup.
 - Rearrangements of code.
 - Multiline comments fixes.
 - Formatting changes to fit 80 columns.

Copyright information in cpufreq.c is also updated to include my name
for 2013.

[rjw: Changelog]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c             | 63 +++++++++++++++--------------------
 drivers/cpufreq/cpufreq_governor.h    |  4 +--
 drivers/cpufreq/cpufreq_performance.c |  4 ---
 drivers/cpufreq/cpufreq_powersave.c   |  6 ++--
 drivers/cpufreq/cpufreq_stats.c       |  4 +--
 drivers/cpufreq/cpufreq_userspace.c   |  4 ---
 include/linux/cpufreq.h               | 41 +++++++++++------------
 7 files changed, 53 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 43cf60832468..075edeff358c 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -3,6 +3,7 @@
  *
  *  Copyright (C) 2001 Russell King
  *            (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
+ *            (C) 2013 Viresh Kumar <viresh.kumar@linaro.org>
  *
  *  Oct 2005 - Ashok Raj <ashok.raj@intel.com>
  *	Added handling for CPU hotplug
@@ -12,7 +13,6 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
- *
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -44,12 +44,13 @@
  */
 static struct cpufreq_driver *cpufreq_driver;
 static DEFINE_PER_CPU(struct cpufreq_policy *, cpufreq_cpu_data);
+static DEFINE_RWLOCK(cpufreq_driver_lock);
+static DEFINE_MUTEX(cpufreq_governor_lock);
+
 #ifdef CONFIG_HOTPLUG_CPU
 /* This one keeps track of the previously set governor of a removed CPU */
 static DEFINE_PER_CPU(char[CPUFREQ_NAME_LEN], cpufreq_cpu_governor);
 #endif
-static DEFINE_RWLOCK(cpufreq_driver_lock);
-static DEFINE_MUTEX(cpufreq_governor_lock);
 
 /*
  * cpu_policy_rwsem is a per CPU reader-writer semaphore designed to cure
@@ -199,7 +200,6 @@ static struct cpufreq_policy *__cpufreq_cpu_get(unsigned int cpu, bool sysfs)
 	if (!try_module_get(cpufreq_driver->owner))
 		goto err_out_unlock;
 
-
 	/* get the CPU */
 	data = per_cpu(cpufreq_cpu_data, cpu);
 
@@ -269,7 +269,7 @@ static void cpufreq_cpu_put_sysfs(struct cpufreq_policy *data)
  */
 #ifndef CONFIG_SMP
 static unsigned long l_p_j_ref;
-static unsigned int  l_p_j_ref_freq;
+static unsigned int l_p_j_ref_freq;
 
 static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
 {
@@ -282,7 +282,7 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
 		pr_debug("saving %lu as reference value for loops_per_jiffy; "
 			"freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq);
 	}
-	if ((val == CPUFREQ_POSTCHANGE  && ci->old != ci->new) ||
+	if ((val == CPUFREQ_POSTCHANGE && ci->old != ci->new) ||
 	    (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE)) {
 		loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq,
 								ci->new);
@@ -297,7 +297,6 @@ static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci)
 }
 #endif
 
-
 void __cpufreq_notify_transition(struct cpufreq_policy *policy,
 		struct cpufreq_freqs *freqs, unsigned int state)
 {
@@ -343,6 +342,7 @@ void __cpufreq_notify_transition(struct cpufreq_policy *policy,
 		break;
 	}
 }
+
 /**
  * cpufreq_notify_transition - call notifier chain and adjust_jiffies
  * on frequency transition.
@@ -360,7 +360,6 @@ void cpufreq_notify_transition(struct cpufreq_policy *policy,
 EXPORT_SYMBOL_GPL(cpufreq_notify_transition);
 
 
-
 /*********************************************************************
  *                          SYSFS INTERFACE                          *
  *********************************************************************/
@@ -425,7 +424,6 @@ out:
 	return err;
 }
 
-
 /**
  * cpufreq_per_cpu_attr_read() / show_##file_name() -
  * print out cpufreq information
@@ -490,7 +488,6 @@ static ssize_t show_cpuinfo_cur_freq(struct cpufreq_policy *policy,
 	return sprintf(buf, "%u\n", cur_freq);
 }
 
-
 /**
  * show_scaling_governor - show the current policy for the specified CPU
  */
@@ -506,7 +503,6 @@ static ssize_t show_scaling_governor(struct cpufreq_policy *policy, char *buf)
 	return -EINVAL;
 }
 
-
 /**
  * store_scaling_governor - store policy for the specified CPU
  */
@@ -529,8 +525,10 @@ static ssize_t store_scaling_governor(struct cpufreq_policy *policy,
 						&new_policy.governor))
 		return -EINVAL;
 
-	/* Do not use cpufreq_set_policy here or the user_policy.max
-	   will be wrongly overridden */
+	/*
+	 * Do not use cpufreq_set_policy here or the user_policy.max
+	 * will be wrongly overridden
+	 */
 	ret = __cpufreq_set_policy(policy, &new_policy);
 
 	policy->user_policy.policy = policy->policy;
@@ -1094,7 +1092,8 @@ static void update_policy_cpu(struct cpufreq_policy *policy, unsigned int cpu)
  * Caller should already have policy_rwsem in write mode for this CPU.
  * This routine frees the rwsem before returning.
  */
-static int __cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
+static int __cpufreq_remove_dev(struct device *dev,
+		struct subsys_interface *sif)
 {
 	unsigned int cpu = dev->id, ret, cpus;
 	unsigned long flags;
@@ -1201,7 +1200,6 @@ static int __cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif
 	return 0;
 }
 
-
 static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 {
 	unsigned int cpu = dev->id;
@@ -1214,7 +1212,6 @@ static int cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
 	return retval;
 }
 
-
 static void handle_update(struct work_struct *work)
 {
 	struct cpufreq_policy *policy =
@@ -1225,7 +1222,8 @@ static void handle_update(struct work_struct *work)
 }
 
 /**
- *	cpufreq_out_of_sync - If actual and saved CPU frequency differs, we're in deep trouble.
+ *	cpufreq_out_of_sync - If actual and saved CPU frequency differs, we're
+ *	in deep trouble.
  *	@cpu: cpu number
  *	@old_freq: CPU frequency the kernel thinks the CPU runs at
  *	@new_freq: CPU frequency the CPU actually runs at
@@ -1240,7 +1238,6 @@ static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq,
 	struct cpufreq_freqs freqs;
 	unsigned long flags;
 
-
 	pr_debug("Warning: CPU frequency out of sync: cpufreq and timing "
 	       "core thinks of %u, is %u kHz.\n", old_freq, new_freq);
 
@@ -1255,7 +1252,6 @@ static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq,
 	cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE);
 }
 
-
 /**
  * cpufreq_quick_get - get the CPU frequency (in kHz) from policy->cur
  * @cpu: CPU number
@@ -1301,7 +1297,6 @@ unsigned int cpufreq_quick_get_max(unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_quick_get_max);
 
-
 static unsigned int __cpufreq_get(unsigned int cpu)
 {
 	struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);
@@ -1360,7 +1355,6 @@ static struct subsys_interface cpufreq_interface = {
 	.remove_dev	= cpufreq_remove_dev,
 };
 
-
 /**
  * cpufreq_bp_suspend - Prepare the boot CPU for system suspend.
  *
@@ -1497,11 +1491,10 @@ int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list)
 }
 EXPORT_SYMBOL(cpufreq_register_notifier);
 
-
 /**
  *	cpufreq_unregister_notifier - unregister a driver with cpufreq
  *	@nb: notifier block to be unregistered
- *      @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER
+ *	@list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER
  *
  *	Remove a driver from the CPU frequency notifier list.
  *
@@ -1537,7 +1530,6 @@ EXPORT_SYMBOL(cpufreq_unregister_notifier);
  *                              GOVERNORS                            *
  *********************************************************************/
 
-
 int __cpufreq_driver_target(struct cpufreq_policy *policy,
 			    unsigned int target_freq,
 			    unsigned int relation)
@@ -1678,7 +1670,6 @@ static int __cpufreq_governor(struct cpufreq_policy *policy,
 	return ret;
 }
 
-
 int cpufreq_register_governor(struct cpufreq_governor *governor)
 {
 	int err;
@@ -1703,7 +1694,6 @@ int cpufreq_register_governor(struct cpufreq_governor *governor)
 }
 EXPORT_SYMBOL_GPL(cpufreq_register_governor);
 
-
 void cpufreq_unregister_governor(struct cpufreq_governor *governor)
 {
 #ifdef CONFIG_HOTPLUG_CPU
@@ -1733,7 +1723,6 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor)
 EXPORT_SYMBOL_GPL(cpufreq_unregister_governor);
 
 
-
 /*********************************************************************
  *                          POLICY INTERFACE                         *
  *********************************************************************/
@@ -1762,7 +1751,6 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_get_policy);
 
-
 /*
  * data   : current policy.
  * policy : policy to be set.
@@ -1796,8 +1784,10 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data,
 	blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
 			CPUFREQ_INCOMPATIBLE, policy);
 
-	/* verify the cpu speed can be set within this limit,
-	   which might be different to the first one */
+	/*
+	 * verify the cpu speed can be set within this limit, which might be
+	 * different to the first one
+	 */
 	ret = cpufreq_driver->verify(policy);
 	if (ret)
 		goto error_out;
@@ -1899,8 +1889,10 @@ int cpufreq_update_policy(unsigned int cpu)
 	policy.policy = data->user_policy.policy;
 	policy.governor = data->user_policy.governor;
 
-	/* BIOS might change freq behind our back
-	  -> ask driver for current freq and notify governors about a change */
+	/*
+	 * BIOS might change freq behind our back
+	 * -> ask driver for current freq and notify governors about a change
+	 */
 	if (cpufreq_driver->get) {
 		policy.cur = cpufreq_driver->get(cpu);
 		if (!data->cur) {
@@ -1949,7 +1941,7 @@ static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
 }
 
 static struct notifier_block __refdata cpufreq_cpu_notifier = {
-    .notifier_call = cpufreq_cpu_callback,
+	.notifier_call = cpufreq_cpu_callback,
 };
 
 /*********************************************************************
@@ -1961,7 +1953,7 @@ static struct notifier_block __refdata cpufreq_cpu_notifier = {
  * @driver_data: A struct cpufreq_driver containing the values#
  * submitted by the CPU Frequency driver.
  *
- *   Registers a CPU Frequency driver to this core code. This code
+ * Registers a CPU Frequency driver to this core code. This code
  * returns zero on success, -EBUSY when another driver got here first
  * (and isn't unregistered in the meantime).
  *
@@ -2028,11 +2020,10 @@ err_null_driver:
 }
 EXPORT_SYMBOL_GPL(cpufreq_register_driver);
 
-
 /**
  * cpufreq_unregister_driver - unregister the current CPUFreq driver
  *
- *    Unregister the current CPUFreq driver. Only call this if you have
+ * Unregister the current CPUFreq driver. Only call this if you have
  * the right to do so, i.e. if you have succeeded in initialising before!
  * Returns zero if successful, and -EINVAL if the cpufreq_driver is
  * currently not initialised.
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index e7bbf767380d..6663ec3b3056 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -81,7 +81,7 @@ static ssize_t show_##file_name##_gov_sys				\
 	return sprintf(buf, "%u\n", tuners->file_name);			\
 }									\
 									\
-static ssize_t show_##file_name##_gov_pol					\
+static ssize_t show_##file_name##_gov_pol				\
 (struct cpufreq_policy *policy, char *buf)				\
 {									\
 	struct dbs_data *dbs_data = policy->governor_data;		\
@@ -91,7 +91,7 @@ static ssize_t show_##file_name##_gov_pol					\
 
 #define store_one(_gov, file_name)					\
 static ssize_t store_##file_name##_gov_sys				\
-(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count)	\
+(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) \
 {									\
 	struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data;		\
 	return store_##file_name(dbs_data, buf, count);			\
diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c
index ceee06849b91..9fef7d6e4e6a 100644
--- a/drivers/cpufreq/cpufreq_performance.c
+++ b/drivers/cpufreq/cpufreq_performance.c
@@ -17,7 +17,6 @@
 #include <linux/cpufreq.h>
 #include <linux/init.h>
 
-
 static int cpufreq_governor_performance(struct cpufreq_policy *policy,
 					unsigned int event)
 {
@@ -44,19 +43,16 @@ struct cpufreq_governor cpufreq_gov_performance = {
 	.owner		= THIS_MODULE,
 };
 
-
 static int __init cpufreq_gov_performance_init(void)
 {
 	return cpufreq_register_governor(&cpufreq_gov_performance);
 }
 
-
 static void __exit cpufreq_gov_performance_exit(void)
 {
 	cpufreq_unregister_governor(&cpufreq_gov_performance);
 }
 
-
 MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'performance'");
 MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c
index 2d948a171155..32109a14f5dc 100644
--- a/drivers/cpufreq/cpufreq_powersave.c
+++ b/drivers/cpufreq/cpufreq_powersave.c
@@ -1,7 +1,7 @@
 /*
- *  linux/drivers/cpufreq/cpufreq_powersave.c
+ * linux/drivers/cpufreq/cpufreq_powersave.c
  *
- *  Copyright (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
+ * Copyright (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
  *
  *
  * This program is free software; you can redistribute it and/or modify
@@ -48,13 +48,11 @@ static int __init cpufreq_gov_powersave_init(void)
 	return cpufreq_register_governor(&cpufreq_gov_powersave);
 }
 
-
 static void __exit cpufreq_gov_powersave_exit(void)
 {
 	cpufreq_unregister_governor(&cpufreq_gov_powersave);
 }
 
-
 MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'powersave'");
 MODULE_LICENSE("GPL");
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index fb65decffa28..6d35caa91167 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -27,7 +27,7 @@ static spinlock_t cpufreq_stats_lock;
 struct cpufreq_stats {
 	unsigned int cpu;
 	unsigned int total_trans;
-	unsigned long long  last_time;
+	unsigned long long last_time;
 	unsigned int max_state;
 	unsigned int state_num;
 	unsigned int last_index;
@@ -116,7 +116,7 @@ static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
 		len += snprintf(buf + len, PAGE_SIZE - len, "%9u: ",
 				stat->freq_table[i]);
 
-		for (j = 0; j < stat->state_num; j++)   {
+		for (j = 0; j < stat->state_num; j++) {
 			if (len >= PAGE_SIZE)
 				break;
 			len += snprintf(buf + len, PAGE_SIZE - len, "%9u ",
diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c
index 5dc77b7a7594..03078090b5f7 100644
--- a/drivers/cpufreq/cpufreq_userspace.c
+++ b/drivers/cpufreq/cpufreq_userspace.c
@@ -55,7 +55,6 @@ static int cpufreq_set(struct cpufreq_policy *policy, unsigned int freq)
 	return ret;
 }
 
-
 static ssize_t show_speed(struct cpufreq_policy *policy, char *buf)
 {
 	return sprintf(buf, "%u\n", policy->cur);
@@ -101,7 +100,6 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy,
 	return rc;
 }
 
-
 #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE
 static
 #endif
@@ -118,13 +116,11 @@ static int __init cpufreq_gov_userspace_init(void)
 	return cpufreq_register_governor(&cpufreq_gov_userspace);
 }
 
-
 static void __exit cpufreq_gov_userspace_exit(void)
 {
 	cpufreq_unregister_governor(&cpufreq_gov_userspace);
 }
 
-
 MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>, "
 		"Russell King <rmk@arm.linux.org.uk>");
 MODULE_DESCRIPTION("CPUfreq policy governor 'userspace'");
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 125719d41285..3c7ee2f90370 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -1,8 +1,8 @@
 /*
- *  linux/include/linux/cpufreq.h
+ * linux/include/linux/cpufreq.h
  *
- *  Copyright (C) 2001 Russell King
- *            (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
+ * Copyright (C) 2001 Russell King
+ *           (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -26,7 +26,6 @@
 /* Print length for names. Extra 1 space for accomodating '\n' in prints */
 #define CPUFREQ_NAME_PLEN (CPUFREQ_NAME_LEN + 1)
 
-
 /*********************************************************************
  *                     CPUFREQ NOTIFIER INTERFACE                    *
  *********************************************************************/
@@ -153,17 +152,18 @@ struct cpufreq_freqs {
 	u8 flags;		/* flags of cpufreq_driver, see below. */
 };
 
-
 /**
- * cpufreq_scale - "old * mult / div" calculation for large values (32-bit-arch safe)
+ * cpufreq_scale - "old * mult / div" calculation for large values (32-bit-arch
+ * safe)
  * @old:   old value
  * @div:   divisor
  * @mult:  multiplier
  *
  *
- *    new = old * mult / div
+ * new = old * mult / div
  */
-static inline unsigned long cpufreq_scale(unsigned long old, u_int div, u_int mult)
+static inline unsigned long cpufreq_scale(unsigned long old, u_int div,
+		u_int mult)
 {
 #if BITS_PER_LONG == 32
 
@@ -216,14 +216,12 @@ extern int __cpufreq_driver_target(struct cpufreq_policy *policy,
 				   unsigned int target_freq,
 				   unsigned int relation);
 
-
 extern int __cpufreq_driver_getavg(struct cpufreq_policy *policy,
 				   unsigned int cpu);
 
 int cpufreq_register_governor(struct cpufreq_governor *governor);
 void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 
-
 /*********************************************************************
  *                      CPUFREQ DRIVER INTERFACE                     *
  *********************************************************************/
@@ -234,7 +232,7 @@ void cpufreq_unregister_governor(struct cpufreq_governor *governor);
 struct freq_attr;
 
 struct cpufreq_driver {
-	struct module           *owner;
+	struct module		*owner;
 	char			name[CPUFREQ_NAME_LEN];
 	u8			flags;
 	/*
@@ -282,11 +280,11 @@ struct cpufreq_driver {
 int cpufreq_register_driver(struct cpufreq_driver *driver_data);
 int cpufreq_unregister_driver(struct cpufreq_driver *driver_data);
 
-
 void cpufreq_notify_transition(struct cpufreq_policy *policy,
 		struct cpufreq_freqs *freqs, unsigned int state);
 
-static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max)
+static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy,
+		unsigned int min, unsigned int max)
 {
 	if (policy->min < min)
 		policy->min = min;
@@ -349,7 +347,9 @@ bool have_governor_per_policy(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 
 #ifdef CONFIG_CPU_FREQ
-/* query the current CPU frequency (in kHz). If zero, cpufreq couldn't detect it */
+/*
+ * query the current CPU frequency (in kHz). If zero, cpufreq couldn't detect it
+ */
 unsigned int cpufreq_get(unsigned int cpu);
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
@@ -358,7 +358,9 @@ static inline unsigned int cpufreq_get(unsigned int cpu)
 }
 #endif
 
-/* query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it */
+/*
+ * query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it
+ */
 #ifdef CONFIG_CPU_FREQ
 unsigned int cpufreq_quick_get(unsigned int cpu);
 unsigned int cpufreq_quick_get_max(unsigned int cpu);
@@ -373,16 +375,14 @@ static inline unsigned int cpufreq_quick_get_max(unsigned int cpu)
 }
 #endif
 
-
 /*********************************************************************
  *                       CPUFREQ DEFAULT GOVERNOR                    *
  *********************************************************************/
 
-
 /*
-  Performance governor is fallback governor if any other gov failed to
-  auto load due latency restrictions
-*/
+ * Performance governor is fallback governor if any other gov failed to auto
+ * load due latency restrictions
+ */
 #ifdef CONFIG_CPU_FREQ_GOV_PERFORMANCE
 extern struct cpufreq_governor cpufreq_gov_performance;
 #endif
@@ -402,7 +402,6 @@ extern struct cpufreq_governor cpufreq_gov_conservative;
 #define CPUFREQ_DEFAULT_GOVERNOR	(&cpufreq_gov_conservative)
 #endif
 
-
 /*********************************************************************
  *                     FREQUENCY TABLE HELPERS                       *
  *********************************************************************/
-- 
cgit v1.3-7-g2ca7


From 9d263813c27e2ad3da7ea0877e623f4ff8767ddd Mon Sep 17 00:00:00 2001
From: Alexander Shiyan <shc_work@mail.ru>
Date: Mon, 10 Jun 2013 09:59:30 -0700
Subject: leds: leds-mc13783: Prepare driver to support MC13892 LEDs

This patch rewrite driver code to be ready to add support for
MC13892 LEDs and probe from devicetree.

(cooloney@gmail.com: fix one coding style issue when apply this patch)

Signed-off-by: Alexander Shiyan <shc_work@mail.ru>
Tested-by: Philippe Retornaz <philippe.retornaz@epfl.ch>
Signed-off-by: Bryan Wu <cooloney@gmail.com>
---
 arch/arm/mach-imx/mach-mx31moboard.c |   9 +-
 drivers/leds/leds-mc13783.c          | 406 ++++++++++++++---------------------
 include/linux/mfd/mc13xxx.h          |  93 ++++----
 3 files changed, 204 insertions(+), 304 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mach-imx/mach-mx31moboard.c b/arch/arm/mach-imx/mach-mx31moboard.c
index dae4cd7be040..6f424eced181 100644
--- a/arch/arm/mach-imx/mach-mx31moboard.c
+++ b/arch/arm/mach-imx/mach-mx31moboard.c
@@ -268,10 +268,11 @@ static struct mc13xxx_led_platform_data moboard_led[] = {
 static struct mc13xxx_leds_platform_data moboard_leds = {
 	.num_leds = ARRAY_SIZE(moboard_led),
 	.led = moboard_led,
-	.flags = MC13783_LED_SLEWLIMTC,
-	.abmode = MC13783_LED_AB_DISABLED,
-	.tc1_period = MC13783_LED_PERIOD_10MS,
-	.tc2_period = MC13783_LED_PERIOD_10MS,
+	.led_control[0]	= MC13783_LED_C0_ENABLE | MC13783_LED_C0_ABMODE(0),
+	.led_control[1]	= MC13783_LED_C1_SLEWLIM,
+	.led_control[2]	= MC13783_LED_C2_SLEWLIM,
+	.led_control[3]	= MC13783_LED_C3_PERIOD(0),
+	.led_control[4]	= MC13783_LED_C3_PERIOD(0),
 };
 
 static struct mc13xxx_buttons_platform_data moboard_buttons = {
diff --git a/drivers/leds/leds-mc13783.c b/drivers/leds/leds-mc13783.c
index ea8fc5d432da..da8ec244a641 100644
--- a/drivers/leds/leds-mc13783.c
+++ b/drivers/leds/leds-mc13783.c
@@ -22,9 +22,16 @@
 #include <linux/leds.h>
 #include <linux/workqueue.h>
 #include <linux/mfd/mc13xxx.h>
-#include <linux/slab.h>
 
-struct mc13783_led {
+#define MC13XXX_REG_LED_CONTROL(x)	(51 + (x))
+
+struct mc13xxx_led_devtype {
+	int	led_min;
+	int	led_max;
+	int	num_regs;
+};
+
+struct mc13xxx_led {
 	struct led_classdev	cdev;
 	struct work_struct	work;
 	struct mc13xxx		*master;
@@ -32,66 +39,35 @@ struct mc13783_led {
 	int			id;
 };
 
-#define MC13783_REG_LED_CONTROL_0	51
-#define MC13783_LED_C0_ENABLE_BIT	(1 << 0)
-#define MC13783_LED_C0_TRIODE_MD_BIT	(1 << 7)
-#define MC13783_LED_C0_TRIODE_AD_BIT	(1 << 8)
-#define MC13783_LED_C0_TRIODE_KP_BIT	(1 << 9)
-#define MC13783_LED_C0_BOOST_BIT	(1 << 10)
-#define MC13783_LED_C0_ABMODE_MASK	0x7
-#define MC13783_LED_C0_ABMODE		11
-#define MC13783_LED_C0_ABREF_MASK	0x3
-#define MC13783_LED_C0_ABREF		14
-
-#define MC13783_REG_LED_CONTROL_1	52
-#define MC13783_LED_C1_TC1HALF_BIT	(1 << 18)
-
-#define MC13783_REG_LED_CONTROL_2	53
-#define MC13783_LED_C2_BL_P_MASK	0xf
-#define MC13783_LED_C2_MD_P		9
-#define MC13783_LED_C2_AD_P		13
-#define MC13783_LED_C2_KP_P		17
-#define MC13783_LED_C2_BL_C_MASK	0x7
-#define MC13783_LED_C2_MD_C		0
-#define MC13783_LED_C2_AD_C		3
-#define MC13783_LED_C2_KP_C		6
-
-#define MC13783_REG_LED_CONTROL_3	54
-#define MC13783_LED_C3_TC_P		6
-#define MC13783_LED_C3_TC_P_MASK	0x1f
-
-#define MC13783_REG_LED_CONTROL_4	55
-#define MC13783_REG_LED_CONTROL_5	56
-
-#define MC13783_LED_Cx_PERIOD		21
-#define MC13783_LED_Cx_PERIOD_MASK	0x3
-#define MC13783_LED_Cx_SLEWLIM_BIT      (1 << 23)
-#define MC13783_LED_Cx_TRIODE_TC_BIT	(1 << 23)
-#define MC13783_LED_Cx_TC_C_MASK	0x3
-
-static void mc13783_led_work(struct work_struct *work)
+struct mc13xxx_leds {
+	struct mc13xxx_led_devtype	*devtype;
+	int				num_leds;
+	struct mc13xxx_led		led[0];
+};
+
+static void mc13xxx_led_work(struct work_struct *work)
 {
-	struct mc13783_led *led = container_of(work, struct mc13783_led, work);
-	int reg = 0;
-	int mask = 0;
-	int value = 0;
-	int bank, off, shift;
+	struct mc13xxx_led *led = container_of(work, struct mc13xxx_led, work);
+	int reg, mask, value, bank, off, shift;
 
 	switch (led->id) {
 	case MC13783_LED_MD:
-		reg = MC13783_REG_LED_CONTROL_2;
-		mask = MC13783_LED_C2_BL_P_MASK << MC13783_LED_C2_MD_P;
-		value = (led->new_brightness >> 4) << MC13783_LED_C2_MD_P;
+		reg = MC13XXX_REG_LED_CONTROL(2);
+		shift = 9;
+		mask = 0x0f;
+		value = led->new_brightness >> 4;
 		break;
 	case MC13783_LED_AD:
-		reg = MC13783_REG_LED_CONTROL_2;
-		mask = MC13783_LED_C2_BL_P_MASK << MC13783_LED_C2_AD_P;
-		value = (led->new_brightness >> 4) << MC13783_LED_C2_AD_P;
+		reg = MC13XXX_REG_LED_CONTROL(2);
+		shift = 13;
+		mask = 0x0f;
+		value = led->new_brightness >> 4;
 		break;
 	case MC13783_LED_KP:
-		reg = MC13783_REG_LED_CONTROL_2;
-		mask = MC13783_LED_C2_BL_P_MASK << MC13783_LED_C2_KP_P;
-		value = (led->new_brightness >> 4) << MC13783_LED_C2_KP_P;
+		reg = MC13XXX_REG_LED_CONTROL(2);
+		shift = 17;
+		mask = 0x0f;
+		value = led->new_brightness >> 4;
 		break;
 	case MC13783_LED_R1:
 	case MC13783_LED_G1:
@@ -103,57 +79,50 @@ static void mc13783_led_work(struct work_struct *work)
 	case MC13783_LED_G3:
 	case MC13783_LED_B3:
 		off = led->id - MC13783_LED_R1;
-		bank = off/3;
-		reg = MC13783_REG_LED_CONTROL_3 + off/3;
-		shift = (off - bank * 3) * 5 + MC13783_LED_C3_TC_P;
-		value = (led->new_brightness >> 3) << shift;
-		mask = MC13783_LED_C3_TC_P_MASK << shift;
+		bank = off / 3;
+		reg = MC13XXX_REG_LED_CONTROL(3) + bank;
+		shift = (off - bank * 3) * 5 + 6;
+		value = led->new_brightness >> 3;
+		mask = 0x1f;
 		break;
+	default:
+		BUG();
 	}
 
 	mc13xxx_lock(led->master);
-
-	mc13xxx_reg_rmw(led->master, reg, mask, value);
-
+	mc13xxx_reg_rmw(led->master, reg, mask << shift, value << shift);
 	mc13xxx_unlock(led->master);
 }
 
-static void mc13783_led_set(struct led_classdev *led_cdev,
-			   enum led_brightness value)
+static void mc13xxx_led_set(struct led_classdev *led_cdev,
+			    enum led_brightness value)
 {
-	struct mc13783_led *led;
+	struct mc13xxx_led *led =
+		container_of(led_cdev, struct mc13xxx_led, cdev);
 
-	led = container_of(led_cdev, struct mc13783_led, cdev);
 	led->new_brightness = value;
 	schedule_work(&led->work);
 }
 
-static int mc13783_led_setup(struct mc13783_led *led, int max_current)
+static int __init mc13xxx_led_setup(struct mc13xxx_led *led, int max_current)
 {
-	int shift = 0;
-	int mask = 0;
-	int value = 0;
-	int reg = 0;
-	int ret, bank;
+	int shift, mask, reg, ret, bank;
 
 	switch (led->id) {
 	case MC13783_LED_MD:
-		shift = MC13783_LED_C2_MD_C;
-		mask = MC13783_LED_C2_BL_C_MASK;
-		value = max_current & MC13783_LED_C2_BL_C_MASK;
-		reg = MC13783_REG_LED_CONTROL_2;
+		reg = MC13XXX_REG_LED_CONTROL(2);
+		shift = 0;
+		mask = 0x07;
 		break;
 	case MC13783_LED_AD:
-		shift = MC13783_LED_C2_AD_C;
-		mask = MC13783_LED_C2_BL_C_MASK;
-		value = max_current & MC13783_LED_C2_BL_C_MASK;
-		reg = MC13783_REG_LED_CONTROL_2;
+		reg = MC13XXX_REG_LED_CONTROL(2);
+		shift = 3;
+		mask = 0x07;
 		break;
 	case MC13783_LED_KP:
-		shift = MC13783_LED_C2_KP_C;
-		mask = MC13783_LED_C2_BL_C_MASK;
-		value = max_current & MC13783_LED_C2_BL_C_MASK;
-		reg = MC13783_REG_LED_CONTROL_2;
+		reg = MC13XXX_REG_LED_CONTROL(2);
+		shift = 6;
+		mask = 0x07;
 		break;
 	case MC13783_LED_R1:
 	case MC13783_LED_G1:
@@ -164,228 +133,165 @@ static int mc13783_led_setup(struct mc13783_led *led, int max_current)
 	case MC13783_LED_R3:
 	case MC13783_LED_G3:
 	case MC13783_LED_B3:
-		bank = (led->id - MC13783_LED_R1)/3;
-		reg = MC13783_REG_LED_CONTROL_3 + bank;
+		bank = (led->id - MC13783_LED_R1) / 3;
+		reg = MC13XXX_REG_LED_CONTROL(3) + bank;
 		shift = ((led->id - MC13783_LED_R1) - bank * 3) * 2;
-		mask = MC13783_LED_Cx_TC_C_MASK;
-		value = max_current & MC13783_LED_Cx_TC_C_MASK;
+		mask = 0x03;
 		break;
+	default:
+		BUG();
 	}
 
 	mc13xxx_lock(led->master);
-
 	ret = mc13xxx_reg_rmw(led->master, reg, mask << shift,
-						value << shift);
-
+			      max_current << shift);
 	mc13xxx_unlock(led->master);
-	return ret;
-}
-
-static int mc13783_leds_prepare(struct platform_device *pdev)
-{
-	struct mc13xxx_leds_platform_data *pdata = dev_get_platdata(&pdev->dev);
-	struct mc13xxx *dev = dev_get_drvdata(pdev->dev.parent);
-	int ret = 0;
-	int reg = 0;
-
-	mc13xxx_lock(dev);
-
-	if (pdata->flags & MC13783_LED_TC1HALF)
-		reg |= MC13783_LED_C1_TC1HALF_BIT;
-
-	if (pdata->flags & MC13783_LED_SLEWLIMTC)
-		reg |= MC13783_LED_Cx_SLEWLIM_BIT;
-
-	ret = mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_1, reg);
-	if (ret)
-		goto out;
-
-	reg = (pdata->bl_period & MC13783_LED_Cx_PERIOD_MASK) <<
-							MC13783_LED_Cx_PERIOD;
-
-	if (pdata->flags & MC13783_LED_SLEWLIMBL)
-		reg |= MC13783_LED_Cx_SLEWLIM_BIT;
-
-	ret = mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_2, reg);
-	if (ret)
-		goto out;
-
-	reg = (pdata->tc1_period & MC13783_LED_Cx_PERIOD_MASK) <<
-							MC13783_LED_Cx_PERIOD;
-
-	if (pdata->flags & MC13783_LED_TRIODE_TC1)
-		reg |= MC13783_LED_Cx_TRIODE_TC_BIT;
-
-	ret = mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_3, reg);
-	if (ret)
-		goto out;
 
-	reg = (pdata->tc2_period & MC13783_LED_Cx_PERIOD_MASK) <<
-							MC13783_LED_Cx_PERIOD;
-
-	if (pdata->flags & MC13783_LED_TRIODE_TC2)
-		reg |= MC13783_LED_Cx_TRIODE_TC_BIT;
-
-	ret = mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_4, reg);
-	if (ret)
-		goto out;
-
-	reg = (pdata->tc3_period & MC13783_LED_Cx_PERIOD_MASK) <<
-							MC13783_LED_Cx_PERIOD;
-
-	if (pdata->flags & MC13783_LED_TRIODE_TC3)
-		reg |= MC13783_LED_Cx_TRIODE_TC_BIT;
-
-	ret = mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_5, reg);
-	if (ret)
-		goto out;
-
-	reg = MC13783_LED_C0_ENABLE_BIT;
-	if (pdata->flags & MC13783_LED_TRIODE_MD)
-		reg |= MC13783_LED_C0_TRIODE_MD_BIT;
-	if (pdata->flags & MC13783_LED_TRIODE_AD)
-		reg |= MC13783_LED_C0_TRIODE_AD_BIT;
-	if (pdata->flags & MC13783_LED_TRIODE_KP)
-		reg |= MC13783_LED_C0_TRIODE_KP_BIT;
-	if (pdata->flags & MC13783_LED_BOOST_EN)
-		reg |= MC13783_LED_C0_BOOST_BIT;
-
-	reg |= (pdata->abmode & MC13783_LED_C0_ABMODE_MASK) <<
-							MC13783_LED_C0_ABMODE;
-	reg |= (pdata->abref & MC13783_LED_C0_ABREF_MASK) <<
-							MC13783_LED_C0_ABREF;
-
-	ret = mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_0, reg);
-
-out:
-	mc13xxx_unlock(dev);
 	return ret;
 }
 
-static int mc13783_led_probe(struct platform_device *pdev)
+static int __init mc13xxx_led_probe(struct platform_device *pdev)
 {
 	struct mc13xxx_leds_platform_data *pdata = dev_get_platdata(&pdev->dev);
-	struct mc13xxx_led_platform_data *led_cur;
-	struct mc13783_led *led, *led_dat;
-	int ret, i;
-	int init_led = 0;
-
-	if (pdata == NULL) {
-		dev_err(&pdev->dev, "missing platform data\n");
+	struct mc13xxx *mcdev = dev_get_drvdata(pdev->dev.parent);
+	struct mc13xxx_led_devtype *devtype =
+		(struct mc13xxx_led_devtype *)pdev->id_entry->driver_data;
+	struct mc13xxx_leds *leds;
+	int i, id, num_leds, ret;
+	u32 reg, init_led = 0;
+
+	if (!pdata) {
+		dev_err(&pdev->dev, "Missing platform data\n");
 		return -ENODEV;
 	}
 
-	if (pdata->num_leds < 1 || pdata->num_leds > (MC13783_LED_MAX + 1)) {
-		dev_err(&pdev->dev, "Invalid led count %d\n", pdata->num_leds);
+	num_leds = pdata->num_leds;
+
+	if ((num_leds < 1) ||
+	    (num_leds > (devtype->led_max - devtype->led_min + 1))) {
+		dev_err(&pdev->dev, "Invalid LED count %d\n", num_leds);
 		return -EINVAL;
 	}
 
-	led = devm_kzalloc(&pdev->dev, pdata->num_leds * sizeof(*led),
-				GFP_KERNEL);
-	if (led == NULL) {
-		dev_err(&pdev->dev, "failed to alloc memory\n");
+	leds = devm_kzalloc(&pdev->dev, num_leds * sizeof(struct mc13xxx_led) +
+			    sizeof(struct mc13xxx_leds), GFP_KERNEL);
+	if (!leds)
 		return -ENOMEM;
+
+	leds->devtype = devtype;
+	leds->num_leds = num_leds;
+	platform_set_drvdata(pdev, leds);
+
+	mc13xxx_lock(mcdev);
+	for (i = 0; i < devtype->num_regs; i++) {
+		reg = pdata->led_control[i];
+		WARN_ON(reg >= (1 << 24));
+		ret = mc13xxx_reg_write(mcdev, MC13XXX_REG_LED_CONTROL(i), reg);
+		if (ret)
+			break;
 	}
+	mc13xxx_unlock(mcdev);
 
-	ret = mc13783_leds_prepare(pdev);
 	if (ret) {
-		dev_err(&pdev->dev, "unable to init led driver\n");
+		dev_err(&pdev->dev, "Unable to init LED driver\n");
 		return ret;
 	}
 
-	for (i = 0; i < pdata->num_leds; i++) {
-		led_dat = &led[i];
-		led_cur = &pdata->led[i];
+	for (i = 0; i < num_leds; i++) {
+		const char *name, *trig;
+		char max_current;
 
-		if (led_cur->id > MC13783_LED_MAX || led_cur->id < 0) {
-			dev_err(&pdev->dev, "invalid id %d\n", led_cur->id);
-			ret = -EINVAL;
-			goto err_register;
+		ret = -EINVAL;
+
+		id = pdata->led[i].id;
+		name = pdata->led[i].name;
+		trig = pdata->led[i].default_trigger;
+		max_current = pdata->led[i].max_current;
+
+		if ((id > devtype->led_max) || (id < devtype->led_min)) {
+			dev_err(&pdev->dev, "Invalid ID %i\n", id);
+			break;
 		}
 
-		if (init_led & (1 << led_cur->id)) {
-			dev_err(&pdev->dev, "led %d already initialized\n",
-					led_cur->id);
-			ret = -EINVAL;
-			goto err_register;
+		if (init_led & (1 << id)) {
+			dev_warn(&pdev->dev,
+				 "LED %i already initialized\n", id);
+			break;
 		}
 
-		init_led |= 1 << led_cur->id;
-		led_dat->cdev.name = led_cur->name;
-		led_dat->cdev.default_trigger = led_cur->default_trigger;
-		led_dat->cdev.brightness_set = mc13783_led_set;
-		led_dat->cdev.brightness = LED_OFF;
-		led_dat->id = led_cur->id;
-		led_dat->master = dev_get_drvdata(pdev->dev.parent);
+		init_led |= 1 << id;
+		leds->led[i].id = id;
+		leds->led[i].master = mcdev;
+		leds->led[i].cdev.name = name;
+		leds->led[i].cdev.default_trigger = trig;
+		leds->led[i].cdev.brightness_set = mc13xxx_led_set;
+		leds->led[i].cdev.brightness = LED_OFF;
 
-		INIT_WORK(&led_dat->work, mc13783_led_work);
+		INIT_WORK(&leds->led[i].work, mc13xxx_led_work);
 
-		ret = led_classdev_register(pdev->dev.parent, &led_dat->cdev);
+		ret = mc13xxx_led_setup(&leds->led[i], max_current);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to register led %d\n",
-					led_dat->id);
-			goto err_register;
+			dev_err(&pdev->dev, "Unable to setup LED %i\n", id);
+			break;
 		}
-
-		ret = mc13783_led_setup(led_dat, led_cur->max_current);
+		ret = led_classdev_register(pdev->dev.parent,
+					    &leds->led[i].cdev);
 		if (ret) {
-			dev_err(&pdev->dev, "unable to init led %d\n",
-					led_dat->id);
-			i++;
-			goto err_register;
+			dev_err(&pdev->dev, "Failed to register LED %i\n", id);
+			break;
 		}
 	}
 
-	platform_set_drvdata(pdev, led);
-	return 0;
-
-err_register:
-	for (i = i - 1; i >= 0; i--) {
-		led_classdev_unregister(&led[i].cdev);
-		cancel_work_sync(&led[i].work);
-	}
+	if (ret)
+		while (--i >= 0) {
+			led_classdev_unregister(&leds->led[i].cdev);
+			cancel_work_sync(&leds->led[i].work);
+		}
 
 	return ret;
 }
 
-static int mc13783_led_remove(struct platform_device *pdev)
+static int mc13xxx_led_remove(struct platform_device *pdev)
 {
-	struct mc13xxx_leds_platform_data *pdata = dev_get_platdata(&pdev->dev);
-	struct mc13783_led *led = platform_get_drvdata(pdev);
-	struct mc13xxx *dev = dev_get_drvdata(pdev->dev.parent);
+	struct mc13xxx *mcdev = dev_get_drvdata(pdev->dev.parent);
+	struct mc13xxx_leds *leds = platform_get_drvdata(pdev);
 	int i;
 
-	for (i = 0; i < pdata->num_leds; i++) {
-		led_classdev_unregister(&led[i].cdev);
-		cancel_work_sync(&led[i].work);
+	for (i = 0; i < leds->num_leds; i++) {
+		led_classdev_unregister(&leds->led[i].cdev);
+		cancel_work_sync(&leds->led[i].work);
 	}
 
-	mc13xxx_lock(dev);
-
-	mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_0, 0);
-	mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_1, 0);
-	mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_2, 0);
-	mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_3, 0);
-	mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_4, 0);
-	mc13xxx_reg_write(dev, MC13783_REG_LED_CONTROL_5, 0);
-
-	mc13xxx_unlock(dev);
+	mc13xxx_lock(mcdev);
+	for (i = 0; i < leds->devtype->num_regs; i++)
+		mc13xxx_reg_write(mcdev, MC13XXX_REG_LED_CONTROL(i), 0);
+	mc13xxx_unlock(mcdev);
 
 	return 0;
 }
 
-static struct platform_driver mc13783_led_driver = {
+static const struct mc13xxx_led_devtype mc13783_led_devtype = {
+	.led_min	= MC13783_LED_MD,
+	.led_max	= MC13783_LED_B3,
+	.num_regs	= 6,
+};
+
+static const struct platform_device_id mc13xxx_led_id_table[] = {
+	{ "mc13783-led", (kernel_ulong_t)&mc13783_led_devtype, },
+	{ }
+};
+MODULE_DEVICE_TABLE(platform, mc13xxx_led_id_table);
+
+static struct platform_driver mc13xxx_led_driver = {
 	.driver	= {
-		.name	= "mc13783-led",
+		.name	= "mc13xxx-led",
 		.owner	= THIS_MODULE,
 	},
-	.probe		= mc13783_led_probe,
-	.remove		= mc13783_led_remove,
+	.remove		= mc13xxx_led_remove,
+	.id_table	= mc13xxx_led_id_table,
 };
+module_platform_driver_probe(mc13xxx_led_driver, mc13xxx_led_probe);
 
-module_platform_driver(mc13783_led_driver);
-
-MODULE_DESCRIPTION("LEDs driver for Freescale MC13783 PMIC");
+MODULE_DESCRIPTION("LEDs driver for Freescale MC13XXX PMIC");
 MODULE_AUTHOR("Philippe Retornaz <philippe.retornaz@epfl.ch>");
 MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:mc13783-led");
diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h
index bf070755982e..ee280f16350c 100644
--- a/include/linux/mfd/mc13xxx.h
+++ b/include/linux/mfd/mc13xxx.h
@@ -78,20 +78,23 @@ struct mc13xxx_regulator_platform_data {
 	struct mc13xxx_regulator_init_data *regulators;
 };
 
+enum {
+	/* MC13783 LED IDs */
+	MC13783_LED_MD,
+	MC13783_LED_AD,
+	MC13783_LED_KP,
+	MC13783_LED_R1,
+	MC13783_LED_G1,
+	MC13783_LED_B1,
+	MC13783_LED_R2,
+	MC13783_LED_G2,
+	MC13783_LED_B2,
+	MC13783_LED_R3,
+	MC13783_LED_G3,
+	MC13783_LED_B3,
+};
+
 struct mc13xxx_led_platform_data {
-#define MC13783_LED_MD		0
-#define MC13783_LED_AD		1
-#define MC13783_LED_KP		2
-#define MC13783_LED_R1		3
-#define MC13783_LED_G1		4
-#define MC13783_LED_B1		5
-#define MC13783_LED_R2		6
-#define MC13783_LED_G2		7
-#define MC13783_LED_B2		8
-#define MC13783_LED_R3		9
-#define MC13783_LED_G3		10
-#define MC13783_LED_B3		11
-#define MC13783_LED_MAX MC13783_LED_B3
 	int id;
 	const char *name;
 	const char *default_trigger;
@@ -100,46 +103,36 @@ struct mc13xxx_led_platform_data {
 	char max_current;
 };
 
+#define MAX_LED_CONTROL_REGS	6
+
 struct mc13xxx_leds_platform_data {
-	int num_leds;
 	struct mc13xxx_led_platform_data *led;
+	int num_leds;
 
-#define MC13783_LED_TRIODE_MD	(1 << 0)
-#define MC13783_LED_TRIODE_AD	(1 << 1)
-#define MC13783_LED_TRIODE_KP	(1 << 2)
-#define MC13783_LED_BOOST_EN	(1 << 3)
-#define MC13783_LED_TC1HALF	(1 << 4)
-#define MC13783_LED_SLEWLIMTC	(1 << 5)
-#define MC13783_LED_SLEWLIMBL	(1 << 6)
-#define MC13783_LED_TRIODE_TC1	(1 << 7)
-#define MC13783_LED_TRIODE_TC2	(1 << 8)
-#define MC13783_LED_TRIODE_TC3	(1 << 9)
-	int flags;
-
-#define MC13783_LED_AB_DISABLED		0
-#define MC13783_LED_AB_MD1		1
-#define MC13783_LED_AB_MD12		2
-#define MC13783_LED_AB_MD123		3
-#define MC13783_LED_AB_MD1234		4
-#define MC13783_LED_AB_MD1234_AD1	5
-#define MC13783_LED_AB_MD1234_AD12	6
-#define MC13783_LED_AB_MD1_AD		7
-	char abmode;
-
-#define MC13783_LED_ABREF_200MV	0
-#define MC13783_LED_ABREF_400MV	1
-#define MC13783_LED_ABREF_600MV	2
-#define MC13783_LED_ABREF_800MV	3
-	char abref;
-
-#define MC13783_LED_PERIOD_10MS		0
-#define MC13783_LED_PERIOD_100MS	1
-#define MC13783_LED_PERIOD_500MS	2
-#define MC13783_LED_PERIOD_2S		3
-	char bl_period;
-	char tc1_period;
-	char tc2_period;
-	char tc3_period;
+/* LED Control 0 */
+#define MC13783_LED_C0_ENABLE		(1 << 0)
+#define MC13783_LED_C0_TRIODE_MD	(1 << 7)
+#define MC13783_LED_C0_TRIODE_AD	(1 << 8)
+#define MC13783_LED_C0_TRIODE_KP	(1 << 9)
+#define MC13783_LED_C0_BOOST		(1 << 10)
+#define MC13783_LED_C0_ABMODE(x)	(((x) & 0x7) << 11)
+#define MC13783_LED_C0_ABREF(x)		(((x) & 0x3) << 14)
+/* LED Control 1 */
+#define MC13783_LED_C1_TC1HALF		(1 << 18)
+#define MC13783_LED_C1_SLEWLIM		(1 << 23)
+/* LED Control 2 */
+#define MC13783_LED_C2_PERIOD(x)	(((x) & 0x3) << 21)
+#define MC13783_LED_C2_SLEWLIM		(1 << 23)
+/* LED Control 3 */
+#define MC13783_LED_C3_PERIOD(x)	(((x) & 0x3) << 21)
+#define MC13783_LED_C3_TRIODE_TC1	(1 << 23)
+/* LED Control 4 */
+#define MC13783_LED_C4_PERIOD(x)	(((x) & 0x3) << 21)
+#define MC13783_LED_C4_TRIODE_TC2	(1 << 23)
+/* LED Control 5 */
+#define MC13783_LED_C5_PERIOD(x)	(((x) & 0x3) << 21)
+#define MC13783_LED_C5_TRIODE_TC3	(1 << 23)
+	u32 led_control[MAX_LED_CONTROL_REGS];
 };
 
 struct mc13xxx_buttons_platform_data {
-- 
cgit v1.3-7-g2ca7


From ae6cdb03ef1d352c489d6c86e0bcec51365a2c64 Mon Sep 17 00:00:00 2001
From: Alexander Shiyan <shc_work@mail.ru>
Date: Mon, 10 Jun 2013 09:59:31 -0700
Subject: leds: leds-mc13783: Add MC13892 LED support

Signed-off-by: Alexander Shiyan <shc_work@mail.ru>
Tested-by: Philippe Retornaz <philippe.retornaz@epfl.ch>
Signed-off-by: Bryan Wu <cooloney@gmail.com>
---
 drivers/leds/Kconfig        |  6 ++---
 drivers/leds/leds-mc13783.c | 60 ++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/mfd/mc13xxx.h |  7 ++++++
 3 files changed, 69 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index ef992293598a..e43402dd1dea 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -388,12 +388,12 @@ config LEDS_DELL_NETBOOKS
 	  notebooks that have an external LED.
 
 config LEDS_MC13783
-	tristate "LED Support for MC13783 PMIC"
+	tristate "LED Support for MC13XXX PMIC"
 	depends on LEDS_CLASS
-	depends on MFD_MC13783
+	depends on MFD_MC13XXX
 	help
 	  This option enable support for on-chip LED drivers found
-	  on Freescale Semiconductor MC13783 PMIC.
+	  on Freescale Semiconductor MC13783/MC13892 PMIC.
 
 config LEDS_NS2
 	tristate "LED support for Network Space v2 GPIO LEDs"
diff --git a/drivers/leds/leds-mc13783.c b/drivers/leds/leds-mc13783.c
index da8ec244a641..f4de98052aaa 100644
--- a/drivers/leds/leds-mc13783.c
+++ b/drivers/leds/leds-mc13783.c
@@ -1,5 +1,5 @@
 /*
- * LEDs driver for Freescale MC13783
+ * LEDs driver for Freescale MC13783/MC13892
  *
  * Copyright (C) 2010 Philippe Rétornaz
  *
@@ -85,6 +85,34 @@ static void mc13xxx_led_work(struct work_struct *work)
 		value = led->new_brightness >> 3;
 		mask = 0x1f;
 		break;
+	case MC13892_LED_MD:
+		reg = MC13XXX_REG_LED_CONTROL(0);
+		shift = 3;
+		mask = 0x3f;
+		value = led->new_brightness >> 2;
+		break;
+	case MC13892_LED_AD:
+		reg = MC13XXX_REG_LED_CONTROL(0);
+		shift = 15;
+		mask = 0x3f;
+		value = led->new_brightness >> 2;
+		break;
+	case MC13892_LED_KP:
+		reg = MC13XXX_REG_LED_CONTROL(1);
+		shift = 3;
+		mask = 0x3f;
+		value = led->new_brightness >> 2;
+		break;
+	case MC13892_LED_R:
+	case MC13892_LED_G:
+	case MC13892_LED_B:
+		off = led->id - MC13892_LED_R;
+		bank = off / 2;
+		reg = MC13XXX_REG_LED_CONTROL(2) + bank;
+		shift = (off - bank * 2) * 12 + 3;
+		value = led->new_brightness >> 2;
+		mask = 0x3f;
+		break;
 	default:
 		BUG();
 	}
@@ -138,6 +166,29 @@ static int __init mc13xxx_led_setup(struct mc13xxx_led *led, int max_current)
 		shift = ((led->id - MC13783_LED_R1) - bank * 3) * 2;
 		mask = 0x03;
 		break;
+	case MC13892_LED_MD:
+		reg = MC13XXX_REG_LED_CONTROL(0);
+		shift = 9;
+		mask = 0x07;
+		break;
+	case MC13892_LED_AD:
+		reg = MC13XXX_REG_LED_CONTROL(0);
+		shift = 21;
+		mask = 0x07;
+		break;
+	case MC13892_LED_KP:
+		reg = MC13XXX_REG_LED_CONTROL(1);
+		shift = 9;
+		mask = 0x07;
+		break;
+	case MC13892_LED_R:
+	case MC13892_LED_G:
+	case MC13892_LED_B:
+		bank = (led->id - MC13892_LED_R) / 2;
+		reg = MC13XXX_REG_LED_CONTROL(2) + bank;
+		shift = ((led->id - MC13892_LED_R) - bank * 2) * 12 + 9;
+		mask = 0x07;
+		break;
 	default:
 		BUG();
 	}
@@ -276,8 +327,15 @@ static const struct mc13xxx_led_devtype mc13783_led_devtype = {
 	.num_regs	= 6,
 };
 
+static const struct mc13xxx_led_devtype mc13892_led_devtype = {
+	.led_min	= MC13892_LED_MD,
+	.led_max	= MC13892_LED_B,
+	.num_regs	= 4,
+};
+
 static const struct platform_device_id mc13xxx_led_id_table[] = {
 	{ "mc13783-led", (kernel_ulong_t)&mc13783_led_devtype, },
+	{ "mc13892-led", (kernel_ulong_t)&mc13892_led_devtype, },
 	{ }
 };
 MODULE_DEVICE_TABLE(platform, mc13xxx_led_id_table);
diff --git a/include/linux/mfd/mc13xxx.h b/include/linux/mfd/mc13xxx.h
index ee280f16350c..41ed59276c00 100644
--- a/include/linux/mfd/mc13xxx.h
+++ b/include/linux/mfd/mc13xxx.h
@@ -92,6 +92,13 @@ enum {
 	MC13783_LED_R3,
 	MC13783_LED_G3,
 	MC13783_LED_B3,
+	/* MC13892 LED IDs */
+	MC13892_LED_MD,
+	MC13892_LED_AD,
+	MC13892_LED_KP,
+	MC13892_LED_R,
+	MC13892_LED_G,
+	MC13892_LED_B,
 };
 
 struct mc13xxx_led_platform_data {
-- 
cgit v1.3-7-g2ca7


From f5abaa1bfc3dbf26d19d3513f39279ca369f8d65 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Thu, 20 Jun 2013 11:44:44 -0400
Subject: tracing: Add DEFINE_EVENT_FN() macro

Each TRACE_EVENT() adds several helper functions. If two or more trace events
share the same structure and print format, they can also share most of these
helper functions and save a lot of space from duplicate code. This is why the
DECLARE_EVENT_CLASS() and DEFINE_EVENT() were created.

Some events require a trigger to be called at registering and unregistering of
the event and to do so they use TRACE_EVENT_FN().

If multiple events require a trigger, they currently have no choice but to use
TRACE_EVENT_FN() as there's no DEFINE_EVENT_FN() available. This unfortunately
causes a lot of wasted duplicate code created.

By adding a DEFINE_EVENT_FN(), these events can still use a
DECLARE_EVENT_CLASS() and then define their own triggers.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/51C3236C.8030508@hds.com
Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 include/linux/tracepoint.h   | 2 ++
 include/trace/define_trace.h | 5 +++++
 include/trace/ftrace.h       | 4 ++++
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index f8e084d0fc77..ebeab360d851 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -378,6 +378,8 @@ static inline void tracepoint_synchronize_unregister(void)
 #define DECLARE_EVENT_CLASS(name, proto, args, tstruct, assign, print)
 #define DEFINE_EVENT(template, name, proto, args)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)\
+	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 #define DEFINE_EVENT_CONDITION(template, name, proto,		\
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 1905ca8dd399..02e1003568a4 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -44,6 +44,10 @@
 #define DEFINE_EVENT(template, name, proto, args) \
 	DEFINE_TRACE(name)
 
+#undef DEFINE_EVENT_FN
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg) \
+	DEFINE_TRACE_FN(name, reg, unreg)
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DEFINE_TRACE(name)
@@ -91,6 +95,7 @@
 #undef TRACE_EVENT_CONDITION
 #undef DECLARE_EVENT_CLASS
 #undef DEFINE_EVENT
+#undef DEFINE_EVENT_FN
 #undef DEFINE_EVENT_PRINT
 #undef DEFINE_EVENT_CONDITION
 #undef TRACE_HEADER_MULTI_READ
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 19edd7facaa1..d615f78cc6b6 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -71,6 +71,10 @@
 	static struct ftrace_event_call	__used		\
 	__attribute__((__aligned__(4))) event_##name
 
+#undef DEFINE_EVENT_FN
+#define DEFINE_EVENT_FN(template, name, proto, args, reg, unreg)	\
+	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, name, proto, args, print)	\
 	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
-- 
cgit v1.3-7-g2ca7


From 7064f6bd86278029348c36d30bd325e7e05b6fee Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 21 Jun 2013 22:32:26 +0200
Subject: clk: tegra: provide tegra_periph_reset_assert alternative

We have some tegra device drivers that are written to be platform
independent but still use the tegra specific tegra_periph_reset_assert
function. In order to build and link them without errors,
this provides a static inline version of these functions that
does nothing when Tegra support is disabled.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Stephen Warren <swarren@nvidia.com>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
[mturquette@linaro.org: fixed up trivial merge issue]
---
 include/linux/clk/tegra.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index 3670a4f5402b..e3cc8721c30e 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -120,8 +120,13 @@ static inline void tegra_cpu_clock_resume(void)
 }
 #endif
 
+#ifdef ARCH_TEGRA
 void tegra_periph_reset_deassert(struct clk *c);
 void tegra_periph_reset_assert(struct clk *c);
+#else
+static inline void tegra_periph_reset_deassert(struct clk *c) {}
+static inline void tegra_periph_reset_assert(struct clk *c) {}
+#endif
 void tegra_clocks_apply_init_table(void);
 
 #endif /* __LINUX_CLK_TEGRA_H_ */
-- 
cgit v1.3-7-g2ca7


From 14c63f17b1fde5a575a28e96547a22b451c71fb5 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Fri, 21 Jun 2013 08:51:36 -0700
Subject: perf: Drop sample rate when sampling is too slow

This patch keeps track of how long perf's NMI handler is taking,
and also calculates how many samples perf can take a second.  If
the sample length times the expected max number of samples
exceeds a configurable threshold, it drops the sample rate.

This way, we don't have a runaway sampling process eating up the
CPU.

This patch can tend to drop the sample rate down to level where
perf doesn't work very well.  *BUT* the alternative is that my
system hangs because it spends all of its time handling NMIs.

I'll take a busted performance tool over an entire system that's
busted and undebuggable any day.

BTW, my suspicion is that there's still an underlying bug here.
Using the HPET instead of the TSC is definitely a contributing
factor, but I suspect there are some other things going on.
But, I can't go dig down on a bug like that with my machine
hanging all the time.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org
Cc: acme@ghostprotocols.net
Cc: Dave Hansen <dave@sr71.net>
[ Prettified it a bit. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/sysctl/kernel.txt  | 26 ++++++++++++
 arch/x86/kernel/cpu/perf_event.c | 12 +++++-
 include/linux/perf_event.h       |  7 +++
 kernel/events/core.c             | 92 ++++++++++++++++++++++++++++++++++++++--
 kernel/sysctl.c                  |  9 ++++
 5 files changed, 141 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index bcff3f9de550..ab7d16efa96b 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -427,6 +427,32 @@ This file shows up if CONFIG_DEBUG_STACKOVERFLOW is enabled.
 
 ==============================================================
 
+perf_cpu_time_max_percent:
+
+Hints to the kernel how much CPU time it should be allowed to
+use to handle perf sampling events.  If the perf subsystem
+is informed that its samples are exceeding this limit, it
+will drop its sampling frequency to attempt to reduce its CPU
+usage.
+
+Some perf sampling happens in NMIs.  If these samples
+unexpectedly take too long to execute, the NMIs can become
+stacked up next to each other so much that nothing else is
+allowed to execute.
+
+0: disable the mechanism.  Do not monitor or correct perf's
+   sampling rate no matter how CPU time it takes.
+
+1-100: attempt to throttle perf's sample rate to this
+   percentage of CPU.  Note: the kernel calculates an
+   "expected" length of each sample event.  100 here means
+   100% of that expected length.  Even if this is set to
+   100, you may still see sample throttling if this
+   length is exceeded.  Set to 0 if you truly do not care
+   how much CPU is consumed.
+
+==============================================================
+
 
 pid_max:
 
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ab3395295224..afc2413ba00c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1252,10 +1252,20 @@ void perf_events_lapic_init(void)
 static int __kprobes
 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 {
+	int ret;
+	u64 start_clock;
+	u64 finish_clock;
+
 	if (!atomic_read(&active_events))
 		return NMI_DONE;
 
-	return x86_pmu.handle_irq(regs);
+	start_clock = local_clock();
+	ret = x86_pmu.handle_irq(regs);
+	finish_clock = local_clock();
+
+	perf_sample_event_took(finish_clock - start_clock);
+
+	return ret;
 }
 
 struct event_constraint emptyconstraint;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 056f93a7990f..50b3efd14d29 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -706,10 +706,17 @@ static inline void perf_callchain_store(struct perf_callchain_entry *entry, u64
 extern int sysctl_perf_event_paranoid;
 extern int sysctl_perf_event_mlock;
 extern int sysctl_perf_event_sample_rate;
+extern int sysctl_perf_cpu_time_max_percent;
+
+extern void perf_sample_event_took(u64 sample_len_ns);
 
 extern int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
+extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9c8920783317..1db3af933704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,26 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
 /*
  * max perf event sample rate
  */
-#define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
-	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_MAX_SAMPLE_RATE		100000
+#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT	25
+
+int sysctl_perf_event_sample_rate __read_mostly	= DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly	= DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly	= DEFAULT_SAMPLE_PERIOD_NS;
+
+static atomic_t perf_sample_allowed_ns __read_mostly =
+	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+
+void update_perf_cpu_limits(void)
+{
+	u64 tmp = perf_sample_period_ns;
+
+	tmp *= sysctl_perf_cpu_time_max_percent;
+	tmp = do_div(tmp, 100);
+	atomic_set(&perf_sample_allowed_ns, tmp);
+}
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 
@@ -182,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		return ret;
 
 	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+	update_perf_cpu_limits();
 
 	return 0;
 }
 
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	update_perf_cpu_limits();
+
+	return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done.  This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+DEFINE_PER_CPU(u64, running_sample_length);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+	u64 avg_local_sample_len;
+	u64 local_samples_len = __get_cpu_var(running_sample_length);
+
+	if (atomic_read(&perf_sample_allowed_ns) == 0)
+		return;
+
+	/* decay the counter by 1 average sample */
+	local_samples_len = __get_cpu_var(running_sample_length);
+	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+	local_samples_len += sample_len_ns;
+	__get_cpu_var(running_sample_length) = local_samples_len;
+
+	/*
+	 * note: this will be biased artifically low until we have
+	 * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
+	 * from having to maintain a count.
+	 */
+	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+		return;
+
+	if (max_samples_per_tick <= 1)
+		return;
+
+	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+	printk_ratelimited(KERN_WARNING
+			"perf samples too long (%lld > %d), lowering "
+			"kernel.perf_event_max_sample_rate to %d\n",
+			avg_local_sample_len,
+			atomic_read(&perf_sample_allowed_ns),
+			sysctl_perf_event_sample_rate);
+
+	update_perf_cpu_limits();
+}
+
 static atomic64_t perf_event_id;
 
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b0a1f99907f3..4ce13c3cedb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1043,6 +1043,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= perf_proc_update_handler,
 	},
+	{
+		.procname	= "perf_cpu_time_max_percent",
+		.data		= &sysctl_perf_cpu_time_max_percent,
+		.maxlen		= sizeof(sysctl_perf_cpu_time_max_percent),
+		.mode		= 0644,
+		.proc_handler	= perf_cpu_time_max_percent_handler,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{
-- 
cgit v1.3-7-g2ca7


From fa223ec37c9f6710022381a25c352955675817f9 Mon Sep 17 00:00:00 2001
From: Peter Ujfalusi <peter.ujfalusi@ti.com>
Date: Mon, 24 Jun 2013 15:42:04 +0200
Subject: mfd: twl6040: Update register bit definitions

Add define for: HSDRV, HFDAC, HFPGA and HFDRV enable bits

Signed-off-by: Peter Ujfalusi <peter.ujfalusi@ti.com>
CC: Samuel Ortiz <sameo@linux.intel.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 include/linux/mfd/twl6040.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mfd/twl6040.h b/include/linux/mfd/twl6040.h
index 94ac944d12f0..7e7fbce7a308 100644
--- a/include/linux/mfd/twl6040.h
+++ b/include/linux/mfd/twl6040.h
@@ -125,8 +125,15 @@
 
 #define TWL6040_HSDACENA		(1 << 0)
 #define TWL6040_HSDACMODE		(1 << 1)
+#define TWL6040_HSDRVENA		(1 << 2)
 #define TWL6040_HSDRVMODE		(1 << 3)
 
+/* HFLCTL/R (0x14/0x16) fields */
+
+#define TWL6040_HFDACENA		(1 << 0)
+#define TWL6040_HFPGAENA		(1 << 1)
+#define TWL6040_HFDRVENA		(1 << 4)
+
 /* VIBCTLL/R (0x18/0x1A) fields */
 
 #define TWL6040_VIBENA			(1 << 0)
-- 
cgit v1.3-7-g2ca7


From 45e3ec3784aec0d194740b75b547bfabca448ff3 Mon Sep 17 00:00:00 2001
From: Stephen Warren <swarren@nvidia.com>
Date: Mon, 24 Jun 2013 13:05:56 -0600
Subject: clk: tegra: fix ifdef for tegra_periph_reset_assert inline

Commit 7064f6b "clk: tegra: provide tegra_periph_reset_assert
alternative" added ifdef'd static inline versions of some functions,
but tested ARCH_TEGRA rather than CONFIG_ARCH_TEGRA, thus disabling
these function in all cases. In some cases, this caused HW modules to
misbehave; for example, the Tegra I2C driver BUG()d during boot on
Seaboard.

Reported-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Stephen Warren <swarren@nvidia.com>
Tested-by: Paul Walmsley <pwalmsley@nvidia.com>
Signed-off-by: Mike Turquette <mturquette@linaro.org>
---
 include/linux/clk/tegra.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h
index e3cc8721c30e..23a0ceee831f 100644
--- a/include/linux/clk/tegra.h
+++ b/include/linux/clk/tegra.h
@@ -120,7 +120,7 @@ static inline void tegra_cpu_clock_resume(void)
 }
 #endif
 
-#ifdef ARCH_TEGRA
+#ifdef CONFIG_ARCH_TEGRA
 void tegra_periph_reset_deassert(struct clk *c);
 void tegra_periph_reset_assert(struct clk *c);
 #else
-- 
cgit v1.3-7-g2ca7


From 02c402d98588bdfd3bebd267db574e13afdef722 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 24 Jun 2013 15:21:47 -0700
Subject: cgroup: convert CFTYPE_* flags to enums

Purely cosmetic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6c2ba52fc5d4..ab27001a2c4a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -385,9 +385,11 @@ struct cgroup_map_cb {
  */
 
 /* cftype->flags */
-#define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
-#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
-#define CFTYPE_INSANE		(1U << 2)	/* don't create if sane_behavior */
+enum {
+	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cg */
+	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cg */
+	CFTYPE_INSANE		= (1 << 2),	/* don't create if sane_behavior */
+};
 
 #define MAX_CFTYPE_NAME		64
 
-- 
cgit v1.3-7-g2ca7


From a8a648c4acee2095262f7fa65b0d8a68a03c32e4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 24 Jun 2013 15:21:47 -0700
Subject: cgroup: remove cgroup->actual_subsys_mask

cgroup curiously has two subsystem masks, ->subsys_mask and
->actual_subsys_mask.  The latter only exists because the new target
subsys_mask is passed into rebind_subsystems() via @root>subsys_mask.
rebind_subsystems() needs to know what the current mask is to decide
how to reach the target mask so ->actual_subsys_mask is used as the
temp location to remember the current state.

Adding a temporary field to a permanent data structure is rather silly
and can be misleading.  Update rebind_subsystems() to take @added_mask
and @removed_mask instead and remove @root->actual_subsys_mask.

This patch shouldn't introduce any behavior changes.

v2: Comment and description updated as suggested by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h |  8 +-------
 kernel/cgroup.c        | 22 ++++++++++++----------
 2 files changed, 13 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ab27001a2c4a..4c1eceb8c439 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -286,18 +286,12 @@ enum {
 struct cgroupfs_root {
 	struct super_block *sb;
 
-	/*
-	 * The bitmask of subsystems intended to be attached to this
-	 * hierarchy
-	 */
+	/* The bitmask of subsystems attached to this hierarchy */
 	unsigned long subsys_mask;
 
 	/* Unique id for this hierarchy. */
 	int hierarchy_id;
 
-	/* The bitmask of subsystems currently attached to this hierarchy */
-	unsigned long actual_subsys_mask;
-
 	/* A list running through the attached subsystems */
 	struct list_head subsys_list;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8f296b83b6a3..67fc953c816a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -986,17 +986,14 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
  * returns an error, no reference counts are touched.
  */
 static int rebind_subsystems(struct cgroupfs_root *root,
-			      unsigned long final_subsys_mask)
+			     unsigned long added_mask, unsigned removed_mask)
 {
-	unsigned long added_mask, removed_mask;
 	struct cgroup *cgrp = &root->top_cgroup;
 	int i;
 
 	BUG_ON(!mutex_is_locked(&cgroup_mutex));
 	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
 
-	removed_mask = root->actual_subsys_mask & ~final_subsys_mask;
-	added_mask = final_subsys_mask & ~root->actual_subsys_mask;
 	/* Check that any added subsystems are currently free */
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		unsigned long bit = 1UL << i;
@@ -1032,27 +1029,33 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 			BUG_ON(!cgroup_dummy_top->subsys[i]);
 			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);
+
 			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
 			cgrp->subsys[i]->cgroup = cgrp;
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
 				ss->bind(cgrp);
+
 			/* refcount was already taken, and we're keeping it */
+			root->subsys_mask |= bit;
 		} else if (bit & removed_mask) {
 			/* We're removing this subsystem */
 			BUG_ON(ss == NULL);
 			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+
 			if (ss->bind)
 				ss->bind(cgroup_dummy_top);
 			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
 			cgrp->subsys[i] = NULL;
 			cgroup_subsys[i]->root = &cgroup_dummy_root;
 			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);
+
 			/* subsystem is now free - drop reference on module */
 			module_put(ss->module);
-		} else if (bit & final_subsys_mask) {
+			root->subsys_mask &= ~bit;
+		} else if (bit & root->subsys_mask) {
 			/* Subsystem state should already exist */
 			BUG_ON(ss == NULL);
 			BUG_ON(!cgrp->subsys[i]);
@@ -1069,7 +1072,6 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]);
 		}
 	}
-	root->subsys_mask = root->actual_subsys_mask = final_subsys_mask;
 
 	return 0;
 }
@@ -1343,7 +1345,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
-	if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent)
+	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
 		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
 			   task_tgid_nr(current), current->comm);
 
@@ -1365,7 +1367,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	 */
 	cgroup_clear_directory(cgrp->dentry, false, removed_mask);
 
-	ret = rebind_subsystems(root, opts.subsys_mask);
+	ret = rebind_subsystems(root, added_mask, removed_mask);
 	if (ret) {
 		/* rebind_subsystems failed, re-populate the removed files */
 		cgroup_populate_dir(cgrp, false, removed_mask);
@@ -1634,7 +1636,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		if (ret)
 			goto unlock_drop;
 
-		ret = rebind_subsystems(root, root->subsys_mask);
+		ret = rebind_subsystems(root, root->subsys_mask, 0);
 		if (ret == -EBUSY) {
 			free_cgrp_cset_links(&tmp_links);
 			goto unlock_drop;
@@ -1727,7 +1729,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	mutex_lock(&cgroup_root_mutex);
 
 	/* Rebind all subsystems back to the default hierarchy */
-	ret = rebind_subsystems(root, 0);
+	ret = rebind_subsystems(root, 0, root->subsys_mask);
 	/* Shouldn't be able to fail ... */
 	BUG_ON(ret);
 
-- 
cgit v1.3-7-g2ca7


From 1f6236bfd7c38d5f9f7648fae7215e65274b9e63 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Date: Fri, 14 Jun 2013 18:40:43 +0200
Subject: genirq: Add irq_get_trigger_type() to get IRQ flags

Drivers that want to get the trigger edge/level type flags for a given
interrupt have to call irq_get_irq_data(irq) to get the struct
irq_data and then irqd_get_trigger_type(irq_data) to obtain the IRQ
flags.

This is not only error prone but also unnecessary exposes the struct
irq_data to callers.

It's better to have an irq_get_trigger_type() function to obtain the
edge/level flags for an IRQ.

Signed-off-by: Javier Martinez Canillas <javier.martinez@collabora.co.uk>
Acked-by: Grant Likely <grant.likely@linaro.org>
Cc: Linus Walleij <linus.walleij@linaro.org>
Cc: Samuel Ortiz <sameo@linux.intel.com>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Andrew Lunn <andrew@lunn.ch>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-mips@linux-mips.org
Link: http://lkml.kernel.org/r/1371228049-27080-2-git-send-email-javier.martinez@collabora.co.uk
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irq.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 298a9b9ce675..f04d3ba335cb 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -584,6 +584,12 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d)
 	return d->msi_desc;
 }
 
+static inline u32 irq_get_trigger_type(unsigned int irq)
+{
+	struct irq_data *d = irq_get_irq_data(irq);
+	return d ? irqd_get_trigger_type(d) : 0;
+}
+
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
 		struct module *owner);
 
-- 
cgit v1.3-7-g2ca7


From 80b022e29bfdffb6c9ac0a283bcad3e1030c4c7e Mon Sep 17 00:00:00 2001
From: Jonghwa Lee <jonghwa3.lee@samsung.com>
Date: Tue, 25 Jun 2013 10:08:38 +0900
Subject: regulator: max77693: Add max77693 regualtor driver.

This patch adds new regulator driver to support max77693 chip's regulators.
max77693 has two linear voltage regulators and one current regulator which
can be controlled through I2C bus. This driver also supports device tree.

Signed-off-by: Jonghwa Lee <jonghwa3.lee@samsung.com>
Signed-off-by: Myungjoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Mark Brown <broonie@linaro.org>
---
 Documentation/devicetree/bindings/mfd/max77693.txt |  55 ++++
 drivers/regulator/Kconfig                          |   9 +
 drivers/regulator/Makefile                         |   1 +
 drivers/regulator/max77693.c                       | 324 +++++++++++++++++++++
 include/linux/mfd/max77693-private.h               |  13 +
 include/linux/mfd/max77693.h                       |  18 ++
 6 files changed, 420 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mfd/max77693.txt
 create mode 100644 drivers/regulator/max77693.c

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/mfd/max77693.txt b/Documentation/devicetree/bindings/mfd/max77693.txt
new file mode 100644
index 000000000000..11921cc417bf
--- /dev/null
+++ b/Documentation/devicetree/bindings/mfd/max77693.txt
@@ -0,0 +1,55 @@
+Maxim MAX77693 multi-function device
+
+MAX77693 is a Multifunction device with the following submodules:
+- PMIC,
+- CHARGER,
+- LED,
+- MUIC,
+- HAPTIC
+
+It is interfaced to host controller using i2c.
+This document describes the bindings for the mfd device.
+
+Required properties:
+- compatible : Must be "maxim,max77693".
+- reg : Specifies the i2c slave address of PMIC block.
+- interrupts : This i2c device has an IRQ line connected to the main SoC.
+- interrupt-parent :  The parent interrupt controller.
+
+Optional properties:
+- regulators : The regulators of max77693 have to be instantiated under subnod
+  named "regulators" using the following format.
+
+	regulators {
+		regualtor-compatible = ESAFEOUT1/ESAFEOUT2/CHARGER
+		standard regulator constratints[*].
+	};
+
+	[*] refer Documentation/devicetree/bindings/regulator/regulator.txt
+
+Example:
+	max77693@66 {
+		compatible = "maxim,max77693";
+		reg = <0x66>;
+		interrupt-parent = <&gpx1>;
+		interrupts = <5 2>;
+
+		regulators {
+			esafeout@1 {
+				regulator-compatible = "ESAFEOUT1";
+				regulator-name = "ESAFEOUT1";
+				regulator-boot-on;
+			};
+			esafeout@2 {
+				regulator-compatible = "ESAFEOUT2";
+				regulator-name = "ESAFEOUT2";
+				};
+			charger@0 {
+				regulator-compatible = "CHARGER";
+				regulator-name = "CHARGER";
+				regulator-min-microamp = <60000>;
+				regulator-max-microamp = <2580000>;
+					regulator-boot-on;
+			};
+		};
+	};
diff --git a/drivers/regulator/Kconfig b/drivers/regulator/Kconfig
index 8bb26446037e..9744437afe32 100644
--- a/drivers/regulator/Kconfig
+++ b/drivers/regulator/Kconfig
@@ -250,6 +250,15 @@ config REGULATOR_MAX77686
 	  via I2C bus. The provided regulator is suitable for
 	  Exynos-4 chips to control VARM and VINT voltages.
 
+config REGULATOR_MAX77693
+	tristate "Maxim MAX77693 regulator"
+	depends on MFD_MAX77693
+	help
+	  This driver controls a Maxim 77693 regulator via I2C bus.
+	  The regulators include two LDOs, 'SAFEOUT1', 'SAFEOUT2'
+	  and one current regulator 'CHARGER'. This is suitable for
+	  Exynos-4x12 chips.
+
 config REGULATOR_PCAP
 	tristate "Motorola PCAP2 regulator driver"
 	depends on EZX_PCAP
diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile
index 47a34ff88f98..a4094cd274be 100644
--- a/drivers/regulator/Makefile
+++ b/drivers/regulator/Makefile
@@ -41,6 +41,7 @@ obj-$(CONFIG_REGULATOR_MAX8973) += max8973-regulator.o
 obj-$(CONFIG_REGULATOR_MAX8997) += max8997.o
 obj-$(CONFIG_REGULATOR_MAX8998) += max8998.o
 obj-$(CONFIG_REGULATOR_MAX77686) += max77686.o
+obj-$(CONFIG_REGULATOR_MAX77693) += max77693.o
 obj-$(CONFIG_REGULATOR_MC13783) += mc13783-regulator.o
 obj-$(CONFIG_REGULATOR_MC13892) += mc13892-regulator.o
 obj-$(CONFIG_REGULATOR_MC13XXX_CORE) +=  mc13xxx-regulator-core.o
diff --git a/drivers/regulator/max77693.c b/drivers/regulator/max77693.c
new file mode 100644
index 000000000000..674ece7c832b
--- /dev/null
+++ b/drivers/regulator/max77693.c
@@ -0,0 +1,324 @@
+/*
+ * max77693.c - Regulator driver for the Maxim 77693
+ *
+ * Copyright (C) 2013 Samsung Electronics
+ * Jonghwa Lee <jonghwa3.lee@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * This driver is based on max77686.c
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/regulator/driver.h>
+#include <linux/regulator/machine.h>
+#include <linux/mfd/max77693.h>
+#include <linux/mfd/max77693-private.h>
+#include <linux/regulator/of_regulator.h>
+
+#define CHGIN_ILIM_STEP_20mA			20000
+
+struct max77693_pmic_dev {
+	struct device *dev;
+	struct max77693_dev *iodev;
+	int num_regulators;
+	struct regulator_dev **rdev;
+};
+
+/* CHARGER regulator ops */
+/* CHARGER regulator uses two bits for enabling */
+static int max77693_chg_is_enabled(struct regulator_dev *rdev)
+{
+	int ret;
+	u8 val;
+
+	ret = max77693_read_reg(rdev->regmap, rdev->desc->enable_reg, &val);
+	if (ret)
+		return ret;
+
+	return (val & rdev->desc->enable_mask) == rdev->desc->enable_mask;
+}
+
+/*
+ * CHARGER regulator - Min : 20mA, Max : 2580mA, step : 20mA
+ * 0x00, 0x01, 0x2, 0x03	= 60 mA
+ * 0x04 ~ 0x7E			= (60 + (X - 3) * 20) mA
+ */
+static int max77693_chg_get_current_limit(struct regulator_dev *rdev)
+{
+	unsigned int chg_min_uA = rdev->constraints->min_uA;
+	unsigned int chg_max_uA = rdev->constraints->max_uA;
+	u8 reg, sel;
+	unsigned int val;
+	int ret;
+
+	ret = max77693_read_reg(rdev->regmap,
+				MAX77693_CHG_REG_CHG_CNFG_09, &reg);
+	if (ret < 0)
+		return ret;
+
+	sel = reg & CHG_CNFG_09_CHGIN_ILIM_MASK;
+
+	/* the first four codes for charger current are all 60mA */
+	if (sel <= 3)
+		sel = 0;
+	else
+		sel -= 3;
+
+	val = chg_min_uA + CHGIN_ILIM_STEP_20mA * sel;
+	if (val > chg_max_uA)
+		return -EINVAL;
+
+	return val;
+}
+
+static int max77693_chg_set_current_limit(struct regulator_dev *rdev,
+						int min_uA, int max_uA)
+{
+	unsigned int chg_min_uA = rdev->constraints->min_uA;
+	int sel = 0;
+
+	while (chg_min_uA + CHGIN_ILIM_STEP_20mA * sel < min_uA)
+		sel++;
+
+	if (chg_min_uA * CHGIN_ILIM_STEP_20mA * sel > max_uA)
+		return -EINVAL;
+
+	/* the first four codes for charger current are all 60mA */
+	sel += 3;
+
+	return max77693_write_reg(rdev->regmap,
+				MAX77693_CHG_REG_CHG_CNFG_09, sel);
+}
+/* end of CHARGER regulator ops */
+
+static const unsigned int max77693_safeout_table[] = {
+	4850000,
+	4900000,
+	4950000,
+	3300000,
+};
+
+static struct regulator_ops max77693_safeout_ops = {
+	.list_voltage		= regulator_list_voltage_table,
+	.is_enabled		= regulator_is_enabled_regmap,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.get_voltage_sel	= regulator_get_voltage_sel_regmap,
+	.set_voltage_sel	= regulator_set_voltage_sel_regmap,
+};
+
+static struct regulator_ops max77693_charger_ops = {
+	.is_enabled		= max77693_chg_is_enabled,
+	.enable			= regulator_enable_regmap,
+	.disable		= regulator_disable_regmap,
+	.get_current_limit	= max77693_chg_get_current_limit,
+	.set_current_limit	= max77693_chg_set_current_limit,
+};
+
+#define regulator_desc_esafeout(_num)	{			\
+	.name		= "ESAFEOUT"#_num,			\
+	.id		= MAX77693_ESAFEOUT##_num,		\
+	.n_voltages	= 4,					\
+	.ops		= &max77693_safeout_ops,		\
+	.type		= REGULATOR_VOLTAGE,			\
+	.volt_table	= max77693_safeout_table,		\
+	.vsel_reg	= MAX77693_CHG_REG_SAFEOUT_CTRL,	\
+	.vsel_mask	= SAFEOUT_CTRL_SAFEOUT##_num##_MASK,	\
+	.enable_reg	= MAX77693_CHG_REG_SAFEOUT_CTRL,	\
+	.enable_mask	= SAFEOUT_CTRL_ENSAFEOUT##_num##_MASK ,	\
+}
+
+static struct regulator_desc regulators[] = {
+	regulator_desc_esafeout(1),
+	regulator_desc_esafeout(2),
+	{
+		.name = "CHARGER",
+		.id = MAX77693_CHARGER,
+		.ops = &max77693_charger_ops,
+		.type = REGULATOR_CURRENT,
+		.owner = THIS_MODULE,
+		.enable_reg = MAX77693_CHG_REG_CHG_CNFG_00,
+		.enable_mask = CHG_CNFG_00_CHG_MASK |
+				CHG_CNFG_00_BUCK_MASK,
+	},
+};
+
+#ifdef CONFIG_OF
+static int max77693_pmic_dt_parse_rdata(struct device *dev,
+					struct max77693_regulator_data **rdata)
+{
+	struct device_node *np;
+	struct of_regulator_match *rmatch;
+	struct max77693_regulator_data *tmp;
+	int i, matched = 0;
+
+	np = of_find_node_by_name(dev->parent->of_node, "regulators");
+	if (!np)
+		return -EINVAL;
+
+	rmatch = devm_kzalloc(dev,
+		 sizeof(*rmatch) * ARRAY_SIZE(regulators), GFP_KERNEL);
+	if (!rmatch)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(regulators); i++)
+		rmatch[i].name = regulators[i].name;
+
+	matched = of_regulator_match(dev, np, rmatch, ARRAY_SIZE(regulators));
+	if (matched <= 0)
+		return matched;
+	*rdata = devm_kzalloc(dev, sizeof(**rdata) * matched, GFP_KERNEL);
+	if (!(*rdata))
+		return -ENOMEM;
+
+	tmp = *rdata;
+
+	for (i = 0; i < ARRAY_SIZE(regulators); i++) {
+		if (!rmatch[i].init_data)
+			continue;
+		tmp->initdata = rmatch[i].init_data;
+		tmp->of_node = rmatch[i].of_node;
+		tmp->id = regulators[i].id;
+		tmp++;
+	}
+
+	return matched;
+}
+#else
+static int max77693_pmic_dt_parse_rdata(struct device *dev,
+					struct max77693_regulator_data **rdata)
+{
+	return 0;
+}
+#endif /* CONFIG_OF */
+
+static int max77693_pmic_init_rdata(struct device *dev,
+				    struct max77693_regulator_data **rdata)
+{
+	struct max77693_platform_data *pdata;
+	int num_regulators = 0;
+
+	pdata = dev_get_platdata(dev->parent);
+	if (pdata) {
+		*rdata = pdata->regulators;
+		num_regulators = pdata->num_regulators;
+	}
+
+	if (!(*rdata) && dev->parent->of_node)
+		num_regulators = max77693_pmic_dt_parse_rdata(dev, rdata);
+
+	return num_regulators;
+}
+
+static int max77693_pmic_probe(struct platform_device *pdev)
+{
+	struct max77693_dev *iodev = dev_get_drvdata(pdev->dev.parent);
+	struct max77693_pmic_dev *max77693_pmic;
+	struct max77693_regulator_data *rdata = NULL;
+	int num_rdata, i, ret;
+	struct regulator_config config;
+
+	num_rdata = max77693_pmic_init_rdata(&pdev->dev, &rdata);
+	if (!rdata || num_rdata <= 0) {
+		dev_err(&pdev->dev, "No init data supplied.\n");
+		return -ENODEV;
+	}
+
+	max77693_pmic = devm_kzalloc(&pdev->dev,
+				sizeof(struct max77693_pmic_dev),
+				GFP_KERNEL);
+	if (!max77693_pmic)
+		return -ENOMEM;
+
+	max77693_pmic->rdev = devm_kzalloc(&pdev->dev,
+				sizeof(struct regulator_dev *) * num_rdata,
+				GFP_KERNEL);
+	if (!max77693_pmic->rdev)
+		return -ENOMEM;
+
+	max77693_pmic->dev = &pdev->dev;
+	max77693_pmic->iodev = iodev;
+	max77693_pmic->num_regulators = num_rdata;
+
+	config.dev = &pdev->dev;
+	config.regmap = iodev->regmap;
+	config.driver_data = max77693_pmic;
+	platform_set_drvdata(pdev, max77693_pmic);
+
+	for (i = 0; i < max77693_pmic->num_regulators; i++) {
+		int id = rdata[i].id;
+
+		config.init_data = rdata[i].initdata;
+		config.of_node = rdata[i].of_node;
+
+		max77693_pmic->rdev[i] = regulator_register(&regulators[id],
+							    &config);
+		if (IS_ERR(max77693_pmic->rdev[i])) {
+			ret = PTR_ERR(max77693_pmic->rdev[i]);
+			dev_err(max77693_pmic->dev,
+				"Failed to initialize regulator-%d\n", id);
+			max77693_pmic->rdev[i] = NULL;
+			goto err;
+		}
+	}
+
+	return 0;
+ err:
+	while (--i >= 0)
+		regulator_unregister(max77693_pmic->rdev[i]);
+
+	return ret;
+}
+
+static int max77693_pmic_remove(struct platform_device *pdev)
+{
+	struct max77693_pmic_dev *max77693_pmic = platform_get_drvdata(pdev);
+	struct regulator_dev **rdev = max77693_pmic->rdev;
+	int i;
+
+	for (i = 0; i < max77693_pmic->num_regulators; i++)
+		if (rdev[i])
+			regulator_unregister(rdev[i]);
+
+	return 0;
+}
+
+static const struct platform_device_id max77693_pmic_id[] = {
+	{"max77693-pmic", 0},
+	{},
+};
+
+MODULE_DEVICE_TABLE(platform, max77693_pmic_id);
+
+static struct platform_driver max77693_pmic_driver = {
+	.driver = {
+		   .name = "max77693-pmic",
+		   .owner = THIS_MODULE,
+		   },
+	.probe = max77693_pmic_probe,
+	.remove = max77693_pmic_remove,
+	.id_table = max77693_pmic_id,
+};
+
+module_platform_driver(max77693_pmic_driver);
+
+MODULE_DESCRIPTION("MAXIM MAX77693 regulator driver");
+MODULE_AUTHOR("Jonghwa Lee <jonghwa3.lee@samsung.com>");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/mfd/max77693-private.h b/include/linux/mfd/max77693-private.h
index 1aa4f13cdfa6..244fb0d51589 100644
--- a/include/linux/mfd/max77693-private.h
+++ b/include/linux/mfd/max77693-private.h
@@ -85,6 +85,19 @@ enum max77693_pmic_reg {
 	MAX77693_PMIC_REG_END,
 };
 
+/* MAX77693 CHG_CNFG_00 register */
+#define CHG_CNFG_00_CHG_MASK		0x1
+#define CHG_CNFG_00_BUCK_MASK		0x4
+
+/* MAX77693 CHG_CNFG_09 Register */
+#define CHG_CNFG_09_CHGIN_ILIM_MASK	0x7F
+
+/* MAX77693 CHG_CTRL Register */
+#define SAFEOUT_CTRL_SAFEOUT1_MASK	0x3
+#define SAFEOUT_CTRL_SAFEOUT2_MASK	0xC
+#define SAFEOUT_CTRL_ENSAFEOUT1_MASK	0x40
+#define SAFEOUT_CTRL_ENSAFEOUT2_MASK	0x80
+
 /* Slave addr = 0x4A: MUIC */
 enum max77693_muic_reg {
 	MAX77693_MUIC_REG_ID		= 0x00,
diff --git a/include/linux/mfd/max77693.h b/include/linux/mfd/max77693.h
index 3109a6c5c948..676f0f388992 100644
--- a/include/linux/mfd/max77693.h
+++ b/include/linux/mfd/max77693.h
@@ -30,6 +30,20 @@
 #ifndef __LINUX_MFD_MAX77693_H
 #define __LINUX_MFD_MAX77693_H
 
+/* MAX77686 regulator IDs */
+enum max77693_regulators {
+	MAX77693_ESAFEOUT1 = 0,
+	MAX77693_ESAFEOUT2,
+	MAX77693_CHARGER,
+	MAX77693_REG_MAX,
+};
+
+struct max77693_regulator_data {
+	int id;
+	struct regulator_init_data *initdata;
+	struct device_node *of_node;
+};
+
 struct max77693_reg_data {
 	u8 addr;
 	u8 data;
@@ -52,6 +66,10 @@ struct max77693_muic_platform_data {
 struct max77693_platform_data {
 	int wakeup;
 
+	/* regulator data */
+	struct max77693_regulator_data *regulators;
+	int num_regulators;
+
 	/* muic data */
 	struct max77693_muic_platform_data *muic_data;
 };
-- 
cgit v1.3-7-g2ca7


From ad42fc6c84795d19972e7f7dee70fe74bec4c2d8 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Mon, 24 Jun 2013 15:06:19 +0200
Subject: pinctrl: rip out the direct pinconf API

From the inception ot the pin config API there has been the
possibility to get a handle at a pin directly and configure
its electrical characteristics. For this reason we had:

int pin_config_get(const char *dev_name, const char *name,
               unsigned long *config);
int pin_config_set(const char *dev_name, const char *name,
               unsigned long config);
int pin_config_group_get(const char *dev_name,
               const char *pin_group,
               unsigned long *config);
int pin_config_group_set(const char *dev_name,
               const char *pin_group,
               unsigned long config);

After the introduction of the pin control states that will
control pins associated with devices, and its subsequent
introduction to the device core, as well as the
introduction of pin control hogs that can set up states on
boot and optionally also at sleep, this direct pin control
API is a thing of the past.

As could be expected, it has zero in-kernel users.
Let's delete this API and make our world simpler.

Reported-by: Tony Lindgren <tony@atomide.com>
Reviewed-by: Stephen Warren <swarren@nvidia.com>
Acked-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/pinctrl.txt        |  11 +--
 drivers/pinctrl/pinconf.c        | 174 ---------------------------------------
 include/linux/pinctrl/consumer.h |  43 ----------
 3 files changed, 2 insertions(+), 226 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/pinctrl.txt b/Documentation/pinctrl.txt
index 3ee24759be0f..c5948c7d662a 100644
--- a/Documentation/pinctrl.txt
+++ b/Documentation/pinctrl.txt
@@ -203,15 +203,8 @@ using a certain resistor value - pull up and pull down - so that the pin has a
 stable value when nothing is driving the rail it is connected to, or when it's
 unconnected.
 
-Pin configuration can be programmed either using the explicit APIs described
-immediately below, or by adding configuration entries into the mapping table;
-see section "Board/machine configuration" below.
-
-For example, a platform may do the following to pull up a pin to VDD:
-
-#include <linux/pinctrl/consumer.h>
-
-ret = pin_config_set("foo-dev", "FOO_GPIO_PIN", PLATFORM_X_PULL_UP);
+Pin configuration can be programmed by adding configuration entries into the
+mapping table; see section "Board/machine configuration" below.
 
 The format and meaning of the configuration parameter, PLATFORM_X_PULL_UP
 above, is entirely defined by the pin controller driver.
diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c
index 694c3ace4520..e875f21a5908 100644
--- a/drivers/pinctrl/pinconf.c
+++ b/drivers/pinctrl/pinconf.c
@@ -75,98 +75,6 @@ int pin_config_get_for_pin(struct pinctrl_dev *pctldev, unsigned pin,
 	return ops->pin_config_get(pctldev, pin, config);
 }
 
-/**
- * pin_config_get() - get the configuration of a single pin parameter
- * @dev_name: name of the pin controller device for this pin
- * @name: name of the pin to get the config for
- * @config: the config pointed to by this argument will be filled in with the
- *	current pin state, it can be used directly by drivers as a numeral, or
- *	it can be dereferenced to any struct.
- */
-int pin_config_get(const char *dev_name, const char *name,
-			  unsigned long *config)
-{
-	struct pinctrl_dev *pctldev;
-	int pin;
-
-	pctldev = get_pinctrl_dev_from_devname(dev_name);
-	if (!pctldev) {
-		pin = -EINVAL;
-		return pin;
-	}
-
-	mutex_lock(&pctldev->mutex);
-
-	pin = pin_get_from_name(pctldev, name);
-	if (pin < 0)
-		goto unlock;
-
-	pin = pin_config_get_for_pin(pctldev, pin, config);
-
-unlock:
-	mutex_unlock(&pctldev->mutex);
-	return pin;
-}
-EXPORT_SYMBOL(pin_config_get);
-
-static int pin_config_set_for_pin(struct pinctrl_dev *pctldev, unsigned pin,
-			   unsigned long config)
-{
-	const struct pinconf_ops *ops = pctldev->desc->confops;
-	int ret;
-
-	if (!ops || !ops->pin_config_set) {
-		dev_err(pctldev->dev, "cannot configure pin, missing "
-			"config function in driver\n");
-		return -EINVAL;
-	}
-
-	ret = ops->pin_config_set(pctldev, pin, config);
-	if (ret) {
-		dev_err(pctldev->dev,
-			"unable to set pin configuration on pin %d\n", pin);
-		return ret;
-	}
-
-	return 0;
-}
-
-/**
- * pin_config_set() - set the configuration of a single pin parameter
- * @dev_name: name of pin controller device for this pin
- * @name: name of the pin to set the config for
- * @config: the config in this argument will contain the desired pin state, it
- *	can be used directly by drivers as a numeral, or it can be dereferenced
- *	to any struct.
- */
-int pin_config_set(const char *dev_name, const char *name,
-		   unsigned long config)
-{
-	struct pinctrl_dev *pctldev;
-	int pin, ret;
-
-	pctldev = get_pinctrl_dev_from_devname(dev_name);
-	if (!pctldev) {
-		ret = -EINVAL;
-		return ret;
-	}
-
-	mutex_lock(&pctldev->mutex);
-
-	pin = pin_get_from_name(pctldev, name);
-	if (pin < 0) {
-		ret = pin;
-		goto unlock;
-	}
-
-	ret = pin_config_set_for_pin(pctldev, pin, config);
-
-unlock:
-	mutex_unlock(&pctldev->mutex);
-	return ret;
-}
-EXPORT_SYMBOL(pin_config_set);
-
 int pin_config_group_get(const char *dev_name, const char *pin_group,
 			 unsigned long *config)
 {
@@ -204,88 +112,6 @@ unlock:
 	mutex_unlock(&pctldev->mutex);
 	return ret;
 }
-EXPORT_SYMBOL(pin_config_group_get);
-
-int pin_config_group_set(const char *dev_name, const char *pin_group,
-			 unsigned long config)
-{
-	struct pinctrl_dev *pctldev;
-	const struct pinconf_ops *ops;
-	const struct pinctrl_ops *pctlops;
-	int selector;
-	const unsigned *pins;
-	unsigned num_pins;
-	int ret;
-	int i;
-
-	pctldev = get_pinctrl_dev_from_devname(dev_name);
-	if (!pctldev) {
-		ret = -EINVAL;
-		return ret;
-	}
-
-	mutex_lock(&pctldev->mutex);
-
-	ops = pctldev->desc->confops;
-	pctlops = pctldev->desc->pctlops;
-
-	if (!ops || (!ops->pin_config_group_set && !ops->pin_config_set)) {
-		dev_err(pctldev->dev, "cannot configure pin group, missing "
-			"config function in driver\n");
-		ret = -EINVAL;
-		goto unlock;
-	}
-
-	selector = pinctrl_get_group_selector(pctldev, pin_group);
-	if (selector < 0) {
-		ret = selector;
-		goto unlock;
-	}
-
-	ret = pctlops->get_group_pins(pctldev, selector, &pins, &num_pins);
-	if (ret) {
-		dev_err(pctldev->dev, "cannot configure pin group, error "
-			"getting pins\n");
-		goto unlock;
-	}
-
-	/*
-	 * If the pin controller supports handling entire groups we use that
-	 * capability.
-	 */
-	if (ops->pin_config_group_set) {
-		ret = ops->pin_config_group_set(pctldev, selector, config);
-		/*
-		 * If the pin controller prefer that a certain group be handled
-		 * pin-by-pin as well, it returns -EAGAIN.
-		 */
-		if (ret != -EAGAIN)
-			goto unlock;
-	}
-
-	/*
-	 * If the controller cannot handle entire groups, we configure each pin
-	 * individually.
-	 */
-	if (!ops->pin_config_set) {
-		ret = 0;
-		goto unlock;
-	}
-
-	for (i = 0; i < num_pins; i++) {
-		ret = ops->pin_config_set(pctldev, pins[i], config);
-		if (ret < 0)
-			goto unlock;
-	}
-
-	ret = 0;
-
-unlock:
-	mutex_unlock(&pctldev->mutex);
-
-	return ret;
-}
-EXPORT_SYMBOL(pin_config_group_set);
 
 int pinconf_map_to_setting(struct pinctrl_map const *map,
 			  struct pinctrl_setting *setting)
diff --git a/include/linux/pinctrl/consumer.h b/include/linux/pinctrl/consumer.h
index d071d850d1de..18eccefea06e 100644
--- a/include/linux/pinctrl/consumer.h
+++ b/include/linux/pinctrl/consumer.h
@@ -192,47 +192,4 @@ static inline struct pinctrl * __must_check devm_pinctrl_get_select_default(
 	return devm_pinctrl_get_select(dev, PINCTRL_STATE_DEFAULT);
 }
 
-#ifdef CONFIG_PINCONF
-
-extern int pin_config_get(const char *dev_name, const char *name,
-			  unsigned long *config);
-extern int pin_config_set(const char *dev_name, const char *name,
-			  unsigned long config);
-extern int pin_config_group_get(const char *dev_name,
-				const char *pin_group,
-				unsigned long *config);
-extern int pin_config_group_set(const char *dev_name,
-				const char *pin_group,
-				unsigned long config);
-
-#else
-
-static inline int pin_config_get(const char *dev_name, const char *name,
-				 unsigned long *config)
-{
-	return 0;
-}
-
-static inline int pin_config_set(const char *dev_name, const char *name,
-				 unsigned long config)
-{
-	return 0;
-}
-
-static inline int pin_config_group_get(const char *dev_name,
-				       const char *pin_group,
-				       unsigned long *config)
-{
-	return 0;
-}
-
-static inline int pin_config_group_set(const char *dev_name,
-				       const char *pin_group,
-				       unsigned long config)
-{
-	return 0;
-}
-
-#endif
-
 #endif /* __LINUX_PINCTRL_CONSUMER_H */
-- 
cgit v1.3-7-g2ca7


From 70637a6dcaa6e4cb707d0d50a7ea448f13438786 Mon Sep 17 00:00:00 2001
From: Heiko Stübner <heiko@sntech.de>
Date: Tue, 25 Jun 2013 14:55:42 +0200
Subject: pinctrl: more clarifications for generic pull configs

PULL_PIN_DEFAULT is meant for hardware completely hiding any pull
settings from the driver, so that it's really only possible to turn
the pull on or off, but it not being possible to determine any
pull settings from software.

Also the binding-documentation for the pull arguments did not match
the changes to the expected values.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt | 5 ++---
 include/linux/pinctrl/pinconf-generic.h                        | 5 ++++-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
index 2d730e3dd496..7498bdc00e19 100644
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
@@ -165,9 +165,8 @@ output-high		- set the pin to output mode with high level
 
 Arguments for parameters:
 
-- bias-pull-up, -down and -pin-default take as optional argument 0 to disable
-  the pull, on hardware supporting it the pull strength in Ohm. bias-disable
-  will also disable any active pull.
+- bias-pull-up, -down and -pin-default take as optional argument on hardware
+  supporting it the pull strength in Ohm. bias-disable will disable the pull.
 
 - drive-strength takes as argument the target strength in mA.
 
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 10ad996afee4..48aa4ba7b089 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -41,7 +41,10 @@
  *	impedance to GROUND). If the argument is != 0 pull-down is enabled,
  *	if it is 0, pull-down is total, i.e. the pin is connected to GROUND.
  * @PIN_CONFIG_BIAS_PULL_PIN_DEFAULT: the pin will be pulled up or down based
- *	on embedded knowledge of the controller, like current mux function.
+ *	on embedded knowledge of the controller hardware, like current mux
+ *	function. The pull direction and possibly strength too will normally
+ *	be decided completely inside the hardware block and not be readable
+ *	from the kernel side.
  *	If the argument is != 0 pull up/down is enabled, if it is 0, the
  *	configuration is ignored. The proper way to disable it is to use
  *	@PIN_CONFIG_BIAS_DISABLE.
-- 
cgit v1.3-7-g2ca7


From 256aeb648741bf095e884793862d3dfa6b1c1fb5 Mon Sep 17 00:00:00 2001
From: Heiko Stübner <heiko@sntech.de>
Date: Tue, 25 Jun 2013 14:56:11 +0200
Subject: pinctrl: set unit for debounce time pinconfig to usec

Currently the debounce time pinconfig option uses an unspecified
"time units" unit. As pinconfig options should use SI units and a
real unit is also necessary for generic dt bindings, change it
to usec. Currently no driver is using the generic pinconfig option
for this, so the unit change is safe to do.

Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Reviewed-by: James Hogan <james.hogan@imgtec.com>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt | 3 ++-
 drivers/pinctrl/pinconf-generic.c                              | 2 +-
 include/linux/pinctrl/pinconf-generic.h                        | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
index 7498bdc00e19..788ab09e4c1c 100644
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
@@ -173,7 +173,8 @@ Arguments for parameters:
 - input-schmitt takes as argument the adjustable hysteresis in a
   driver-specific format
 
-- input-debounce takes the debounce time as argument or 0 to disable debouncing
+- input-debounce takes the debounce time in usec as argument
+  or 0 to disable debouncing
 
 - power-source argument is the custom value describing the source to select
 
diff --git a/drivers/pinctrl/pinconf-generic.c b/drivers/pinctrl/pinconf-generic.c
index 794dad7d68d8..2b271d5d90bf 100644
--- a/drivers/pinctrl/pinconf-generic.c
+++ b/drivers/pinctrl/pinconf-generic.c
@@ -49,7 +49,7 @@ static struct pin_config_item conf_items[] = {
 	PCONFDUMP(PIN_CONFIG_DRIVE_STRENGTH, "output drive strength", "mA"),
 	PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT_ENABLE, "input schmitt enabled", NULL),
 	PCONFDUMP(PIN_CONFIG_INPUT_SCHMITT, "input schmitt trigger", NULL),
-	PCONFDUMP(PIN_CONFIG_INPUT_DEBOUNCE, "input debounce", "time units"),
+	PCONFDUMP(PIN_CONFIG_INPUT_DEBOUNCE, "input debounce", "usec"),
 	PCONFDUMP(PIN_CONFIG_POWER_SOURCE, "pin power source", "selector"),
 	PCONFDUMP(PIN_CONFIG_SLEW_RATE, "slew rate", NULL),
 	PCONFDUMP(PIN_CONFIG_LOW_POWER_MODE, "pin low power", "mode"),
diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h
index 48aa4ba7b089..bf7e989abcb5 100644
--- a/include/linux/pinctrl/pinconf-generic.h
+++ b/include/linux/pinctrl/pinconf-generic.h
@@ -70,7 +70,7 @@
  *	setting pins to this mode.
  * @PIN_CONFIG_INPUT_DEBOUNCE: this will configure the pin to debounce mode,
  *	which means it will wait for signals to settle when reading inputs. The
- *	argument gives the debounce time on a custom format. Setting the
+ *	argument gives the debounce time in usecs. Setting the
  *	argument to zero turns debouncing off.
  * @PIN_CONFIG_POWER_SOURCE: if the pin can select between different power
  *	supplies, the argument to this parameter (on a custom format) tells
-- 
cgit v1.3-7-g2ca7


From 13d60f4b6ab5b702dc8d2ee20999f98a93728aec Mon Sep 17 00:00:00 2001
From: Zhang Yi <wetpzy@gmail.com>
Date: Tue, 25 Jun 2013 21:19:31 +0800
Subject: futex: Take hugepages into account when generating futex_key

The futex_keys of process shared futexes are generated from the page
offset, the mapping host and the mapping index of the futex user space
address. This should result in an unique identifier for each futex.

Though this is not true when futexes are located in different subpages
of an hugepage. The reason is, that the mapping index for all those
futexes evaluates to the index of the base page of the hugetlbfs
mapping. So a futex at offset 0 of the hugepage mapping and another
one at offset PAGE_SIZE of the same hugepage mapping have identical
futex_keys. This happens because the futex code blindly uses
page->index.

Steps to reproduce the bug:

1. Map a file from hugetlbfs. Initialize pthread_mutex1 at offset 0
   and pthread_mutex2 at offset PAGE_SIZE of the hugetlbfs
   mapping.

   The mutexes must be initialized as PTHREAD_PROCESS_SHARED because
   PTHREAD_PROCESS_PRIVATE mutexes are not affected by this issue as
   their keys solely depend on the user space address.

2. Lock mutex1 and mutex2

3. Create thread1 and in the thread function lock mutex1, which
   results in thread1 blocking on the locked mutex1.

4. Create thread2 and in the thread function lock mutex2, which
   results in thread2 blocking on the locked mutex2.

5. Unlock mutex2. Despite the fact that mutex2 got unlocked, thread2
   still blocks on mutex2 because the futex_key points to mutex1.

To solve this issue we need to take the normal page index of the page
which contains the futex into account, if the futex is in an hugetlbfs
mapping. In other words, we calculate the normal page mapping index of
the subpage in the hugetlbfs mapping.

Mappings which are not based on hugetlbfs are not affected and still
use page->index.

Thanks to Mel Gorman who provided a patch for adding proper evaluation
functions to the hugetlbfs code to avoid exposing hugetlbfs specific
details to the futex code.

[ tglx: Massaged changelog ]

Signed-off-by: Zhang Yi <zhang.yi20@zte.com.cn>
Reviewed-by: Jiang Biao <jiang.biao2@zte.com.cn>
Tested-by: Ma Chenggong <ma.chenggong@zte.com.cn>
Reviewed-by: 'Mel Gorman' <mgorman@suse.de>
Acked-by: 'Darren Hart' <dvhart@linux.intel.com>
Cc: 'Peter Zijlstra' <peterz@infradead.org>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/000101ce71a6%24a83c5880%24f8b50980%24@com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hugetlb.h | 16 ++++++++++++++++
 kernel/futex.c          |  3 ++-
 mm/hugetlb.c            | 17 +++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b4890fa57e7..feaf0c7fb7d8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -358,6 +358,17 @@ static inline int hstate_index(struct hstate *h)
 	return h - hstates;
 }
 
+pgoff_t __basepage_index(struct page *page);
+
+/* Return page->index in PAGE_SIZE units */
+static inline pgoff_t basepage_index(struct page *page)
+{
+	if (!PageCompound(page))
+		return page->index;
+
+	return __basepage_index(page);
+}
+
 #else	/* CONFIG_HUGETLB_PAGE */
 struct hstate {};
 #define alloc_huge_page_node(h, nid) NULL
@@ -378,6 +389,11 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
 }
 #define hstate_index_to_shift(index) 0
 #define hstate_index(h) 0
+
+static inline pgoff_t basepage_index(struct page *page)
+{
+	return page->index;
+}
 #endif	/* CONFIG_HUGETLB_PAGE */
 
 #endif /* _LINUX_HUGETLB_H */
diff --git a/kernel/futex.c b/kernel/futex.c
index b26dcfc02c94..49dacfb45745 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,6 +61,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ptrace.h>
 #include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
 
 #include <asm/futex.h>
 
@@ -365,7 +366,7 @@ again:
 	} else {
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
 		key->shared.inode = page_head->mapping->host;
-		key->shared.pgoff = page_head->index;
+		key->shared.pgoff = basepage_index(page);
 	}
 
 	get_futex_key_refs(key);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f8feeeca6686..aea87ced4268 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -690,6 +690,23 @@ int PageHuge(struct page *page)
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
+pgoff_t __basepage_index(struct page *page)
+{
+	struct page *page_head = compound_head(page);
+	pgoff_t index = page_index(page_head);
+	unsigned long compound_idx;
+
+	if (!PageHuge(page_head))
+		return page_index(page);
+
+	if (compound_order(page_head) >= MAX_ORDER)
+		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+	else
+		compound_idx = page - page_head;
+
+	return (index << compound_order(page_head)) + compound_idx;
+}
+
 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
-- 
cgit v1.3-7-g2ca7


From 040a0a37100563754bb1fee6ff6427420bcfa609 Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Date: Mon, 24 Jun 2013 10:30:04 +0200
Subject: mutex: Add support for wound/wait style locks

Wound/wait mutexes are used when other multiple lock
acquisitions of a similar type can be done in an arbitrary
order. The deadlock handling used here is called wait/wound in
the RDBMS literature: The older tasks waits until it can acquire
the contended lock. The younger tasks needs to back off and drop
all the locks it is currently holding, i.e. the younger task is
wounded.

For full documentation please read Documentation/ww-mutex-design.txt.

References: https://lwn.net/Articles/548909/
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Acked-by: Rob Clark <robdclark@gmail.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: dri-devel@lists.freedesktop.org
Cc: linaro-mm-sig@lists.linaro.org
Cc: rostedt@goodmis.org
Cc: daniel@ffwll.ch
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/51C8038C.9000106@canonical.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/ww-mutex-design.txt | 344 ++++++++++++++++++++++++++++++++++++
 include/linux/mutex-debug.h       |   1 +
 include/linux/mutex.h             | 355 +++++++++++++++++++++++++++++++++++++-
 kernel/mutex.c                    | 318 ++++++++++++++++++++++++++++++++--
 lib/debug_locks.c                 |   2 +
 5 files changed, 1003 insertions(+), 17 deletions(-)
 create mode 100644 Documentation/ww-mutex-design.txt

(limited to 'include/linux')

diff --git a/Documentation/ww-mutex-design.txt b/Documentation/ww-mutex-design.txt
new file mode 100644
index 000000000000..8a112dc304c3
--- /dev/null
+++ b/Documentation/ww-mutex-design.txt
@@ -0,0 +1,344 @@
+Wait/Wound Deadlock-Proof Mutex Design
+======================================
+
+Please read mutex-design.txt first, as it applies to wait/wound mutexes too.
+
+Motivation for WW-Mutexes
+-------------------------
+
+GPU's do operations that commonly involve many buffers.  Those buffers
+can be shared across contexts/processes, exist in different memory
+domains (for example VRAM vs system memory), and so on.  And with
+PRIME / dmabuf, they can even be shared across devices.  So there are
+a handful of situations where the driver needs to wait for buffers to
+become ready.  If you think about this in terms of waiting on a buffer
+mutex for it to become available, this presents a problem because
+there is no way to guarantee that buffers appear in a execbuf/batch in
+the same order in all contexts.  That is directly under control of
+userspace, and a result of the sequence of GL calls that an application
+makes.	Which results in the potential for deadlock.  The problem gets
+more complex when you consider that the kernel may need to migrate the
+buffer(s) into VRAM before the GPU operates on the buffer(s), which
+may in turn require evicting some other buffers (and you don't want to
+evict other buffers which are already queued up to the GPU), but for a
+simplified understanding of the problem you can ignore this.
+
+The algorithm that the TTM graphics subsystem came up with for dealing with
+this problem is quite simple.  For each group of buffers (execbuf) that need
+to be locked, the caller would be assigned a unique reservation id/ticket,
+from a global counter.  In case of deadlock while locking all the buffers
+associated with a execbuf, the one with the lowest reservation ticket (i.e.
+the oldest task) wins, and the one with the higher reservation id (i.e. the
+younger task) unlocks all of the buffers that it has already locked, and then
+tries again.
+
+In the RDBMS literature this deadlock handling approach is called wait/wound:
+The older tasks waits until it can acquire the contended lock. The younger tasks
+needs to back off and drop all the locks it is currently holding, i.e. the
+younger task is wounded.
+
+Concepts
+--------
+
+Compared to normal mutexes two additional concepts/objects show up in the lock
+interface for w/w mutexes:
+
+Acquire context: To ensure eventual forward progress it is important the a task
+trying to acquire locks doesn't grab a new reservation id, but keeps the one it
+acquired when starting the lock acquisition. This ticket is stored in the
+acquire context. Furthermore the acquire context keeps track of debugging state
+to catch w/w mutex interface abuse.
+
+W/w class: In contrast to normal mutexes the lock class needs to be explicit for
+w/w mutexes, since it is required to initialize the acquire context.
+
+Furthermore there are three different class of w/w lock acquire functions:
+
+* Normal lock acquisition with a context, using ww_mutex_lock.
+
+* Slowpath lock acquisition on the contending lock, used by the wounded task
+  after having dropped all already acquired locks. These functions have the
+  _slow postfix.
+
+  From a simple semantics point-of-view the _slow functions are not strictly
+  required, since simply calling the normal ww_mutex_lock functions on the
+  contending lock (after having dropped all other already acquired locks) will
+  work correctly. After all if no other ww mutex has been acquired yet there's
+  no deadlock potential and hence the ww_mutex_lock call will block and not
+  prematurely return -EDEADLK. The advantage of the _slow functions is in
+  interface safety:
+  - ww_mutex_lock has a __must_check int return type, whereas ww_mutex_lock_slow
+    has a void return type. Note that since ww mutex code needs loops/retries
+    anyway the __must_check doesn't result in spurious warnings, even though the
+    very first lock operation can never fail.
+  - When full debugging is enabled ww_mutex_lock_slow checks that all acquired
+    ww mutex have been released (preventing deadlocks) and makes sure that we
+    block on the contending lock (preventing spinning through the -EDEADLK
+    slowpath until the contended lock can be acquired).
+
+* Functions to only acquire a single w/w mutex, which results in the exact same
+  semantics as a normal mutex. This is done by calling ww_mutex_lock with a NULL
+  context.
+
+  Again this is not strictly required. But often you only want to acquire a
+  single lock in which case it's pointless to set up an acquire context (and so
+  better to avoid grabbing a deadlock avoidance ticket).
+
+Of course, all the usual variants for handling wake-ups due to signals are also
+provided.
+
+Usage
+-----
+
+Three different ways to acquire locks within the same w/w class. Common
+definitions for methods #1 and #2:
+
+static DEFINE_WW_CLASS(ww_class);
+
+struct obj {
+	struct ww_mutex lock;
+	/* obj data */
+};
+
+struct obj_entry {
+	struct list_head head;
+	struct obj *obj;
+};
+
+Method 1, using a list in execbuf->buffers that's not allowed to be reordered.
+This is useful if a list of required objects is already tracked somewhere.
+Furthermore the lock helper can use propagate the -EALREADY return code back to
+the caller as a signal that an object is twice on the list. This is useful if
+the list is constructed from userspace input and the ABI requires userspace to
+not have duplicate entries (e.g. for a gpu commandbuffer submission ioctl).
+
+int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj *res_obj = NULL;
+	struct obj_entry *contended_entry = NULL;
+	struct obj_entry *entry;
+
+	ww_acquire_init(ctx, &ww_class);
+
+retry:
+	list_for_each_entry (entry, list, head) {
+		if (entry->obj == res_obj) {
+			res_obj = NULL;
+			continue;
+		}
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			contended_entry = entry;
+			goto err;
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+
+err:
+	list_for_each_entry_continue_reverse (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	if (res_obj)
+		ww_mutex_unlock(&res_obj->lock);
+
+	if (ret == -EDEADLK) {
+		/* we lost out in a seqno race, lock and retry.. */
+		ww_mutex_lock_slow(&contended_entry->obj->lock, ctx);
+		res_obj = contended_entry->obj;
+		goto retry;
+	}
+	ww_acquire_fini(ctx);
+
+	return ret;
+}
+
+Method 2, using a list in execbuf->buffers that can be reordered. Same semantics
+of duplicate entry detection using -EALREADY as method 1 above. But the
+list-reordering allows for a bit more idiomatic code.
+
+int lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj_entry *entry, *entry2;
+
+	ww_acquire_init(ctx, &ww_class);
+
+	list_for_each_entry (entry, list, head) {
+		ret = ww_mutex_lock(&entry->obj->lock, ctx);
+		if (ret < 0) {
+			entry2 = entry;
+
+			list_for_each_entry_continue_reverse (entry2, list, head)
+				ww_mutex_unlock(&entry2->obj->lock);
+
+			if (ret != -EDEADLK) {
+				ww_acquire_fini(ctx);
+				return ret;
+			}
+
+			/* we lost out in a seqno race, lock and retry.. */
+			ww_mutex_lock_slow(&entry->obj->lock, ctx);
+
+			/*
+			 * Move buf to head of the list, this will point
+			 * buf->next to the first unlocked entry,
+			 * restarting the for loop.
+			 */
+			list_del(&entry->head);
+			list_add(&entry->head, list);
+		}
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+}
+
+Unlocking works the same way for both methods #1 and #2:
+
+void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj_entry *entry;
+
+	list_for_each_entry (entry, list, head)
+		ww_mutex_unlock(&entry->obj->lock);
+
+	ww_acquire_fini(ctx);
+}
+
+Method 3 is useful if the list of objects is constructed ad-hoc and not upfront,
+e.g. when adjusting edges in a graph where each node has its own ww_mutex lock,
+and edges can only be changed when holding the locks of all involved nodes. w/w
+mutexes are a natural fit for such a case for two reasons:
+- They can handle lock-acquisition in any order which allows us to start walking
+  a graph from a starting point and then iteratively discovering new edges and
+  locking down the nodes those edges connect to.
+- Due to the -EALREADY return code signalling that a given objects is already
+  held there's no need for additional book-keeping to break cycles in the graph
+  or keep track off which looks are already held (when using more than one node
+  as a starting point).
+
+Note that this approach differs in two important ways from the above methods:
+- Since the list of objects is dynamically constructed (and might very well be
+  different when retrying due to hitting the -EDEADLK wound condition) there's
+  no need to keep any object on a persistent list when it's not locked. We can
+  therefore move the list_head into the object itself.
+- On the other hand the dynamic object list construction also means that the -EALREADY return
+  code can't be propagated.
+
+Note also that methods #1 and #2 and method #3 can be combined, e.g. to first lock a
+list of starting nodes (passed in from userspace) using one of the above
+methods. And then lock any additional objects affected by the operations using
+method #3 below. The backoff/retry procedure will be a bit more involved, since
+when the dynamic locking step hits -EDEADLK we also need to unlock all the
+objects acquired with the fixed list. But the w/w mutex debug checks will catch
+any interface misuse for these cases.
+
+Also, method 3 can't fail the lock acquisition step since it doesn't return
+-EALREADY. Of course this would be different when using the _interruptible
+variants, but that's outside of the scope of these examples here.
+
+struct obj {
+	struct ww_mutex ww_mutex;
+	struct list_head locked_list;
+};
+
+static DEFINE_WW_CLASS(ww_class);
+
+void __unlock_objs(struct list_head *list)
+{
+	struct obj *entry, *temp;
+
+	list_for_each_entry_safe (entry, temp, list, locked_list) {
+		/* need to do that before unlocking, since only the current lock holder is
+		allowed to use object */
+		list_del(&entry->locked_list);
+		ww_mutex_unlock(entry->ww_mutex)
+	}
+}
+
+void lock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	struct obj *obj;
+
+	ww_acquire_init(ctx, &ww_class);
+
+retry:
+	/* re-init loop start state */
+	loop {
+		/* magic code which walks over a graph and decides which objects
+		 * to lock */
+
+		ret = ww_mutex_lock(obj->ww_mutex, ctx);
+		if (ret == -EALREADY) {
+			/* we have that one already, get to the next object */
+			continue;
+		}
+		if (ret == -EDEADLK) {
+			__unlock_objs(list);
+
+			ww_mutex_lock_slow(obj, ctx);
+			list_add(&entry->locked_list, list);
+			goto retry;
+		}
+
+		/* locked a new object, add it to the list */
+		list_add_tail(&entry->locked_list, list);
+	}
+
+	ww_acquire_done(ctx);
+	return 0;
+}
+
+void unlock_objs(struct list_head *list, struct ww_acquire_ctx *ctx)
+{
+	__unlock_objs(list);
+	ww_acquire_fini(ctx);
+}
+
+Method 4: Only lock one single objects. In that case deadlock detection and
+prevention is obviously overkill, since with grabbing just one lock you can't
+produce a deadlock within just one class. To simplify this case the w/w mutex
+api can be used with a NULL context.
+
+Implementation Details
+----------------------
+
+Design:
+  ww_mutex currently encapsulates a struct mutex, this means no extra overhead for
+  normal mutex locks, which are far more common. As such there is only a small
+  increase in code size if wait/wound mutexes are not used.
+
+  In general, not much contention is expected. The locks are typically used to
+  serialize access to resources for devices. The only way to make wakeups
+  smarter would be at the cost of adding a field to struct mutex_waiter. This
+  would add overhead to all cases where normal mutexes are used, and
+  ww_mutexes are generally less performance sensitive.
+
+Lockdep:
+  Special care has been taken to warn for as many cases of api abuse
+  as possible. Some common api abuses will be caught with
+  CONFIG_DEBUG_MUTEXES, but CONFIG_PROVE_LOCKING is recommended.
+
+  Some of the errors which will be warned about:
+   - Forgetting to call ww_acquire_fini or ww_acquire_init.
+   - Attempting to lock more mutexes after ww_acquire_done.
+   - Attempting to lock the wrong mutex after -EDEADLK and
+     unlocking all mutexes.
+   - Attempting to lock the right mutex after -EDEADLK,
+     before unlocking all mutexes.
+
+   - Calling ww_mutex_lock_slow before -EDEADLK was returned.
+
+   - Unlocking mutexes with the wrong unlock function.
+   - Calling one of the ww_acquire_* twice on the same context.
+   - Using a different ww_class for the mutex than for the ww_acquire_ctx.
+   - Normal lockdep errors that can result in deadlocks.
+
+  Some of the lockdep errors that can result in deadlocks:
+   - Calling ww_acquire_init to initialize a second ww_acquire_ctx before
+     having called ww_acquire_fini on the first.
+   - 'normal' deadlocks that can occur.
+
+FIXME: Update this section once we have the TASK_DEADLOCK task state flag magic
+implemented.
diff --git a/include/linux/mutex-debug.h b/include/linux/mutex-debug.h
index 731d77d6e155..4ac8b1977b73 100644
--- a/include/linux/mutex-debug.h
+++ b/include/linux/mutex-debug.h
@@ -3,6 +3,7 @@
 
 #include <linux/linkage.h>
 #include <linux/lockdep.h>
+#include <linux/debug_locks.h>
 
 /*
  * Mutexes - debugging helpers:
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 433da8a1a426..a56b0ccc8a6c 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -10,6 +10,7 @@
 #ifndef __LINUX_MUTEX_H
 #define __LINUX_MUTEX_H
 
+#include <asm/current.h>
 #include <linux/list.h>
 #include <linux/spinlock_types.h>
 #include <linux/linkage.h>
@@ -77,6 +78,36 @@ struct mutex_waiter {
 #endif
 };
 
+struct ww_class {
+	atomic_long_t stamp;
+	struct lock_class_key acquire_key;
+	struct lock_class_key mutex_key;
+	const char *acquire_name;
+	const char *mutex_name;
+};
+
+struct ww_acquire_ctx {
+	struct task_struct *task;
+	unsigned long stamp;
+	unsigned acquired;
+#ifdef CONFIG_DEBUG_MUTEXES
+	unsigned done_acquire;
+	struct ww_class *ww_class;
+	struct ww_mutex *contending_lock;
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+};
+
+struct ww_mutex {
+	struct mutex base;
+	struct ww_acquire_ctx *ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+	struct ww_class *ww_class;
+#endif
+};
+
 #ifdef CONFIG_DEBUG_MUTEXES
 # include <linux/mutex-debug.h>
 #else
@@ -101,8 +132,11 @@ static inline void mutex_destroy(struct mutex *lock) {}
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
 		, .dep_map = { .name = #lockname }
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class) \
+		, .ww_class = &ww_class
 #else
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname)
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class)
 #endif
 
 #define __MUTEX_INITIALIZER(lockname) \
@@ -112,12 +146,48 @@ static inline void mutex_destroy(struct mutex *lock) {}
 		__DEBUG_MUTEX_INITIALIZER(lockname) \
 		__DEP_MAP_MUTEX_INITIALIZER(lockname) }
 
+#define __WW_CLASS_INITIALIZER(ww_class) \
+		{ .stamp = ATOMIC_LONG_INIT(0) \
+		, .acquire_name = #ww_class "_acquire" \
+		, .mutex_name = #ww_class "_mutex" }
+
+#define __WW_MUTEX_INITIALIZER(lockname, class) \
+		{ .base = { \__MUTEX_INITIALIZER(lockname) } \
+		__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
+
 #define DEFINE_MUTEX(mutexname) \
 	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
 
+#define DEFINE_WW_CLASS(classname) \
+	struct ww_class classname = __WW_CLASS_INITIALIZER(classname)
+
+#define DEFINE_WW_MUTEX(mutexname, ww_class) \
+	struct ww_mutex mutexname = __WW_MUTEX_INITIALIZER(mutexname, ww_class)
+
+
 extern void __mutex_init(struct mutex *lock, const char *name,
 			 struct lock_class_key *key);
 
+/**
+ * ww_mutex_init - initialize the w/w mutex
+ * @lock: the mutex to be initialized
+ * @ww_class: the w/w class the mutex should belong to
+ *
+ * Initialize the w/w mutex to unlocked state and associate it with the given
+ * class.
+ *
+ * It is not allowed to initialize an already locked mutex.
+ */
+static inline void ww_mutex_init(struct ww_mutex *lock,
+				 struct ww_class *ww_class)
+{
+	__mutex_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key);
+	lock->ctx = NULL;
+#ifdef CONFIG_DEBUG_MUTEXES
+	lock->ww_class = ww_class;
+#endif
+}
+
 /**
  * mutex_is_locked - is the mutex locked
  * @lock: the mutex to be queried
@@ -136,6 +206,7 @@ static inline int mutex_is_locked(struct mutex *lock)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
 extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);
+
 extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
 					unsigned int subclass);
 extern int __must_check mutex_lock_killable_nested(struct mutex *lock,
@@ -147,7 +218,7 @@ extern int __must_check mutex_lock_killable_nested(struct mutex *lock,
 
 #define mutex_lock_nest_lock(lock, nest_lock)				\
 do {									\
-	typecheck(struct lockdep_map *, &(nest_lock)->dep_map);		\
+	typecheck(struct lockdep_map *, &(nest_lock)->dep_map);	\
 	_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);		\
 } while (0)
 
@@ -170,6 +241,288 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
  */
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
+
+/**
+ * ww_acquire_init - initialize a w/w acquire context
+ * @ctx: w/w acquire context to initialize
+ * @ww_class: w/w class of the context
+ *
+ * Initializes an context to acquire multiple mutexes of the given w/w class.
+ *
+ * Context-based w/w mutex acquiring can be done in any order whatsoever within
+ * a given lock class. Deadlocks will be detected and handled with the
+ * wait/wound logic.
+ *
+ * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can
+ * result in undetected deadlocks and is so forbidden. Mixing different contexts
+ * for the same w/w class when acquiring mutexes can also result in undetected
+ * deadlocks, and is hence also forbidden. Both types of abuse will be caught by
+ * enabling CONFIG_PROVE_LOCKING.
+ *
+ * Nesting of acquire contexts for _different_ w/w classes is possible, subject
+ * to the usual locking rules between different lock classes.
+ *
+ * An acquire context must be released with ww_acquire_fini by the same task
+ * before the memory is freed. It is recommended to allocate the context itself
+ * on the stack.
+ */
+static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
+				   struct ww_class *ww_class)
+{
+	ctx->task = current;
+	ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
+	ctx->acquired = 0;
+#ifdef CONFIG_DEBUG_MUTEXES
+	ctx->ww_class = ww_class;
+	ctx->done_acquire = 0;
+	ctx->contending_lock = NULL;
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
+	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
+			 &ww_class->acquire_key, 0);
+	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
+#endif
+}
+
+/**
+ * ww_acquire_done - marks the end of the acquire phase
+ * @ctx: the acquire context
+ *
+ * Marks the end of the acquire phase, any further w/w mutex lock calls using
+ * this context are forbidden.
+ *
+ * Calling this function is optional, it is just useful to document w/w mutex
+ * code and clearly designated the acquire phase from actually using the locked
+ * data structures.
+ */
+static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	lockdep_assert_held(ctx);
+
+	DEBUG_LOCKS_WARN_ON(ctx->done_acquire);
+	ctx->done_acquire = 1;
+#endif
+}
+
+/**
+ * ww_acquire_fini - releases a w/w acquire context
+ * @ctx: the acquire context to free
+ *
+ * Releases a w/w acquire context. This must be called _after_ all acquired w/w
+ * mutexes have been released with ww_mutex_unlock.
+ */
+static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	mutex_release(&ctx->dep_map, 0, _THIS_IP_);
+
+	DEBUG_LOCKS_WARN_ON(ctx->acquired);
+	if (!config_enabled(CONFIG_PROVE_LOCKING))
+		/*
+		 * lockdep will normally handle this,
+		 * but fail without anyway
+		 */
+		ctx->done_acquire = 1;
+
+	if (!config_enabled(CONFIG_DEBUG_LOCK_ALLOC))
+		/* ensure ww_acquire_fini will still fail if called twice */
+		ctx->acquired = ~0U;
+#endif
+}
+
+extern int __must_check __ww_mutex_lock(struct ww_mutex *lock,
+					struct ww_acquire_ctx *ctx);
+extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
+						      struct ww_acquire_ctx *ctx);
+
+/**
+ * ww_mutex_lock - acquire the w/w mutex
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context, or NULL to acquire only a single lock.
+ *
+ * Lock the w/w mutex exclusively for this task.
+ *
+ * Deadlocks within a given w/w class of locks are detected and handled with the
+ * wait/wound algorithm. If the lock isn't immediately avaiable this function
+ * will either sleep until it is (wait case). Or it selects the current context
+ * for backing off by returning -EDEADLK (wound case). Trying to acquire the
+ * same lock with the same context twice is also detected and signalled by
+ * returning -EALREADY. Returns 0 if the mutex was successfully acquired.
+ *
+ * In the wound case the caller must release all currently held w/w mutexes for
+ * the given context and then wait for this contending lock to be available by
+ * calling ww_mutex_lock_slow. Alternatively callers can opt to not acquire this
+ * lock and proceed with trying to acquire further w/w mutexes (e.g. when
+ * scanning through lru lists trying to free resources).
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. The task may not exit without first unlocking the mutex. Also,
+ * kernel memory where the mutex resides must not be freed with the mutex still
+ * locked. The mutex must first be initialized (or statically defined) before it
+ * can be locked. memset()-ing the mutex to 0 is not allowed. The mutex must be
+ * of the same w/w lock class as was used to initialize the acquire context.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	if (ctx)
+		return __ww_mutex_lock(lock, ctx);
+	else {
+		mutex_lock(&lock->base);
+		return 0;
+	}
+}
+
+/**
+ * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Lock the w/w mutex exclusively for this task.
+ *
+ * Deadlocks within a given w/w class of locks are detected and handled with the
+ * wait/wound algorithm. If the lock isn't immediately avaiable this function
+ * will either sleep until it is (wait case). Or it selects the current context
+ * for backing off by returning -EDEADLK (wound case). Trying to acquire the
+ * same lock with the same context twice is also detected and signalled by
+ * returning -EALREADY. Returns 0 if the mutex was successfully acquired. If a
+ * signal arrives while waiting for the lock then this function returns -EINTR.
+ *
+ * In the wound case the caller must release all currently held w/w mutexes for
+ * the given context and then wait for this contending lock to be available by
+ * calling ww_mutex_lock_slow_interruptible. Alternatively callers can opt to
+ * not acquire this lock and proceed with trying to acquire further w/w mutexes
+ * (e.g. when scanning through lru lists trying to free resources).
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. The task may not exit without first unlocking the mutex. Also,
+ * kernel memory where the mutex resides must not be freed with the mutex still
+ * locked. The mutex must first be initialized (or statically defined) before it
+ * can be locked. memset()-ing the mutex to 0 is not allowed. The mutex must be
+ * of the same w/w lock class as was used to initialize the acquire context.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
+							   struct ww_acquire_ctx *ctx)
+{
+	if (ctx)
+		return __ww_mutex_lock_interruptible(lock, ctx);
+	else
+		return mutex_lock_interruptible(&lock->base);
+}
+
+/**
+ * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Acquires a w/w mutex with the given context after a wound case. This function
+ * will sleep until the lock becomes available.
+ *
+ * The caller must have released all w/w mutexes already acquired with the
+ * context and then call this function on the contended lock.
+ *
+ * Afterwards the caller may continue to (re)acquire the other w/w mutexes it
+ * needs with ww_mutex_lock. Note that the -EALREADY return code from
+ * ww_mutex_lock can be used to avoid locking this contended mutex twice.
+ *
+ * It is forbidden to call this function with any other w/w mutexes associated
+ * with the context held. It is forbidden to call this on anything else than the
+ * contending mutex.
+ *
+ * Note that the slowpath lock acquiring can also be done by calling
+ * ww_mutex_lock directly. This function here is simply to help w/w mutex
+ * locking code readability by clearly denoting the slowpath.
+ */
+static inline void
+ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	int ret;
+#ifdef CONFIG_DEBUG_MUTEXES
+	DEBUG_LOCKS_WARN_ON(!ctx->contending_lock);
+#endif
+	ret = ww_mutex_lock(lock, ctx);
+	(void)ret;
+}
+
+/**
+ * ww_mutex_lock_slow_interruptible - slowpath acquiring of the w/w mutex,
+ * 				      interruptible
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Acquires a w/w mutex with the given context after a wound case. This function
+ * will sleep until the lock becomes available and returns 0 when the lock has
+ * been acquired. If a signal arrives while waiting for the lock then this
+ * function returns -EINTR.
+ *
+ * The caller must have released all w/w mutexes already acquired with the
+ * context and then call this function on the contended lock.
+ *
+ * Afterwards the caller may continue to (re)acquire the other w/w mutexes it
+ * needs with ww_mutex_lock. Note that the -EALREADY return code from
+ * ww_mutex_lock can be used to avoid locking this contended mutex twice.
+ *
+ * It is forbidden to call this function with any other w/w mutexes associated
+ * with the given context held. It is forbidden to call this on anything else
+ * than the contending mutex.
+ *
+ * Note that the slowpath lock acquiring can also be done by calling
+ * ww_mutex_lock_interruptible directly. This function here is simply to help
+ * w/w mutex locking code readability by clearly denoting the slowpath.
+ */
+static inline int __must_check
+ww_mutex_lock_slow_interruptible(struct ww_mutex *lock,
+				 struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	DEBUG_LOCKS_WARN_ON(!ctx->contending_lock);
+#endif
+	return ww_mutex_lock_interruptible(lock, ctx);
+}
+
+extern void ww_mutex_unlock(struct ww_mutex *lock);
+
+/**
+ * ww_mutex_trylock - tries to acquire the w/w mutex without acquire context
+ * @lock: mutex to lock
+ *
+ * Trylocks a mutex without acquire context, so no deadlock detection is
+ * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
+ */
+static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock)
+{
+	return mutex_trylock(&lock->base);
+}
+
+/***
+ * ww_mutex_destroy - mark a w/w mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+static inline void ww_mutex_destroy(struct ww_mutex *lock)
+{
+	mutex_destroy(&lock->base);
+}
+
+/**
+ * ww_mutex_is_locked - is the w/w mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns 1 if the mutex is locked, 0 if unlocked.
+ */
+static inline bool ww_mutex_is_locked(struct ww_mutex *lock)
+{
+	return mutex_is_locked(&lock->base);
+}
+
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 #ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 42f8dda2467b..fc801aafe8fd 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -254,16 +254,165 @@ void __sched mutex_unlock(struct mutex *lock)
 
 EXPORT_SYMBOL(mutex_unlock);
 
+/**
+ * ww_mutex_unlock - release the w/w mutex
+ * @lock: the mutex to be released
+ *
+ * Unlock a mutex that has been locked by this task previously with any of the
+ * ww_mutex_lock* functions (with or without an acquire context). It is
+ * forbidden to release the locks after releasing the acquire context.
+ *
+ * This function must not be used in interrupt context. Unlocking
+ * of a unlocked mutex is not allowed.
+ */
+void __sched ww_mutex_unlock(struct ww_mutex *lock)
+{
+	/*
+	 * The unlocking fastpath is the 0->1 transition from 'locked'
+	 * into 'unlocked' state:
+	 */
+	if (lock->ctx) {
+#ifdef CONFIG_DEBUG_MUTEXES
+		DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired);
+#endif
+		if (lock->ctx->acquired > 0)
+			lock->ctx->acquired--;
+		lock->ctx = NULL;
+	}
+
+#ifndef CONFIG_DEBUG_MUTEXES
+	/*
+	 * When debugging is enabled we must not clear the owner before time,
+	 * the slow path will always be taken, and that clears the owner field
+	 * after verifying that it was indeed current.
+	 */
+	mutex_clear_owner(&lock->base);
+#endif
+	__mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath);
+}
+EXPORT_SYMBOL(ww_mutex_unlock);
+
+static inline int __sched
+__mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+	struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx);
+
+	if (!hold_ctx)
+		return 0;
+
+	if (unlikely(ctx == hold_ctx))
+		return -EALREADY;
+
+	if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
+	    (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
+#ifdef CONFIG_DEBUG_MUTEXES
+		DEBUG_LOCKS_WARN_ON(ctx->contending_lock);
+		ctx->contending_lock = ww;
+#endif
+		return -EDEADLK;
+	}
+
+	return 0;
+}
+
+static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
+						   struct ww_acquire_ctx *ww_ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	/*
+	 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
+	 * but released with a normal mutex_unlock in this call.
+	 *
+	 * This should never happen, always use ww_mutex_unlock.
+	 */
+	DEBUG_LOCKS_WARN_ON(ww->ctx);
+
+	/*
+	 * Not quite done after calling ww_acquire_done() ?
+	 */
+	DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
+
+	if (ww_ctx->contending_lock) {
+		/*
+		 * After -EDEADLK you tried to
+		 * acquire a different ww_mutex? Bad!
+		 */
+		DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
+
+		/*
+		 * You called ww_mutex_lock after receiving -EDEADLK,
+		 * but 'forgot' to unlock everything else first?
+		 */
+		DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
+		ww_ctx->contending_lock = NULL;
+	}
+
+	/*
+	 * Naughty, using a different class will lead to undefined behavior!
+	 */
+	DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
+#endif
+	ww_ctx->acquired++;
+}
+
+/*
+ * after acquiring lock with fastpath or when we lost out in contested
+ * slowpath, set ctx and wake up any waiters so they can recheck.
+ *
+ * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
+ * as the fastpath and opportunistic spinning are disabled in that case.
+ */
+static __always_inline void
+ww_mutex_set_context_fastpath(struct ww_mutex *lock,
+			       struct ww_acquire_ctx *ctx)
+{
+	unsigned long flags;
+	struct mutex_waiter *cur;
+
+	ww_mutex_lock_acquired(lock, ctx);
+
+	lock->ctx = ctx;
+
+	/*
+	 * The lock->ctx update should be visible on all cores before
+	 * the atomic read is done, otherwise contended waiters might be
+	 * missed. The contended waiters will either see ww_ctx == NULL
+	 * and keep spinning, or it will acquire wait_lock, add itself
+	 * to waiter list and sleep.
+	 */
+	smp_mb(); /* ^^^ */
+
+	/*
+	 * Check if lock is contended, if not there is nobody to wake up
+	 */
+	if (likely(atomic_read(&lock->base.count) == 0))
+		return;
+
+	/*
+	 * Uh oh, we raced in fastpath, wake up everyone in this case,
+	 * so they can see the new lock->ctx.
+	 */
+	spin_lock_mutex(&lock->base.wait_lock, flags);
+	list_for_each_entry(cur, &lock->base.wait_list, list) {
+		debug_mutex_wake_waiter(&lock->base, cur);
+		wake_up_process(cur->task);
+	}
+	spin_unlock_mutex(&lock->base.wait_lock, flags);
+}
+
 /*
  * Lock a mutex (possibly interruptible), slowpath:
  */
-static inline int __sched
+static __always_inline int __sched
 __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
-		    struct lockdep_map *nest_lock, unsigned long ip)
+		    struct lockdep_map *nest_lock, unsigned long ip,
+		    struct ww_acquire_ctx *ww_ctx)
 {
 	struct task_struct *task = current;
 	struct mutex_waiter waiter;
 	unsigned long flags;
+	int ret;
 
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
@@ -298,6 +447,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		struct task_struct *owner;
 		struct mspin_node  node;
 
+		if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
+			struct ww_mutex *ww;
+
+			ww = container_of(lock, struct ww_mutex, base);
+			/*
+			 * If ww->ctx is set the contents are undefined, only
+			 * by acquiring wait_lock there is a guarantee that
+			 * they are not invalid when reading.
+			 *
+			 * As such, when deadlock detection needs to be
+			 * performed the optimistic spinning cannot be done.
+			 */
+			if (ACCESS_ONCE(ww->ctx))
+				break;
+		}
+
 		/*
 		 * If there's an owner, wait for it to either
 		 * release the lock or go to sleep.
@@ -312,6 +477,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 		if ((atomic_read(&lock->count) == 1) &&
 		    (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
 			lock_acquired(&lock->dep_map, ip);
+			if (!__builtin_constant_p(ww_ctx == NULL)) {
+				struct ww_mutex *ww;
+				ww = container_of(lock, struct ww_mutex, base);
+
+				ww_mutex_set_context_fastpath(ww, ww_ctx);
+			}
+
 			mutex_set_owner(lock);
 			mspin_unlock(MLOCK(lock), &node);
 			preempt_enable();
@@ -371,15 +543,16 @@ slowpath:
 		 * TASK_UNINTERRUPTIBLE case.)
 		 */
 		if (unlikely(signal_pending_state(state, task))) {
-			mutex_remove_waiter(lock, &waiter,
-					    task_thread_info(task));
-			mutex_release(&lock->dep_map, 1, ip);
-			spin_unlock_mutex(&lock->wait_lock, flags);
+			ret = -EINTR;
+			goto err;
+		}
 
-			debug_mutex_free_waiter(&waiter);
-			preempt_enable();
-			return -EINTR;
+		if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) {
+			ret = __mutex_lock_check_stamp(lock, ww_ctx);
+			if (ret)
+				goto err;
 		}
+
 		__set_task_state(task, state);
 
 		/* didn't get the lock, go to sleep: */
@@ -394,6 +567,30 @@ done:
 	mutex_remove_waiter(lock, &waiter, current_thread_info());
 	mutex_set_owner(lock);
 
+	if (!__builtin_constant_p(ww_ctx == NULL)) {
+		struct ww_mutex *ww = container_of(lock,
+						      struct ww_mutex,
+						      base);
+		struct mutex_waiter *cur;
+
+		/*
+		 * This branch gets optimized out for the common case,
+		 * and is only important for ww_mutex_lock.
+		 */
+
+		ww_mutex_lock_acquired(ww, ww_ctx);
+		ww->ctx = ww_ctx;
+
+		/*
+		 * Give any possible sleeping processes the chance to wake up,
+		 * so they can recheck if they have to back off.
+		 */
+		list_for_each_entry(cur, &lock->wait_list, list) {
+			debug_mutex_wake_waiter(lock, cur);
+			wake_up_process(cur->task);
+		}
+	}
+
 	/* set it to 0 if there are no waiters left: */
 	if (likely(list_empty(&lock->wait_list)))
 		atomic_set(&lock->count, 0);
@@ -404,6 +601,14 @@ done:
 	preempt_enable();
 
 	return 0;
+
+err:
+	mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+	spin_unlock_mutex(&lock->wait_lock, flags);
+	debug_mutex_free_waiter(&waiter);
+	mutex_release(&lock->dep_map, 1, ip);
+	preempt_enable();
+	return ret;
 }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -411,7 +616,8 @@ void __sched
 mutex_lock_nested(struct mutex *lock, unsigned int subclass)
 {
 	might_sleep();
-	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+			    subclass, NULL, _RET_IP_, NULL);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -420,7 +626,8 @@ void __sched
 _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
 {
 	might_sleep();
-	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
+			    0, nest, _RET_IP_, NULL);
 }
 
 EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -429,7 +636,8 @@ int __sched
 mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
 {
 	might_sleep();
-	return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_KILLABLE,
+				   subclass, NULL, _RET_IP_, NULL);
 }
 EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
 
@@ -438,10 +646,30 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
 {
 	might_sleep();
 	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
-				   subclass, NULL, _RET_IP_);
+				   subclass, NULL, _RET_IP_, NULL);
 }
 
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
+
+
+int __sched
+__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	might_sleep();
+	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
+				   0, &ctx->dep_map, _RET_IP_, ctx);
+}
+EXPORT_SYMBOL_GPL(__ww_mutex_lock);
+
+int __sched
+__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	might_sleep();
+	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
+				   0, &ctx->dep_map, _RET_IP_, ctx);
+}
+EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
+
 #endif
 
 /*
@@ -544,20 +772,39 @@ __mutex_lock_slowpath(atomic_t *lock_count)
 {
 	struct mutex *lock = container_of(lock_count, struct mutex, count);
 
-	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
+	__mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
+			    NULL, _RET_IP_, NULL);
 }
 
 static noinline int __sched
 __mutex_lock_killable_slowpath(struct mutex *lock)
 {
-	return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_KILLABLE, 0,
+				   NULL, _RET_IP_, NULL);
 }
 
 static noinline int __sched
 __mutex_lock_interruptible_slowpath(struct mutex *lock)
 {
-	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
+	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
+				   NULL, _RET_IP_, NULL);
+}
+
+static noinline int __sched
+__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
+				   NULL, _RET_IP_, ctx);
 }
+
+static noinline int __sched
+__ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
+					    struct ww_acquire_ctx *ctx)
+{
+	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
+				   NULL, _RET_IP_, ctx);
+}
+
 #endif
 
 /*
@@ -613,6 +860,45 @@ int __sched mutex_trylock(struct mutex *lock)
 }
 EXPORT_SYMBOL(mutex_trylock);
 
+#ifndef CONFIG_DEBUG_LOCK_ALLOC
+int __sched
+__ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	int ret;
+
+	might_sleep();
+
+	ret = __mutex_fastpath_lock_retval(&lock->base.count);
+
+	if (likely(!ret)) {
+		ww_mutex_set_context_fastpath(lock, ctx);
+		mutex_set_owner(&lock->base);
+	} else
+		ret = __ww_mutex_lock_slowpath(lock, ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__ww_mutex_lock);
+
+int __sched
+__ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	int ret;
+
+	might_sleep();
+
+	ret = __mutex_fastpath_lock_retval(&lock->base.count);
+
+	if (likely(!ret)) {
+		ww_mutex_set_context_fastpath(lock, ctx);
+		mutex_set_owner(&lock->base);
+	} else
+		ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__ww_mutex_lock_interruptible);
+
+#endif
+
 /**
  * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
  * @cnt: the atomic which we are to dec
diff --git a/lib/debug_locks.c b/lib/debug_locks.c
index f2fa60c59343..96c4c633d95e 100644
--- a/lib/debug_locks.c
+++ b/lib/debug_locks.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(debug_locks);
  * a locking bug is detected.
  */
 int debug_locks_silent;
+EXPORT_SYMBOL_GPL(debug_locks_silent);
 
 /*
  * Generic 'turn off all lock debugging' function:
@@ -44,3 +45,4 @@ int debug_locks_off(void)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(debug_locks_off);
-- 
cgit v1.3-7-g2ca7


From 230100276955529d5a7c69207421756b9a61a8e5 Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Thu, 20 Jun 2013 13:31:17 +0200
Subject: mutex: Add w/w mutex slowpath debugging

Injects EDEADLK conditions at pseudo-random interval, with
exponential backoff up to UINT_MAX (to ensure that every lock
operation still completes in a reasonable time).

This way we can test the wound slowpath even for ww mutex users
where contention is never expected, and the ww deadlock
avoidance algorithm is only needed for correctness against
malicious userspace. An example would be protecting kernel
modesetting properties, which thanks to single-threaded X isn't
really expected to contend, ever.

I've looked into using the CONFIG_FAULT_INJECTION
infrastructure, but decided against it for two reasons:

- EDEADLK handling is mandatory for ww mutex users and should
  never affect the outcome of a syscall. This is in contrast to -ENOMEM
  injection. So fine configurability isn't required.

- The fault injection framework only allows to set a simple
  probability for failure. Now the probability that a ww mutex acquire
  stage with N locks will never complete (due to too many injected
  EDEADLK backoffs) is zero. But the expected number of ww_mutex_lock
  operations for the completely uncontended case would be O(exp(N)).
  The per-acuiqire ctx exponential backoff solution choosen here only
  results in O(log N) overhead due to injection and so O(log N * N)
  lock operations. This way we can fail with high probability (and so
  have good test coverage even for fancy backoff and lock acquisition
  paths) without running into patalogical cases.

Note that EDEADLK will only ever be injected when we managed to
acquire the lock. This prevents any behaviour changes for users
which rely on the EALREADY semantics.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: dri-devel@lists.freedesktop.org
Cc: linaro-mm-sig@lists.linaro.org
Cc: rostedt@goodmis.org
Cc: daniel@ffwll.ch
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20130620113117.4001.21681.stgit@patser
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h |  8 ++++++++
 kernel/mutex.c        | 44 +++++++++++++++++++++++++++++++++++++++++---
 lib/Kconfig.debug     | 13 +++++++++++++
 3 files changed, 62 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index a56b0ccc8a6c..3793ed7feeeb 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -98,6 +98,10 @@ struct ww_acquire_ctx {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
 #endif
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+	unsigned deadlock_inject_interval;
+	unsigned deadlock_inject_countdown;
+#endif
 };
 
 struct ww_mutex {
@@ -283,6 +287,10 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
 			 &ww_class->acquire_key, 0);
 	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
 #endif
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+	ctx->deadlock_inject_interval = 1;
+	ctx->deadlock_inject_countdown = ctx->stamp & 0xf;
+#endif
 }
 
 /**
diff --git a/kernel/mutex.c b/kernel/mutex.c
index fc801aafe8fd..e581ada5faf4 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -651,22 +651,60 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
 
 EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
 
+static inline int
+ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+	unsigned tmp;
+
+	if (ctx->deadlock_inject_countdown-- == 0) {
+		tmp = ctx->deadlock_inject_interval;
+		if (tmp > UINT_MAX/4)
+			tmp = UINT_MAX;
+		else
+			tmp = tmp*2 + tmp + tmp/2;
+
+		ctx->deadlock_inject_interval = tmp;
+		ctx->deadlock_inject_countdown = tmp;
+		ctx->contending_lock = lock;
+
+		ww_mutex_unlock(lock);
+
+		return -EDEADLK;
+	}
+#endif
+
+	return 0;
+}
 
 int __sched
 __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
+	int ret;
+
 	might_sleep();
-	return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
+	ret =  __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
 				   0, &ctx->dep_map, _RET_IP_, ctx);
+	if (!ret && ctx->acquired > 0)
+		return ww_mutex_deadlock_injection(lock, ctx);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__ww_mutex_lock);
 
 int __sched
 __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
 {
+	int ret;
+
 	might_sleep();
-	return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
-				   0, &ctx->dep_map, _RET_IP_, ctx);
+	ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
+				  0, &ctx->dep_map, _RET_IP_, ctx);
+
+	if (!ret && ctx->acquired > 0)
+		return ww_mutex_deadlock_injection(lock, ctx);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 566cf2bc08ea..7154f799541a 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -547,6 +547,19 @@ config DEBUG_MUTEXES
 	 This feature allows mutex semantics violations to be detected and
 	 reported.
 
+config DEBUG_WW_MUTEX_SLOWPATH
+	bool "Wait/wound mutex debugging: Slowpath testing"
+	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
+	select DEBUG_LOCK_ALLOC
+	select DEBUG_SPINLOCK
+	select DEBUG_MUTEXES
+	help
+	 This feature enables slowpath testing for w/w mutex users by
+	 injecting additional -EDEADLK wound/backoff cases. Together with
+	 the full mutex checks enabled with (CONFIG_PROVE_LOCKING) this
+	 will test all possible w/w mutex interface abuse with the
+	 exception of simply not acquiring all the required locks.
+
 config DEBUG_LOCK_ALLOC
 	bool "Lock debugging: detect incorrect freeing of live locks"
 	depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT
-- 
cgit v1.3-7-g2ca7


From 1672d040709b789671c0502e7aac9d632c2f9175 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Jun 2013 18:04:54 -0700
Subject: cgroup: fix cgroupfs_root early destruction path

cgroupfs_root used to have ->actual_subsys_mask in addition to
->subsys_mask.  a8a648c4ac ("cgroup: remove
cgroup->actual_subsys_mask") removed it noting that the subsys_mask is
essentially temporary and doesn't belong in cgroupfs_root; however,
the patch made it impossible to tell whether a cgroupfs_root actually
has the subsystems bound or just have the bits set leading to the
following BUG when trying to mount with subsystems which are already
mounted elsewhere.

 kernel BUG at kernel/cgroup.c:1038!
 invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
 ...
 CPU: 1 PID: 7973 Comm: mount Tainted: G        W    3.10.0-rc7-next-20130625-sasha-00011-g1c1dc0e #1105
 task: ffff880fc0ae8000 ti: ffff880fc0b9a000 task.ti: ffff880fc0b9a000
 RIP: 0010:[<ffffffff81249b29>]  [<ffffffff81249b29>] rebind_subsystems+0x409/0x5f0
 ...
 Call Trace:
  [<ffffffff8124bd4f>] cgroup_kill_sb+0xff/0x210
  [<ffffffff813d21af>] deactivate_locked_super+0x4f/0x90
  [<ffffffff8124f3b3>] cgroup_mount+0x673/0x6e0
  [<ffffffff81257169>] cpuset_mount+0xd9/0x110
  [<ffffffff813d2580>] mount_fs+0xb0/0x2d0
  [<ffffffff81404afd>] vfs_kern_mount+0xbd/0x180
  [<ffffffff814070b5>] do_new_mount+0x145/0x2c0
  [<ffffffff814085d6>] do_mount+0x356/0x3c0
  [<ffffffff8140873d>] SyS_mount+0xfd/0x140
  [<ffffffff854eb600>] tracesys+0xdd/0xe2

We still want rebind_subsystems() to take added/removed masks, so
let's fix it by marking whether a cgroupfs_root has finished binding
or not.  Also, document what's going on around ->subsys_mask
initialization so that similar mistakes aren't repeated.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c        | 22 +++++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4c1eceb8c439..8e4fd5e67384 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -276,6 +276,7 @@ enum {
 
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
+	CGRP_ROOT_SUBSYS_BOUND	= (1 << 3), /* subsystems finished binding */
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f9c99abc38ab..e801ecfa36ef 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1086,6 +1086,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 		}
 	}
 
+	/*
+	 * Mark @root has finished binding subsystems.  @root->subsys_mask
+	 * now matches the bound subsystems.
+	 */
+	root->flags |= CGRP_ROOT_SUBSYS_BOUND;
+
 	return 0;
 }
 
@@ -1485,6 +1491,14 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
 
 	init_cgroup_root(root);
 
+	/*
+	 * We need to set @root->subsys_mask now so that @root can be
+	 * matched by cgroup_test_super() before it finishes
+	 * initialization; otherwise, competing mounts with the same
+	 * options may try to bind the same subsystems instead of waiting
+	 * for the first one leading to unexpected mount errors.
+	 * SUBSYS_BOUND will be set once actual binding is complete.
+	 */
 	root->subsys_mask = opts->subsys_mask;
 	root->flags = opts->flags;
 	ida_init(&root->cgroup_ida);
@@ -1734,9 +1748,11 @@ static void cgroup_kill_sb(struct super_block *sb) {
 	mutex_lock(&cgroup_root_mutex);
 
 	/* Rebind all subsystems back to the default hierarchy */
-	ret = rebind_subsystems(root, 0, root->subsys_mask);
-	/* Shouldn't be able to fail ... */
-	BUG_ON(ret);
+	if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
+		ret = rebind_subsystems(root, 0, root->subsys_mask);
+		/* Shouldn't be able to fail ... */
+		BUG_ON(ret);
+	}
 
 	/*
 	 * Release all the links from cset_links to this hierarchy's
-- 
cgit v1.3-7-g2ca7


From 14611e51a57df10240817d8ada510842faf0ec51 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 25 Jun 2013 11:48:32 -0700
Subject: cgroup: fix RCU accesses to task->cgroups

task->cgroups is a RCU pointer pointing to struct css_set.  A task
switches to a different css_set on cgroup migration but a css_set
doesn't change once created and its pointers to cgroup_subsys_states
aren't RCU protected.

task_subsys_state[_check]() is the macro to acquire css given a task
and subsys_id pair.  It RCU-dereferences task->cgroups->subsys[] not
task->cgroups, so the RCU pointer task->cgroups ends up being
dereferenced without read_barrier_depends() after it.  It's broken.

Fix it by introducing task_css_set[_check]() which does
RCU-dereference on task->cgroups.  task_subsys_state[_check]() is
reimplemented to directly dereference ->subsys[] of the css_set
returned from task_css_set[_check]().

This removes some of sparse RCU warnings in cgroup.

v2: Fixed unbalanced parenthsis and there's no need to use
    rcu_dereference_raw() when !CONFIG_PROVE_RCU.  Both spotted by Li.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: stable@vger.kernel.org
---
 include/linux/cgroup.h | 58 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 48 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8e4fd5e67384..ad3555bc21f4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -635,22 +635,60 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state(
 	return cgrp->subsys[subsys_id];
 }
 
-/*
- * function to get the cgroup_subsys_state which allows for extra
- * rcu_dereference_check() conditions, such as locks used during the
- * cgroup_subsys::attach() methods.
+/**
+ * task_css_set_check - obtain a task's css_set with extra access conditions
+ * @task: the task to obtain css_set for
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * A task's css_set is RCU protected, initialized and exited while holding
+ * task_lock(), and can only be modified while holding both cgroup_mutex
+ * and task_lock() while the task is alive.  This macro verifies that the
+ * caller is inside proper critical section and returns @task's css_set.
+ *
+ * The caller can also specify additional allowed conditions via @__c, such
+ * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
 extern struct mutex cgroup_mutex;
-#define task_subsys_state_check(task, subsys_id, __c)			\
-	rcu_dereference_check((task)->cgroups->subsys[(subsys_id)],	\
-			      lockdep_is_held(&(task)->alloc_lock) ||	\
-			      lockdep_is_held(&cgroup_mutex) || (__c))
+#define task_css_set_check(task, __c)					\
+	rcu_dereference_check((task)->cgroups,				\
+		lockdep_is_held(&(task)->alloc_lock) ||			\
+		lockdep_is_held(&cgroup_mutex) || (__c))
 #else
-#define task_subsys_state_check(task, subsys_id, __c)			\
-	rcu_dereference((task)->cgroups->subsys[(subsys_id)])
+#define task_css_set_check(task, __c)					\
+	rcu_dereference((task)->cgroups)
 #endif
 
+/**
+ * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ * @__c: extra condition expression to be passed to rcu_dereference_check()
+ *
+ * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
+ * synchronization rules are the same as task_css_set_check().
+ */
+#define task_subsys_state_check(task, subsys_id, __c)			\
+	task_css_set_check((task), (__c))->subsys[(subsys_id)]
+
+/**
+ * task_css_set - obtain a task's css_set
+ * @task: the task to obtain css_set for
+ *
+ * See task_css_set_check().
+ */
+static inline struct css_set *task_css_set(struct task_struct *task)
+{
+	return task_css_set_check(task, false);
+}
+
+/**
+ * task_subsys_state - obtain css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * See task_subsys_state_check().
+ */
 static inline struct cgroup_subsys_state *
 task_subsys_state(struct task_struct *task, int subsys_id)
 {
-- 
cgit v1.3-7-g2ca7


From 44b9ca47534d478bf9954d380515cb282fcb48a2 Mon Sep 17 00:00:00 2001
From: Sebastian Ott <sebott@linux.vnet.ibm.com>
Date: Wed, 5 Jun 2013 16:05:32 +0200
Subject: pci: add pcibios_release_device

Platforms may want to provide architecture-specific functionality when
a pci device is released. Add a pcibios_release_device() call that
architectures can override to do so.

Signed-off-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/pci/pci.c   | 10 ++++++++++
 drivers/pci/probe.c |  1 +
 include/linux/pci.h |  1 +
 3 files changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index a899d8bb190d..95057a925d80 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -1334,6 +1334,16 @@ int __weak pcibios_add_device (struct pci_dev *dev)
 	return 0;
 }
 
+/**
+ * pcibios_release_device - provide arch specific hooks when releasing device dev
+ * @dev: the PCI device being released
+ *
+ * Permits the platform to provide architecture specific functionality when
+ * devices are released. This is the default implementation. Architecture
+ * implementations can override this.
+ */
+void __weak pcibios_release_device(struct pci_dev *dev) {}
+
 /**
  * pcibios_disable_device - disable arch specific PCI resources for device dev
  * @dev: the PCI device to disable
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 70f10fa3c1b2..58cc0a8a0979 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1132,6 +1132,7 @@ static void pci_release_dev(struct device *dev)
 	pci_dev = to_pci_dev(dev);
 	pci_release_capabilities(pci_dev);
 	pci_release_of_node(pci_dev);
+	pcibios_release_device(pci_dev);
 	kfree(pci_dev);
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 3a24e4ff3248..8f170e9073a5 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1643,6 +1643,7 @@ void pcibios_set_master(struct pci_dev *dev);
 int pcibios_set_pcie_reset_state(struct pci_dev *dev,
 				 enum pcie_reset_state state);
 int pcibios_add_device(struct pci_dev *dev);
+void pcibios_release_device(struct pci_dev *dev);
 
 #ifdef CONFIG_PCI_MMCONFIG
 void __init pci_mmcfg_early_init(void);
-- 
cgit v1.3-7-g2ca7


From 91bbe923d18cfff4286a84e59b9d5b61389c3027 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhart@linux.intel.com>
Date: Tue, 25 Jun 2013 20:08:46 -0600
Subject: PCI: Add CircuitCo vendor ID and subsystem ID

Add CircuitCo's newly created VENDOR ID and their first board subsystem
ID for the MinnowBoard.

[bhelgaas: sort, change DEVICE_ID to SUBSYSTEM_ID]
Signed-off-by: Darren Hart <dvhart@linux.intel.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
---
 include/linux/pci_ids.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c12916248469..4c7b3492452d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2476,6 +2476,9 @@
 
 #define PCI_VENDOR_ID_ASMEDIA		0x1b21
 
+#define PCI_VENDOR_ID_CIRCUITCO		0x1cc8
+#define PCI_SUBSYSTEM_ID_CIRCUITCO_MINNOWBOARD	0x0001
+
 #define PCI_VENDOR_ID_TEKRAM		0x1de1
 #define PCI_DEVICE_ID_TEKRAM_DC290	0xdc29
 
-- 
cgit v1.3-7-g2ca7


From 141965c7494d984b2bf24efd361a3125278869c6 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Wed, 26 Jun 2013 13:05:39 +0800
Subject: Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for
 load-tracking"

Remove CONFIG_FAIR_GROUP_SCHED that covers the runnable info, then
we can use runnable load variables.

Also remove 2 CONFIG_FAIR_GROUP_SCHED setting which is not in reverted
patch(introduced in 9ee474f), but also need to revert.

Signed-off-by: Alex Shi <alex.shi@intel.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/51CA76A3.3050207@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  7 +------
 kernel/sched/core.c   |  7 +------
 kernel/sched/fair.c   | 17 ++++-------------
 kernel/sched/sched.h  | 19 ++-----------------
 4 files changed, 8 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 178a8d909f14..0019befea59a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -994,12 +994,7 @@ struct sched_entity {
 	struct cfs_rq		*my_q;
 #endif
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 	/* Per-entity load-tracking */
 	struct sched_avg	avg;
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ceeaf0f45be0..0241b1b55a04 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1611,12 +1611,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 	p->se.avg.runnable_avg_period = 0;
 	p->se.avg.runnable_avg_sum = 0;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c0ac2c3b56e1..36eadaaa4e5b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1128,8 +1128,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -3430,12 +3429,6 @@ unlock:
 	return new_cpu;
 }
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
@@ -3459,7 +3452,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 	}
 }
-#endif
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -5861,7 +5853,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 		se->vruntime -= cfs_rq->min_vruntime;
 	}
 
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	/*
 	* Remove our load from contribution when we leave sched_fair
 	* and ensure we don't carry in an old decay_count if we
@@ -5920,7 +5912,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
-#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	atomic64_set(&cfs_rq->decay_counter, 1);
 	atomic64_set(&cfs_rq->removed_load, 0);
 #endif
@@ -6162,9 +6154,8 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	.migrate_task_rq	= migrate_task_rq_fair,
-#endif
+
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 029601a61587..77ce668ba302 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -269,12 +269,6 @@ struct cfs_rq {
 #endif
 
 #ifdef CONFIG_SMP
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * CFS Load tracking
 	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -284,9 +278,9 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	/* Required to track per-cpu representation of a task_group */
 	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -1027,17 +1021,8 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
 extern void trigger_load_balance(struct rq *rq, int cpu);
 extern void idle_balance(int this_cpu, struct rq *this_rq);
 
-/*
- * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
- * becomes useful in lb
- */
-#if defined(CONFIG_FAIR_GROUP_SCHED)
 extern void idle_enter_fair(struct rq *this_rq);
 extern void idle_exit_fair(struct rq *this_rq);
-#else
-static inline void idle_enter_fair(struct rq *this_rq) {}
-static inline void idle_exit_fair(struct rq *this_rq) {}
-#endif
 
 #else	/* CONFIG_SMP */
 
-- 
cgit v1.3-7-g2ca7


From 239003ea2e30374d1cdfee788867e497cca2366c Mon Sep 17 00:00:00 2001
From: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Date: Thu, 27 Jun 2013 11:34:09 +0530
Subject: sched: Fix typo in struct sched_avg member description

Remove extra 'for' from the description about member of
struct sched_avg.

Signed-off-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Cc: pjt@google.com
Cc: peterz@infradead.org
Link: http://lkml.kernel.org/r/20130627060409.GB18582@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0019befea59a..ec80684a0127 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -924,7 +924,7 @@ struct load_weight {
 struct sched_avg {
 	/*
 	 * These sums represent an infinite geometric series and so are bound
-	 * above by 1024/(1-y).  Thus we only need a u32 to store them for for all
+	 * above by 1024/(1-y).  Thus we only need a u32 to store them for all
 	 * choices of y < 1-2^(-32)*1024.
 	 */
 	u32 runnable_avg_sum, runnable_avg_period;
-- 
cgit v1.3-7-g2ca7


From 594fbe713bf60073ed884dc317a74dd5b327aba7 Mon Sep 17 00:00:00 2001
From: Arnaud Ebalard <arno@natisbad.org>
Date: Thu, 20 Jun 2013 22:21:04 +0200
Subject: Add support for GMT G762/G763 PWM fan controllers

GMT G762/763 fan speed PWM controller is connected directly to a fan
and performs closed-loop or open-loop control of the fan speed. Two
modes - PWM or DC - are supported by the chip. Introduced driver
provides various knobs to control the operations of the chip (via
sysfs interface). Specific characteristics of the system can be passed
either using board init code or via DT. Documentation for both the
driver and DT bindings are also provided.

Signed-off-by: Arnaud Ebalard <arno@natisbad.org>
Tested-by: Simon Guinot <simon.guinot@sequanux.org>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 Documentation/devicetree/bindings/hwmon/g762.txt |   47 +
 Documentation/hwmon/g762                         |   65 ++
 drivers/hwmon/Kconfig                            |   10 +
 drivers/hwmon/Makefile                           |    1 +
 drivers/hwmon/g762.c                             | 1149 ++++++++++++++++++++++
 include/linux/platform_data/g762.h               |   37 +
 6 files changed, 1309 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/hwmon/g762.txt
 create mode 100644 Documentation/hwmon/g762
 create mode 100644 drivers/hwmon/g762.c
 create mode 100644 include/linux/platform_data/g762.h

(limited to 'include/linux')

diff --git a/Documentation/devicetree/bindings/hwmon/g762.txt b/Documentation/devicetree/bindings/hwmon/g762.txt
new file mode 100644
index 000000000000..25cc6d8ee575
--- /dev/null
+++ b/Documentation/devicetree/bindings/hwmon/g762.txt
@@ -0,0 +1,47 @@
+GMT G762/G763 PWM Fan controller
+
+Required node properties:
+
+ - "compatible": must be either "gmt,g762" or "gmt,g763"
+ - "reg": I2C bus address of the device
+ - "clocks": a fixed clock providing input clock frequency
+	     on CLK pin of the chip.
+
+Optional properties:
+
+ - "fan_startv": fan startup voltage. Accepted values are 0, 1, 2 and 3.
+	       The higher the more.
+
+ - "pwm_polarity": pwm polarity. Accepted values are 0 (positive duty)
+	       and 1 (negative duty).
+
+ - "fan_gear_mode": fan gear mode. Supported values are 0, 1 and 2.
+
+If an optional property is not set in .dts file, then current value is kept
+unmodified (e.g. u-boot installed value).
+
+Additional information on operational parameters for the device is available
+in Documentation/hwmon/g762. A detailed datasheet for the device is available
+at http://natisbad.org/NAS/refs/GMT_EDS-762_763-080710-0.2.pdf.
+
+Example g762 node:
+
+   clocks {
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	g762_clk: fixedclk {
+		 compatible = "fixed-clock";
+		 #clock-cells = <0>;
+		 clock-frequency = <8192>;
+	}
+   }
+
+   g762: g762@3e {
+	compatible = "gmt,g762";
+	reg = <0x3e>;
+	clocks = <&g762_clk>
+	fan_gear_mode = <0>; /* chip default */
+	fan_startv = <1>;    /* chip default */
+	pwm_polarity = <0>;  /* chip default */
+   };
diff --git a/Documentation/hwmon/g762 b/Documentation/hwmon/g762
new file mode 100644
index 000000000000..923db9c5b5bc
--- /dev/null
+++ b/Documentation/hwmon/g762
@@ -0,0 +1,65 @@
+Kernel driver g762
+==================
+
+The GMT G762 Fan Speed PWM Controller is connected directly to a fan
+and performs closed-loop or open-loop control of the fan speed. Two
+modes - PWM or DC - are supported by the device.
+
+For additional information, a detailed datasheet is available at
+http://natisbad.org/NAS/ref/GMT_EDS-762_763-080710-0.2.pdf. sysfs
+bindings are described in Documentation/hwmon/sysfs-interface.
+
+The following entries are available to the user in a subdirectory of
+/sys/bus/i2c/drivers/g762/ to control the operation of the device.
+This can be done manually using the following entries but is usually
+done via a userland daemon like fancontrol.
+
+Note that those entries do not provide ways to setup the specific
+hardware characteristics of the system (reference clock, pulses per
+fan revolution, ...); Those can be modified via devicetree bindings
+documented in Documentation/devicetree/bindings/hwmon/g762.txt or
+using a specific platform_data structure in board initialization
+file (see include/linux/platform_data/g762.h).
+
+  fan1_target: set desired fan speed. This only makes sense in closed-loop
+            fan speed control (i.e. when pwm1_enable is set to 2).
+
+  fan1_input: provide current fan rotation value in RPM as reported by
+            the fan to the device.
+
+  fan1_div: fan clock divisor. Supported value are 1, 2, 4 and 8.
+
+  fan1_pulses: number of pulses per fan revolution. Supported values
+            are 2 and 4.
+
+  fan1_fault: reports fan failure, i.e. no transition on fan gear pin for
+            about 0.7s (if the fan is not voluntarily set off).
+
+  fan1_alarm: in closed-loop control mode, if fan RPM value is 25% out
+            of the programmed value for over 6 seconds 'fan1_alarm' is
+            set to 1.
+
+  pwm1_enable: set current fan speed control mode i.e. 1 for manual fan
+            speed control (open-loop) via pwm1 described below, 2 for
+            automatic fan speed control (closed-loop) via fan1_target
+            above.
+
+  pwm1_mode: set or get fan driving mode: 1 for PWM mode, 0 for DC mode.
+
+  pwm1: get or set PWM fan control value in open-loop mode. This is an
+            integer value between 0 and 255. 0 stops the fan, 255 makes
+            it run at full speed.
+
+Both in PWM mode ('pwm1_mode' set to 1) and DC mode ('pwm1_mode' set to 0),
+when current fan speed control mode is open-loop ('pwm1_enable' set to 1),
+the fan speed is programmed by setting a value between 0 and 255 via 'pwm1'
+entry (0 stops the fan, 255 makes it run at full speed). In closed-loop mode
+('pwm1_enable' set to 2), the expected rotation speed in RPM can be passed to
+the chip via 'fan1_target'. In closed-loop mode, the target speed is compared
+with current speed (available via 'fan1_input') by the device and a feedback
+is performed to match that target value. The fan speed value is computed
+based on the parameters associated with the physical characteristics of the
+system: a reference clock source frequency, a number of pulses per fan
+revolution, etc.
+
+Note that the driver will update its values at most once per second.
diff --git a/drivers/hwmon/Kconfig b/drivers/hwmon/Kconfig
index 683c769a8fca..e989f7fd645b 100644
--- a/drivers/hwmon/Kconfig
+++ b/drivers/hwmon/Kconfig
@@ -461,6 +461,16 @@ config SENSORS_G760A
 	  This driver can also be built as a module.  If so, the module
 	  will be called g760a.
 
+config SENSORS_G762
+	tristate "GMT G762 and G763"
+	depends on I2C
+	help
+	  If you say yes here you get support for Global Mixed-mode
+	  Technology Inc G762 and G763 fan speed PWM controller chips.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called g762.
+
 config SENSORS_GL518SM
 	tristate "Genesys Logic GL518SM"
 	depends on I2C
diff --git a/drivers/hwmon/Makefile b/drivers/hwmon/Makefile
index d17d3e64f9f4..4f0fb5235f42 100644
--- a/drivers/hwmon/Makefile
+++ b/drivers/hwmon/Makefile
@@ -60,6 +60,7 @@ obj-$(CONFIG_SENSORS_F75375S)	+= f75375s.o
 obj-$(CONFIG_SENSORS_FAM15H_POWER) += fam15h_power.o
 obj-$(CONFIG_SENSORS_FSCHMD)	+= fschmd.o
 obj-$(CONFIG_SENSORS_G760A)	+= g760a.o
+obj-$(CONFIG_SENSORS_G762)	+= g762.o
 obj-$(CONFIG_SENSORS_GL518SM)	+= gl518sm.o
 obj-$(CONFIG_SENSORS_GL520SM)	+= gl520sm.o
 obj-$(CONFIG_SENSORS_GPIO_FAN)	+= gpio-fan.o
diff --git a/drivers/hwmon/g762.c b/drivers/hwmon/g762.c
new file mode 100644
index 000000000000..73adf01b0ef2
--- /dev/null
+++ b/drivers/hwmon/g762.c
@@ -0,0 +1,1149 @@
+/*
+ * g762 - Driver for the Global Mixed-mode Technology Inc. fan speed
+ *        PWM controller chips from G762 family, i.e. G762 and G763
+ *
+ * Copyright (C) 2013, Arnaud EBALARD <arno@natisbad.org>
+ *
+ * This work is based on a basic version for 2.6.31 kernel developed
+ * by Olivier Mouchet for LaCie. Updates and correction have been
+ * performed to run on recent kernels. Additional features, like the
+ * ability to configure various characteristics via .dts file or
+ * board init file have been added. Detailed datasheet on which this
+ * development is based is available here:
+ *
+ *  http://natisbad.org/NAS/refs/GMT_EDS-762_763-080710-0.2.pdf
+ *
+ * Headers from previous developments have been kept below:
+ *
+ * Copyright (c) 2009 LaCie
+ *
+ * Author: Olivier Mouchet <olivier.mouchet@gmail.com>
+ *
+ * based on g760a code written by Herbert Valerio Riedel <hvr@gnu.org>
+ * Copyright (C) 2007  Herbert Valerio Riedel <hvr@gnu.org>
+ *
+ * g762: minimal datasheet available at:
+ *       http://www.gmt.com.tw/product/datasheet/EDS-762_3.pdf
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/i2c.h>
+#include <linux/hwmon.h>
+#include <linux/hwmon-sysfs.h>
+#include <linux/err.h>
+#include <linux/mutex.h>
+#include <linux/kernel.h>
+#include <linux/clk.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_data/g762.h>
+
+#define DRVNAME "g762"
+
+static const struct i2c_device_id g762_id[] = {
+	{ "g762", 0 },
+	{ "g763", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(i2c, g762_id);
+
+enum g762_regs {
+	G762_REG_SET_CNT  = 0x00,
+	G762_REG_ACT_CNT  = 0x01,
+	G762_REG_FAN_STA  = 0x02,
+	G762_REG_SET_OUT  = 0x03,
+	G762_REG_FAN_CMD1 = 0x04,
+	G762_REG_FAN_CMD2 = 0x05,
+};
+
+/* Config register bits */
+#define G762_REG_FAN_CMD1_DET_FAN_FAIL  0x80 /* enable fan_fail signal */
+#define G762_REG_FAN_CMD1_DET_FAN_OOC   0x40 /* enable fan_out_of_control */
+#define G762_REG_FAN_CMD1_OUT_MODE      0x20 /* out mode: PWM or DC */
+#define G762_REG_FAN_CMD1_FAN_MODE      0x10 /* fan mode: closed/open-loop */
+#define G762_REG_FAN_CMD1_CLK_DIV_ID1   0x08 /* clock divisor value */
+#define G762_REG_FAN_CMD1_CLK_DIV_ID0   0x04
+#define G762_REG_FAN_CMD1_PWM_POLARITY  0x02 /* PWM polarity */
+#define G762_REG_FAN_CMD1_PULSE_PER_REV 0x01 /* pulse per fan revolution */
+
+#define G762_REG_FAN_CMD2_GEAR_MODE_1   0x08 /* fan gear mode */
+#define G762_REG_FAN_CMD2_GEAR_MODE_0   0x04
+#define G762_REG_FAN_CMD2_FAN_STARTV_1  0x02 /* fan startup voltage */
+#define G762_REG_FAN_CMD2_FAN_STARTV_0  0x01
+
+#define G762_REG_FAN_STA_FAIL           0x02 /* fan fail */
+#define G762_REG_FAN_STA_OOC            0x01 /* fan out of control */
+
+/* Config register values */
+#define G762_OUT_MODE_PWM            1
+#define G762_OUT_MODE_DC             0
+
+#define G762_FAN_MODE_CLOSED_LOOP    2
+#define G762_FAN_MODE_OPEN_LOOP      1
+
+#define G762_PWM_POLARITY_NEGATIVE   1
+#define G762_PWM_POLARITY_POSITIVE   0
+
+/* Register data is read (and cached) at most once per second. */
+#define G762_UPDATE_INTERVAL    HZ
+
+/*
+ * Extract pulse count per fan revolution value (2 or 4) from given
+ * FAN_CMD1 register value.
+ */
+#define G762_PULSE_FROM_REG(reg) \
+	((((reg) & G762_REG_FAN_CMD1_PULSE_PER_REV) + 1) << 1)
+
+/*
+ * Extract fan clock divisor (1, 2, 4 or 8) from given FAN_CMD1
+ * register value.
+ */
+#define G762_CLKDIV_FROM_REG(reg) \
+	(1 << (((reg) & (G762_REG_FAN_CMD1_CLK_DIV_ID0 |	\
+			 G762_REG_FAN_CMD1_CLK_DIV_ID1)) >> 2))
+
+/*
+ * Extract fan gear mode multiplier value (0, 2 or 4) from given
+ * FAN_CMD2 register value.
+ */
+#define G762_GEARMULT_FROM_REG(reg) \
+	(1 << (((reg) & (G762_REG_FAN_CMD2_GEAR_MODE_0 |	\
+			 G762_REG_FAN_CMD2_GEAR_MODE_1)) >> 2))
+
+struct g762_data {
+	struct i2c_client *client;
+	struct device *hwmon_dev;
+	struct clk *clk;
+
+	/* update mutex */
+	struct mutex update_lock;
+
+	/* board specific parameters. */
+	u32 clk_freq;
+
+	/* g762 register cache */
+	bool valid;
+	unsigned long last_updated; /* in jiffies */
+
+	u8 set_cnt;  /* controls fan rotation speed in closed-loop mode */
+	u8 act_cnt;  /* provides access to current fan RPM value */
+	u8 fan_sta;  /* bit 0: set when actual fan speed is more than
+		      *        25% outside requested fan speed
+		      * bit 1: set when no transition occurs on fan
+		      *        pin for 0.7s
+		      */
+	u8 set_out;  /* controls fan rotation speed in open-loop mode */
+	u8 fan_cmd1; /*   0: FG_PLS_ID0 FG pulses count per revolution
+		      *      0: 2 counts per revolution
+		      *      1: 4 counts per revolution
+		      *   1: PWM_POLARITY 1: negative_duty
+		      *                   0: positive_duty
+		      * 2,3: [FG_CLOCK_ID0, FG_CLK_ID1]
+		      *         00: Divide fan clock by 1
+		      *         01: Divide fan clock by 2
+		      *         10: Divide fan clock by 4
+		      *         11: Divide fan clock by 8
+		      *   4: FAN_MODE 1:closed-loop, 0:open-loop
+		      *   5: OUT_MODE 1:PWM, 0:DC
+		      *   6: DET_FAN_OOC enable "fan ooc" status
+		      *   7: DET_FAN_FAIL enable "fan fail" status
+		      */
+	u8 fan_cmd2; /* 0,1: FAN_STARTV 0,1,2,3 -> 0,32,64,96 dac_code
+		      * 2,3: FG_GEAR_MODE
+		      *         00: multiplier = 1
+		      *         01: multiplier = 2
+		      *         10: multiplier = 4
+		      *   4: Mask ALERT# (g763 only)
+		      */
+};
+
+/*
+ * Convert count value from fan controller register (FAN_SET_CNT) into fan
+ * speed RPM value. Note that the datasheet documents a basic formula;
+ * influence of additional parameters (fan clock divisor, fan gear mode)
+ * have been infered from examples in the datasheet and tests.
+ */
+static inline unsigned int rpm_from_cnt(u8 cnt, u32 clk_freq, u16 p,
+					u8 clk_div, u8 gear_mult)
+{
+	if (cnt == 0xff)  /* setting cnt to 255 stops the fan */
+		return 0;
+
+	return (clk_freq * 30 * gear_mult) / ((cnt ? cnt : 1) * p * clk_div);
+}
+
+/*
+ * Convert fan RPM value from sysfs into count value for fan controller
+ * register (FAN_SET_CNT).
+ */
+static inline unsigned char cnt_from_rpm(u32 rpm, u32 clk_freq, u16 p,
+					 u8 clk_div, u8 gear_mult)
+{
+	if (!rpm)         /* to stop the fan, set cnt to 255 */
+		return 0xff;
+
+	return clamp_val(((clk_freq * 30 * gear_mult) / (rpm * p * clk_div)),
+			 0, 255);
+}
+
+/* helper to grab and cache data, at most one time per second */
+static struct g762_data *g762_update_client(struct device *dev)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = i2c_get_clientdata(client);
+	int ret = 0;
+
+	mutex_lock(&data->update_lock);
+	if (time_before(jiffies, data->last_updated + G762_UPDATE_INTERVAL) &&
+	    likely(data->valid))
+		goto out;
+
+	ret = i2c_smbus_read_byte_data(client, G762_REG_SET_CNT);
+	if (ret < 0)
+		goto out;
+	data->set_cnt = ret;
+
+	ret = i2c_smbus_read_byte_data(client, G762_REG_ACT_CNT);
+	if (ret < 0)
+		goto out;
+	data->act_cnt = ret;
+
+	ret = i2c_smbus_read_byte_data(client, G762_REG_FAN_STA);
+	if (ret < 0)
+		goto out;
+	data->fan_sta = ret;
+
+	ret = i2c_smbus_read_byte_data(client, G762_REG_SET_OUT);
+	if (ret < 0)
+		goto out;
+	data->set_out = ret;
+
+	ret = i2c_smbus_read_byte_data(client, G762_REG_FAN_CMD1);
+	if (ret < 0)
+		goto out;
+	data->fan_cmd1 = ret;
+
+	ret = i2c_smbus_read_byte_data(client, G762_REG_FAN_CMD2);
+	if (ret < 0)
+		goto out;
+	data->fan_cmd2 = ret;
+
+	data->last_updated = jiffies;
+	data->valid = true;
+ out:
+	mutex_unlock(&data->update_lock);
+
+	if (ret < 0) /* upon error, encode it in return value */
+		data = ERR_PTR(ret);
+
+	return data;
+}
+
+/* helpers for writing hardware parameters */
+
+/*
+ * Set input clock frequency received on CLK pin of the chip. Accepted values
+ * are between 0 and 0xffffff. If zero is given, then default frequency
+ * (32,768Hz) is used. Note that clock frequency is a characteristic of the
+ * system but an internal parameter, i.e. value is not passed to the device.
+ */
+static int do_set_clk_freq(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = i2c_get_clientdata(client);
+
+	if (val > 0xffffff)
+		return -EINVAL;
+	if (!val)
+		val = 32768;
+
+	data->clk_freq = val;
+
+	return 0;
+}
+
+/* Set pwm mode. Accepts either 0 (PWM mode) or 1 (DC mode) */
+static int do_set_pwm_mode(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+	int ret;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	switch (val) {
+	case G762_OUT_MODE_PWM:
+		data->fan_cmd1 |=  G762_REG_FAN_CMD1_OUT_MODE;
+		break;
+	case G762_OUT_MODE_DC:
+		data->fan_cmd1 &= ~G762_REG_FAN_CMD1_OUT_MODE;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = i2c_smbus_write_byte_data(client, G762_REG_FAN_CMD1,
+					data->fan_cmd1);
+	data->valid = false;
+ out:
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/* Set fan clock divisor. Accepts either 1, 2, 4 or 8. */
+static int do_set_fan_div(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+	int ret;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	switch (val) {
+	case 1:
+		data->fan_cmd1 &= ~G762_REG_FAN_CMD1_CLK_DIV_ID0;
+		data->fan_cmd1 &= ~G762_REG_FAN_CMD1_CLK_DIV_ID1;
+		break;
+	case 2:
+		data->fan_cmd1 |=  G762_REG_FAN_CMD1_CLK_DIV_ID0;
+		data->fan_cmd1 &= ~G762_REG_FAN_CMD1_CLK_DIV_ID1;
+		break;
+	case 4:
+		data->fan_cmd1 &= ~G762_REG_FAN_CMD1_CLK_DIV_ID0;
+		data->fan_cmd1 |=  G762_REG_FAN_CMD1_CLK_DIV_ID1;
+		break;
+	case 8:
+		data->fan_cmd1 |=  G762_REG_FAN_CMD1_CLK_DIV_ID0;
+		data->fan_cmd1 |=  G762_REG_FAN_CMD1_CLK_DIV_ID1;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = i2c_smbus_write_byte_data(client, G762_REG_FAN_CMD1,
+					data->fan_cmd1);
+	data->valid = false;
+ out:
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/* Set fan gear mode. Accepts either 0, 1 or 2. */
+static int do_set_fan_gear_mode(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+	int ret;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	switch (val) {
+	case 0:
+		data->fan_cmd2 &= ~G762_REG_FAN_CMD2_GEAR_MODE_0;
+		data->fan_cmd2 &= ~G762_REG_FAN_CMD2_GEAR_MODE_1;
+		break;
+	case 1:
+		data->fan_cmd2 |=  G762_REG_FAN_CMD2_GEAR_MODE_0;
+		data->fan_cmd2 &= ~G762_REG_FAN_CMD2_GEAR_MODE_1;
+		break;
+	case 2:
+		data->fan_cmd2 &= ~G762_REG_FAN_CMD2_GEAR_MODE_0;
+		data->fan_cmd2 |=  G762_REG_FAN_CMD2_GEAR_MODE_1;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = i2c_smbus_write_byte_data(client, G762_REG_FAN_CMD2,
+					data->fan_cmd2);
+	data->valid = false;
+ out:
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/* Set number of fan pulses per revolution. Accepts either 2 or 4. */
+static int do_set_fan_pulses(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+	int ret;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	switch (val) {
+	case 2:
+		data->fan_cmd1 &= ~G762_REG_FAN_CMD1_PULSE_PER_REV;
+		break;
+	case 4:
+		data->fan_cmd1 |=  G762_REG_FAN_CMD1_PULSE_PER_REV;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = i2c_smbus_write_byte_data(client, G762_REG_FAN_CMD1,
+					data->fan_cmd1);
+	data->valid = false;
+ out:
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/* Set fan mode. Accepts either 1 (open-loop) or 2 (closed-loop). */
+static int do_set_pwm_enable(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+	int ret;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	switch (val) {
+	case G762_FAN_MODE_CLOSED_LOOP:
+		data->fan_cmd1 |=  G762_REG_FAN_CMD1_FAN_MODE;
+		break;
+	case G762_FAN_MODE_OPEN_LOOP:
+		data->fan_cmd1 &= ~G762_REG_FAN_CMD1_FAN_MODE;
+		/*
+		 * BUG FIX: if SET_CNT register value is 255 then, for some
+		 * unknown reason, fan will not rotate as expected, no matter
+		 * the value of SET_OUT (to be specific, this seems to happen
+		 * only in PWM mode). To workaround this bug, we give SET_CNT
+		 * value of 254 if it is 255 when switching to open-loop.
+		 */
+		if (data->set_cnt == 0xff)
+			i2c_smbus_write_byte_data(client, G762_REG_SET_CNT,
+						  254);
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = i2c_smbus_write_byte_data(client, G762_REG_FAN_CMD1,
+					data->fan_cmd1);
+	data->valid = false;
+ out:
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/* Set PWM polarity. Accepts either 0 (positive duty) or 1 (negative duty) */
+static int do_set_pwm_polarity(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+	int ret;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	switch (val) {
+	case G762_PWM_POLARITY_POSITIVE:
+		data->fan_cmd1 &= ~G762_REG_FAN_CMD1_PWM_POLARITY;
+		break;
+	case G762_PWM_POLARITY_NEGATIVE:
+		data->fan_cmd1 |=  G762_REG_FAN_CMD1_PWM_POLARITY;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = i2c_smbus_write_byte_data(client, G762_REG_FAN_CMD1,
+					data->fan_cmd1);
+	data->valid = false;
+ out:
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/*
+ * Set pwm value. Accepts values between 0 (stops the fan) and
+ * 255 (full speed). This only makes sense in open-loop mode.
+ */
+static int do_set_pwm(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = i2c_get_clientdata(client);
+	int ret;
+
+	if (val > 255)
+		return -EINVAL;
+
+	mutex_lock(&data->update_lock);
+	ret = i2c_smbus_write_byte_data(client, G762_REG_SET_OUT, val);
+	data->valid = false;
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/*
+ * Set fan RPM value. Can be called both in closed and open-loop mode
+ * but effect will only be seen after closed-loop mode is configured.
+ */
+static int do_set_fan_target(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+	int ret;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	data->set_cnt = cnt_from_rpm(val, data->clk_freq,
+				     G762_PULSE_FROM_REG(data->fan_cmd1),
+				     G762_CLKDIV_FROM_REG(data->fan_cmd1),
+				     G762_GEARMULT_FROM_REG(data->fan_cmd2));
+	ret = i2c_smbus_write_byte_data(client, G762_REG_SET_CNT,
+					data->set_cnt);
+	data->valid = false;
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/* Set fan startup voltage. Accepted values are either 0, 1, 2 or 3. */
+static int do_set_fan_startv(struct device *dev, unsigned long val)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+	int ret;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	switch (val) {
+	case 0:
+		data->fan_cmd2 &= ~G762_REG_FAN_CMD2_FAN_STARTV_0;
+		data->fan_cmd2 &= ~G762_REG_FAN_CMD2_FAN_STARTV_1;
+		break;
+	case 1:
+		data->fan_cmd2 |=  G762_REG_FAN_CMD2_FAN_STARTV_0;
+		data->fan_cmd2 &= ~G762_REG_FAN_CMD2_FAN_STARTV_1;
+		break;
+	case 2:
+		data->fan_cmd2 &= ~G762_REG_FAN_CMD2_FAN_STARTV_0;
+		data->fan_cmd2 |=  G762_REG_FAN_CMD2_FAN_STARTV_1;
+		break;
+	case 3:
+		data->fan_cmd2 |=  G762_REG_FAN_CMD2_FAN_STARTV_0;
+		data->fan_cmd2 |=  G762_REG_FAN_CMD2_FAN_STARTV_1;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+	ret = i2c_smbus_write_byte_data(client, G762_REG_FAN_CMD2,
+					data->fan_cmd2);
+	data->valid = false;
+ out:
+	mutex_unlock(&data->update_lock);
+
+	return ret;
+}
+
+/*
+ * Helper to import hardware characteristics from .dts file and push
+ * those to the chip.
+ */
+
+#ifdef CONFIG_OF
+static struct of_device_id g762_dt_match[] = {
+	{ .compatible = "gmt,g762" },
+	{ .compatible = "gmt,g763" },
+	{ },
+};
+
+/*
+ * Grab clock (a required property), enable it, get (fixed) clock frequency
+ * and store it. Note: upon success, clock has been prepared and enabled; it
+ * must later be unprepared and disabled (e.g. during module unloading) by a
+ * call to g762_of_clock_disable(). Note that a reference to clock is kept
+ * in our private data structure to be used in this function.
+ */
+static int g762_of_clock_enable(struct i2c_client *client)
+{
+	struct g762_data *data;
+	unsigned long clk_freq;
+	struct clk *clk;
+	int ret;
+
+	if (!client->dev.of_node)
+		return 0;
+
+	clk = of_clk_get(client->dev.of_node, 0);
+	if (IS_ERR(clk)) {
+		dev_err(&client->dev, "failed to get clock\n");
+		return PTR_ERR(clk);
+	}
+
+	ret = clk_prepare_enable(clk);
+	if (ret) {
+		dev_err(&client->dev, "failed to enable clock\n");
+		goto clk_put;
+	}
+
+	clk_freq = clk_get_rate(clk);
+	ret = do_set_clk_freq(&client->dev, clk_freq);
+	if (ret) {
+		dev_err(&client->dev, "invalid clock freq %lu\n", clk_freq);
+		goto clk_unprep;
+	}
+
+	data = i2c_get_clientdata(client);
+	data->clk = clk;
+
+	return 0;
+
+ clk_unprep:
+	clk_disable_unprepare(clk);
+
+ clk_put:
+	clk_put(clk);
+
+	return ret;
+}
+
+static void g762_of_clock_disable(struct i2c_client *client)
+{
+	struct g762_data *data = i2c_get_clientdata(client);
+
+	if (!data->clk)
+		return;
+
+	clk_disable_unprepare(data->clk);
+	clk_put(data->clk);
+}
+
+static int g762_of_prop_import_one(struct i2c_client *client,
+				   const char *pname,
+				   int (*psetter)(struct device *dev,
+						  unsigned long val))
+{
+	const __be32 *prop;
+	int len, ret;
+	u32 pval;
+
+	prop = of_get_property(client->dev.of_node, pname, &len);
+	if (!prop || len != sizeof(u32))
+		return 0;
+
+	pval = be32_to_cpu(prop[0]);
+	dev_dbg(&client->dev, "found %s (%d)\n", pname, pval);
+	ret = (*psetter)(&client->dev, pval);
+	if (ret)
+		dev_err(&client->dev, "unable to set %s (%d)\n", pname, pval);
+
+	return ret;
+}
+
+static int g762_of_prop_import(struct i2c_client *client)
+{
+	int ret;
+
+	if (!client->dev.of_node)
+		return 0;
+
+	ret = g762_of_prop_import_one(client, "fan_gear_mode",
+				      do_set_fan_gear_mode);
+	if (ret)
+		return ret;
+
+	ret = g762_of_prop_import_one(client, "pwm_polarity",
+				      do_set_pwm_polarity);
+	if (ret)
+		return ret;
+
+	return g762_of_prop_import_one(client, "fan_startv",
+				       do_set_fan_startv);
+}
+
+#else
+static int g762_of_prop_import(struct i2c_client *client)
+{
+	return 0;
+}
+
+static int g762_of_clock_enable(struct i2c_client *client)
+{
+	return 0;
+}
+
+static void g762_of_clock_disable(struct i2c_client *client) { }
+#endif
+
+/*
+ * Helper to import hardware characteristics from .dts file and push
+ * those to the chip.
+ */
+
+static int g762_pdata_prop_import(struct i2c_client *client)
+{
+	struct g762_platform_data *pdata = client->dev.platform_data;
+	int ret;
+
+	if (!pdata)
+		return 0;
+
+	ret = do_set_fan_gear_mode(&client->dev, pdata->fan_gear_mode);
+	if (ret)
+		return ret;
+
+	ret = do_set_pwm_polarity(&client->dev, pdata->pwm_polarity);
+	if (ret)
+		return ret;
+
+	ret = do_set_fan_startv(&client->dev, pdata->fan_startv);
+	if (ret)
+		return ret;
+
+	return do_set_clk_freq(&client->dev, pdata->clk_freq);
+}
+
+/*
+ * sysfs attributes
+ */
+
+/*
+ * Read function for fan1_input sysfs file. Return current fan RPM value, or
+ * 0 if fan is out of control.
+ */
+static ssize_t get_fan_rpm(struct device *dev, struct device_attribute *da,
+			   char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+	unsigned int rpm = 0;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	/* reverse logic: fan out of control reporting is enabled low */
+	if (data->fan_sta & G762_REG_FAN_STA_OOC) {
+		rpm = rpm_from_cnt(data->act_cnt, data->clk_freq,
+				   G762_PULSE_FROM_REG(data->fan_cmd1),
+				   G762_CLKDIV_FROM_REG(data->fan_cmd1),
+				   G762_GEARMULT_FROM_REG(data->fan_cmd2));
+	}
+	mutex_unlock(&data->update_lock);
+
+	return sprintf(buf, "%u\n", rpm);
+}
+
+/*
+ * Read and write functions for pwm1_mode sysfs file. Get and set fan speed
+ * control mode i.e. PWM (1) or DC (0).
+ */
+static ssize_t get_pwm_mode(struct device *dev, struct device_attribute *da,
+			    char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	return sprintf(buf, "%d\n",
+		       !!(data->fan_cmd1 & G762_REG_FAN_CMD1_OUT_MODE));
+}
+
+static ssize_t set_pwm_mode(struct device *dev, struct device_attribute *da,
+			    const char *buf, size_t count)
+{
+	unsigned long val;
+	int ret;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	ret = do_set_pwm_mode(dev, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+/*
+ * Read and write functions for fan1_div sysfs file. Get and set fan
+ * controller prescaler value
+ */
+static ssize_t get_fan_div(struct device *dev,
+			   struct device_attribute *da, char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	return sprintf(buf, "%d\n", G762_CLKDIV_FROM_REG(data->fan_cmd1));
+}
+
+static ssize_t set_fan_div(struct device *dev,
+			   struct device_attribute *da,
+			   const char *buf, size_t count)
+{
+	unsigned long val;
+	int ret;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	ret = do_set_fan_div(dev, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+/*
+ * Read and write functions for fan1_pulses sysfs file. Get and set number
+ * of tachometer pulses per fan revolution.
+ */
+static ssize_t get_fan_pulses(struct device *dev,
+			      struct device_attribute *da, char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	return sprintf(buf, "%d\n", G762_PULSE_FROM_REG(data->fan_cmd1));
+}
+
+static ssize_t set_fan_pulses(struct device *dev,
+			      struct device_attribute *da,
+			      const char *buf, size_t count)
+{
+	unsigned long val;
+	int ret;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	ret = do_set_fan_pulses(dev, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+/*
+ * Read and write functions for pwm1_enable. Get and set fan speed control mode
+ * (i.e. closed or open-loop).
+ *
+ * Following documentation about hwmon's sysfs interface, a pwm1_enable node
+ * should accept followings:
+ *
+ *  0 : no fan speed control (i.e. fan at full speed)
+ *  1 : manual fan speed control enabled (use pwm[1-*]) (open-loop)
+ *  2+: automatic fan speed control enabled (use fan[1-*]_target) (closed-loop)
+ *
+ * but we do not accept 0 as this mode is not natively supported by the chip
+ * and it is not emulated by g762 driver. -EINVAL is returned in this case.
+ */
+static ssize_t get_pwm_enable(struct device *dev,
+			      struct device_attribute *da, char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	return sprintf(buf, "%d\n",
+		       (!!(data->fan_cmd1 & G762_REG_FAN_CMD1_FAN_MODE)) + 1);
+}
+
+static ssize_t set_pwm_enable(struct device *dev,
+			      struct device_attribute *da,
+			      const char *buf, size_t count)
+{
+	unsigned long val;
+	int ret;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	ret = do_set_pwm_enable(dev, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+/*
+ * Read and write functions for pwm1 sysfs file. Get and set pwm value
+ * (which affects fan speed) in open-loop mode. 0 stops the fan and 255
+ * makes it run at full speed.
+ */
+static ssize_t get_pwm(struct device *dev, struct device_attribute *da,
+		       char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	return sprintf(buf, "%d\n", data->set_out);
+}
+
+static ssize_t set_pwm(struct device *dev, struct device_attribute *da,
+		       const char *buf, size_t count)
+{
+	unsigned long val;
+	int ret;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	ret = do_set_pwm(dev, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+/*
+ * Read and write function for fan1_target sysfs file. Get/set the fan speed in
+ * closed-loop mode. Speed is given as a RPM value; then the chip will regulate
+ * the fan speed using pulses from fan tachometer.
+ *
+ * Refer to rpm_from_cnt() implementation above to get info about count number
+ * calculation.
+ *
+ * Also note that due to rounding errors it is possible that you don't read
+ * back exactly the value you have set.
+ */
+static ssize_t get_fan_target(struct device *dev, struct device_attribute *da,
+			      char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+	unsigned int rpm;
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	mutex_lock(&data->update_lock);
+	rpm = rpm_from_cnt(data->set_cnt, data->clk_freq,
+			   G762_PULSE_FROM_REG(data->fan_cmd1),
+			   G762_CLKDIV_FROM_REG(data->fan_cmd1),
+			   G762_GEARMULT_FROM_REG(data->fan_cmd2));
+	mutex_unlock(&data->update_lock);
+
+	return sprintf(buf, "%u\n", rpm);
+}
+
+static ssize_t set_fan_target(struct device *dev, struct device_attribute *da,
+			      const char *buf, size_t count)
+{
+	unsigned long val;
+	int ret;
+
+	if (kstrtoul(buf, 10, &val))
+		return -EINVAL;
+
+	ret = do_set_fan_target(dev, val);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+/* read function for fan1_fault sysfs file. */
+static ssize_t get_fan_failure(struct device *dev, struct device_attribute *da,
+			       char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	return sprintf(buf, "%u\n", !!(data->fan_sta & G762_REG_FAN_STA_FAIL));
+}
+
+/*
+ * read function for fan1_alarm sysfs file. Note that OOC condition is
+ * enabled low
+ */
+static ssize_t get_fan_ooc(struct device *dev, struct device_attribute *da,
+			   char *buf)
+{
+	struct g762_data *data = g762_update_client(dev);
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	return sprintf(buf, "%u\n", !(data->fan_sta & G762_REG_FAN_STA_OOC));
+}
+
+static DEVICE_ATTR(pwm1, S_IWUSR | S_IRUGO, get_pwm, set_pwm);
+static DEVICE_ATTR(pwm1_mode, S_IWUSR | S_IRUGO, get_pwm_mode, set_pwm_mode);
+static DEVICE_ATTR(pwm1_enable, S_IWUSR | S_IRUGO,
+		   get_pwm_enable, set_pwm_enable);
+static DEVICE_ATTR(fan1_input, S_IRUGO, get_fan_rpm, NULL);
+static DEVICE_ATTR(fan1_alarm, S_IRUGO, get_fan_ooc, NULL);
+static DEVICE_ATTR(fan1_fault, S_IRUGO, get_fan_failure, NULL);
+static DEVICE_ATTR(fan1_target, S_IWUSR | S_IRUGO,
+		   get_fan_target, set_fan_target);
+static DEVICE_ATTR(fan1_div, S_IWUSR | S_IRUGO, get_fan_div, set_fan_div);
+static DEVICE_ATTR(fan1_pulses, S_IWUSR | S_IRUGO,
+		   get_fan_pulses, set_fan_pulses);
+
+/* Driver data */
+static struct attribute *g762_attributes[] = {
+	&dev_attr_fan1_input.attr,
+	&dev_attr_fan1_alarm.attr,
+	&dev_attr_fan1_fault.attr,
+	&dev_attr_fan1_target.attr,
+	&dev_attr_fan1_div.attr,
+	&dev_attr_fan1_pulses.attr,
+	&dev_attr_pwm1.attr,
+	&dev_attr_pwm1_mode.attr,
+	&dev_attr_pwm1_enable.attr,
+	NULL
+};
+
+static const struct attribute_group g762_group = {
+	.attrs = g762_attributes,
+};
+
+/*
+ * Enable both fan failure detection and fan out of control protection. The
+ * function does not protect change/access to data structure; it must thus
+ * only be called during initialization.
+ */
+static inline int g762_fan_init(struct device *dev)
+{
+	struct i2c_client *client = to_i2c_client(dev);
+	struct g762_data *data = g762_update_client(dev);
+
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	data->fan_cmd1 |= G762_REG_FAN_CMD1_DET_FAN_FAIL;
+	data->fan_cmd1 |= G762_REG_FAN_CMD1_DET_FAN_OOC;
+	data->valid = false;
+
+	return i2c_smbus_write_byte_data(client, G762_REG_FAN_CMD1,
+					 data->fan_cmd1);
+}
+
+static int g762_probe(struct i2c_client *client, const struct i2c_device_id *id)
+{
+	struct g762_data *data;
+	int ret;
+
+	if (!i2c_check_functionality(client->adapter,
+				     I2C_FUNC_SMBUS_BYTE_DATA))
+		return -ENODEV;
+
+	data = devm_kzalloc(&client->dev, sizeof(struct g762_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	i2c_set_clientdata(client, data);
+	data->client = client;
+	mutex_init(&data->update_lock);
+
+	/* Enable fan failure detection and fan out of control protection */
+	ret = g762_fan_init(&client->dev);
+	if (ret)
+		return ret;
+
+	/* Get configuration via DT ... */
+	ret = g762_of_clock_enable(client);
+	if (ret)
+		return ret;
+	ret = g762_of_prop_import(client);
+	if (ret)
+		goto clock_dis;
+	/* ... or platform_data */
+	ret = g762_pdata_prop_import(client);
+	if (ret)
+		goto clock_dis;
+
+	/* Register sysfs hooks */
+	ret = sysfs_create_group(&client->dev.kobj, &g762_group);
+	if (ret)
+		goto clock_dis;
+
+	data->hwmon_dev = hwmon_device_register(&client->dev);
+	if (IS_ERR(data->hwmon_dev)) {
+		ret = PTR_ERR(data->hwmon_dev);
+		goto sysfs_rem;
+	}
+
+	return 0;
+
+ sysfs_rem:
+	sysfs_remove_group(&client->dev.kobj, &g762_group);
+
+ clock_dis:
+	g762_of_clock_disable(client);
+
+	return ret;
+}
+
+static int g762_remove(struct i2c_client *client)
+{
+	struct g762_data *data = i2c_get_clientdata(client);
+
+	hwmon_device_unregister(data->hwmon_dev);
+	sysfs_remove_group(&client->dev.kobj, &g762_group);
+	g762_of_clock_disable(client);
+
+	return 0;
+}
+
+static struct i2c_driver g762_driver = {
+	.driver = {
+		.name = DRVNAME,
+		.owner = THIS_MODULE,
+		.of_match_table = of_match_ptr(g762_dt_match),
+	},
+	.probe	  = g762_probe,
+	.remove	  = g762_remove,
+	.id_table = g762_id,
+};
+
+module_i2c_driver(g762_driver);
+
+MODULE_AUTHOR("Arnaud EBALARD <arno@natisbad.org>");
+MODULE_DESCRIPTION("GMT G762/G763 driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/platform_data/g762.h b/include/linux/platform_data/g762.h
new file mode 100644
index 000000000000..d3c51283764d
--- /dev/null
+++ b/include/linux/platform_data/g762.h
@@ -0,0 +1,37 @@
+/*
+ * Platform data structure for g762 fan controller driver
+ *
+ * Copyright (C) 2013, Arnaud EBALARD <arno@natisbad.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#ifndef __LINUX_PLATFORM_DATA_G762_H__
+#define __LINUX_PLATFORM_DATA_G762_H__
+
+/*
+ * Following structure can be used to set g762 driver platform specific data
+ * during board init. Note that passing a sparse structure is possible but
+ * will result in non-specified attributes to be set to default value, hence
+ * overloading those installed during boot (e.g. by u-boot).
+ */
+
+struct g762_platform_data {
+	u32 fan_startv;
+	u32 fan_gear_mode;
+	u32 pwm_polarity;
+	u32 clk_freq;
+};
+
+#endif /* __LINUX_PLATFORM_DATA_G762_H__ */
-- 
cgit v1.3-7-g2ca7


From 7c30ed532cf798a8d924562f2f44d03d7652f7a7 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Wed, 19 Jun 2013 10:16:55 +0530
Subject: cpufreq: make sure frequency transitions are serialized

Whenever we are changing frequency of a cpu, we are calling PRECHANGE and
POSTCHANGE notifiers. They must be serialized. i.e. PRECHANGE or POSTCHANGE
shouldn't be called twice contiguously.

This can happen due to bugs in users of __cpufreq_driver_target() or actual
cpufreq drivers who are sending these notifiers.

This patch adds some protection against this. Now, we keep track of the last
transaction and see if something went wrong.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq.c | 14 ++++++++++++++
 include/linux/cpufreq.h   |  1 +
 2 files changed, 15 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index d976e222f10f..03b3b69f64a7 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -312,6 +312,12 @@ static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
 	switch (state) {
 
 	case CPUFREQ_PRECHANGE:
+		if (WARN(policy->transition_ongoing,
+				"In middle of another frequency transition\n"))
+			return;
+
+		policy->transition_ongoing = true;
+
 		/* detect if the driver reported a value as "old frequency"
 		 * which is not equal to what the cpufreq core thinks is
 		 * "old frequency".
@@ -331,6 +337,12 @@ static void __cpufreq_notify_transition(struct cpufreq_policy *policy,
 		break;
 
 	case CPUFREQ_POSTCHANGE:
+		if (WARN(!policy->transition_ongoing,
+				"No frequency transition in progress\n"))
+			return;
+
+		policy->transition_ongoing = false;
+
 		adjust_jiffies(CPUFREQ_POSTCHANGE, freqs);
 		pr_debug("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new,
 			(unsigned long)freqs->cpu);
@@ -1539,6 +1551,8 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 
 	if (cpufreq_disabled())
 		return -ENODEV;
+	if (policy->transition_ongoing)
+		return -EBUSY;
 
 	/* Make sure that target_freq is within supported range */
 	if (target_freq > policy->max)
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 3c7ee2f90370..c0bc7374445e 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -119,6 +119,7 @@ struct cpufreq_policy {
 
 	struct kobject		kobj;
 	struct completion	kobj_unregister;
+	bool			transition_ongoing; /* Tracks transition status */
 };
 
 #define CPUFREQ_ADJUST			(0)
-- 
cgit v1.3-7-g2ca7


From f4fd3797848aa04e72e942c855fd279840a47fe4 Mon Sep 17 00:00:00 2001
From: Lan Tianyu <tianyu.lan@intel.com>
Date: Thu, 27 Jun 2013 15:08:54 +0800
Subject: acpi-cpufreq: Add new sysfs attribute freqdomain_cpus

Commits fcf8058 (cpufreq: Simplify cpufreq_add_dev()) and aa77a52
(cpufreq: acpi-cpufreq: Don't set policy->related_cpus from .init())
changed the contents of the "related_cpus" sysfs attribute on systems
where acpi-cpufreq is used and user space can't get the list of CPUs
which are in the same hardware coordination CPU domain (provided by
the ACPI AML method _PSD) via "related_cpus" any more.

To make up for that loss add a new sysfs attribute "freqdomian_cpus"
for the acpi-cpufreq driver which exposes the list of CPUs in the
same domain regardless of whether it is coordinated by hardware or
software.

[rjw: Changelog, documentation]
References: https://bugzilla.kernel.org/show_bug.cgi?id=58761
Reported-by: Jean-Philippe Halimi <jean-philippe.halimi@exascale-computing.eu>
Signed-off-by: Lan Tianyu <tianyu.lan@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/ABI/testing/sysfs-devices-system-cpu | 15 ++++++++++++++
 drivers/cpufreq/acpi-cpufreq.c                     | 23 +++++++++++++++++++++-
 drivers/cpufreq/cpufreq.c                          |  7 ++++---
 include/linux/cpufreq.h                            |  3 +++
 4 files changed, 44 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 2447698aed41..468e4d48f884 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -144,6 +144,21 @@ Description:	Discover and change clock speed of CPUs
 		to learn how to control the knobs.
 
 
+What:		/sys/devices/system/cpu/cpu#/cpufreq/freqdomain_cpus
+Date:		June 2013
+Contact:	cpufreq@vger.kernel.org
+Description:	Discover CPUs in the same CPU frequency coordination domain
+
+		freqdomain_cpus is the list of CPUs (online+offline) that share
+		the same clock/freq domain (possibly at the hardware level).
+		That information may be hidden from the cpufreq core and the
+		value of related_cpus may be different from freqdomain_cpus. This
+		attribute is useful for user space DVFS controllers to get better
+		power/performance results for platforms using acpi-cpufreq.
+
+		This file is only present if the acpi-cpufreq driver is in use.
+
+
 What:		/sys/devices/system/cpu/cpu*/cache/index3/cache_disable_{0,1}
 Date:		August 2008
 KernelVersion:	2.6.27
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 2c5906d71397..403dad646abe 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -70,6 +70,7 @@ struct acpi_cpufreq_data {
 	struct cpufreq_frequency_table *freq_table;
 	unsigned int resume;
 	unsigned int cpu_feature;
+	cpumask_var_t freqdomain_cpus;
 };
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
@@ -176,6 +177,15 @@ static struct global_attr global_boost = __ATTR(boost, 0644,
 						show_global_boost,
 						store_global_boost);
 
+static ssize_t show_freqdomain_cpus(struct cpufreq_policy *policy, char *buf)
+{
+	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
+
+	return cpufreq_show_cpus(data->freqdomain_cpus, buf);
+}
+
+cpufreq_freq_attr_ro(freqdomain_cpus);
+
 #ifdef CONFIG_X86_ACPI_CPUFREQ_CPB
 static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
 			 size_t count)
@@ -704,6 +714,11 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (!data)
 		return -ENOMEM;
 
+	if (!zalloc_cpumask_var(&data->freqdomain_cpus, GFP_KERNEL)) {
+		result = -ENOMEM;
+		goto err_free;
+	}
+
 	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
 	per_cpu(acfreq_data, cpu) = data;
 
@@ -712,7 +727,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	result = acpi_processor_register_performance(data->acpi_data, cpu);
 	if (result)
-		goto err_free;
+		goto err_free_mask;
 
 	perf = data->acpi_data;
 	policy->shared_type = perf->shared_type;
@@ -725,6 +740,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	    policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
 		cpumask_copy(policy->cpus, perf->shared_cpu_map);
 	}
+	cpumask_copy(data->freqdomain_cpus, perf->shared_cpu_map);
 
 #ifdef CONFIG_SMP
 	dmi_check_system(sw_any_bug_dmi_table);
@@ -736,6 +752,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (check_amd_hwpstate_cpu(cpu) && !acpi_pstate_strict) {
 		cpumask_clear(policy->cpus);
 		cpumask_set_cpu(cpu, policy->cpus);
+		cpumask_copy(data->freqdomain_cpus, cpu_sibling_mask(cpu));
 		policy->shared_type = CPUFREQ_SHARED_TYPE_HW;
 		pr_info_once(PFX "overriding BIOS provided _PSD data\n");
 	}
@@ -870,6 +887,8 @@ err_freqfree:
 	kfree(data->freq_table);
 err_unreg:
 	acpi_processor_unregister_performance(perf, cpu);
+err_free_mask:
+	free_cpumask_var(data->freqdomain_cpus);
 err_free:
 	kfree(data);
 	per_cpu(acfreq_data, cpu) = NULL;
@@ -888,6 +907,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
 		per_cpu(acfreq_data, policy->cpu) = NULL;
 		acpi_processor_unregister_performance(data->acpi_data,
 						      policy->cpu);
+		free_cpumask_var(data->freqdomain_cpus);
 		kfree(data->freq_table);
 		kfree(data);
 	}
@@ -908,6 +928,7 @@ static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
 
 static struct freq_attr *acpi_cpufreq_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
+	&freqdomain_cpus,
 	NULL,	/* this is a placeholder for cpb, do not remove */
 	NULL,
 };
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 03b3b69f64a7..6a015ada5285 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -585,7 +585,7 @@ out:
 	return i;
 }
 
-static ssize_t show_cpus(const struct cpumask *mask, char *buf)
+ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf)
 {
 	ssize_t i = 0;
 	unsigned int cpu;
@@ -600,6 +600,7 @@ static ssize_t show_cpus(const struct cpumask *mask, char *buf)
 	i += sprintf(&buf[i], "\n");
 	return i;
 }
+EXPORT_SYMBOL_GPL(cpufreq_show_cpus);
 
 /**
  * show_related_cpus - show the CPUs affected by each transition even if
@@ -607,7 +608,7 @@ static ssize_t show_cpus(const struct cpumask *mask, char *buf)
  */
 static ssize_t show_related_cpus(struct cpufreq_policy *policy, char *buf)
 {
-	return show_cpus(policy->related_cpus, buf);
+	return cpufreq_show_cpus(policy->related_cpus, buf);
 }
 
 /**
@@ -615,7 +616,7 @@ static ssize_t show_related_cpus(struct cpufreq_policy *policy, char *buf)
  */
 static ssize_t show_affected_cpus(struct cpufreq_policy *policy, char *buf)
 {
-	return show_cpus(policy->cpus, buf);
+	return cpufreq_show_cpus(policy->cpus, buf);
 }
 
 static ssize_t store_scaling_setspeed(struct cpufreq_policy *policy,
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index c0bc7374445e..4d7390bc1727 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -439,4 +439,7 @@ void cpufreq_frequency_table_get_attr(struct cpufreq_frequency_table *table,
 void cpufreq_frequency_table_update_policy_cpu(struct cpufreq_policy *policy);
 
 void cpufreq_frequency_table_put_attr(unsigned int cpu);
+
+ssize_t cpufreq_show_cpus(const struct cpumask *mask, char *buf);
+
 #endif /* _LINUX_CPUFREQ_H */
-- 
cgit v1.3-7-g2ca7


From 0ce6cba35777cf96a54ce0d5856dc962566b8717 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 27 Jun 2013 19:37:26 -0700
Subject: cgroup: CGRP_ROOT_SUBSYS_BOUND should be ignored when comparing mount
 options

1672d04070 ("cgroup: fix cgroupfs_root early destruction path")
introduced CGRP_ROOT_SUBSYS_BOUND which is used to mark completion of
subsys binding on a new root; however, this broke remounts.
cgroup_remount() doesn't allow changing root options via remount and
CGRP_ROOT_SUBSYS_BOUND, which is set on all fully initialized roots,
makes the function reject all remounts.

Fix it by putting the options part in the lower 16 bits of root->flags
and masking the comparions.  While at it, make cgroup_remount() emit
an error message explaining why it's rejecting a remount request, so
that it's less of a mystery.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h | 6 +++++-
 kernel/cgroup.c        | 5 ++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ad3555bc21f4..8db53974f7b5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -276,7 +276,11 @@ enum {
 
 	CGRP_ROOT_NOPREFIX	= (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR		= (1 << 2), /* supports extended attributes */
-	CGRP_ROOT_SUBSYS_BOUND	= (1 << 3), /* subsystems finished binding */
+
+	/* mount options live below bit 16 */
+	CGRP_ROOT_OPTION_MASK	= (1 << 16) - 1,
+
+	CGRP_ROOT_SUBSYS_BOUND	= (1 << 16), /* subsystems finished binding */
 };
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1b7b567208cd..5a2fcf5bcc4a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1362,8 +1362,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	removed_mask = root->subsys_mask & ~opts.subsys_mask;
 
 	/* Don't allow flags or name to change at remount */
-	if (opts.flags != root->flags ||
+	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
 	    (opts.name && strcmp(opts.name, root->name))) {
+		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
+		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
+		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
 		ret = -EINVAL;
 		goto out_unlock;
 	}
-- 
cgit v1.3-7-g2ca7


From 60545d0d4610b02e55f65d141c95b18ccf855b6e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 7 Jun 2013 01:20:27 -0400
Subject: [O_TMPFILE] it's still short a few helpers, but infrastructure should
 be OK now...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/include/uapi/asm/fcntl.h  |  1 +
 arch/parisc/include/uapi/asm/fcntl.h |  1 +
 arch/sparc/include/uapi/asm/fcntl.h  |  1 +
 fs/dcache.c                          | 16 ++++++++++
 fs/ext2/namei.c                      | 24 +++++++++++++++
 fs/minix/namei.c                     | 13 ++++++++
 fs/namei.c                           | 60 ++++++++++++++++++++++++++++++++++++
 fs/open.c                            | 14 ++++++---
 include/linux/dcache.h               |  2 ++
 include/linux/fs.h                   |  1 +
 include/uapi/asm-generic/fcntl.h     |  4 +++
 mm/shmem.c                           | 32 +++++++++++++++++++
 12 files changed, 164 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/include/uapi/asm/fcntl.h b/arch/alpha/include/uapi/asm/fcntl.h
index 6d9e805f18a7..dfdadb0b4bef 100644
--- a/arch/alpha/include/uapi/asm/fcntl.h
+++ b/arch/alpha/include/uapi/asm/fcntl.h
@@ -32,6 +32,7 @@
 #define O_SYNC		(__O_SYNC|O_DSYNC)
 
 #define O_PATH		040000000
+#define O_TMPFILE	0100000000
 
 #define F_GETLK		7
 #define F_SETLK		8
diff --git a/arch/parisc/include/uapi/asm/fcntl.h b/arch/parisc/include/uapi/asm/fcntl.h
index 0304b92ccfea..cc61c475f277 100644
--- a/arch/parisc/include/uapi/asm/fcntl.h
+++ b/arch/parisc/include/uapi/asm/fcntl.h
@@ -20,6 +20,7 @@
 #define O_INVISIBLE	004000000 /* invisible I/O, for DMAPI/XDSM */
 
 #define O_PATH		020000000
+#define O_TMPFILE	040000000
 
 #define F_GETLK64	8
 #define F_SETLK64	9
diff --git a/arch/sparc/include/uapi/asm/fcntl.h b/arch/sparc/include/uapi/asm/fcntl.h
index d0b83f66f356..d73e5e008b0d 100644
--- a/arch/sparc/include/uapi/asm/fcntl.h
+++ b/arch/sparc/include/uapi/asm/fcntl.h
@@ -35,6 +35,7 @@
 #define O_SYNC		(__O_SYNC|O_DSYNC)
 
 #define O_PATH		0x1000000
+#define O_TMPFILE	0x2000000
 
 #define F_GETOWN	5	/*  for sockets. */
 #define F_SETOWN	6	/*  for sockets. */
diff --git a/fs/dcache.c b/fs/dcache.c
index f09b9085f7d8..b7f049c31526 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2968,6 +2968,22 @@ rename_retry:
 	goto again;
 }
 
+void d_tmpfile(struct dentry *dentry, struct inode *inode)
+{
+	inode_dec_link_count(inode);
+	BUG_ON(dentry->d_name.name != dentry->d_iname ||
+		!hlist_unhashed(&dentry->d_alias) ||
+		!d_unlinked(dentry));
+	spin_lock(&dentry->d_parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	dentry->d_name.len = sprintf(dentry->d_iname, "#%llu",
+				(unsigned long long)inode->i_ino);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&dentry->d_parent->d_lock);
+	d_instantiate(dentry, inode);
+}
+EXPORT_SYMBOL(d_tmpfile);
+
 /**
  * find_inode_number - check for dentry with name
  * @dir: directory to check
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 73b0d9519836..256dd5f4c1c4 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -119,6 +119,29 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	return ext2_add_nondir(dentry, inode);
 }
 
+static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode = ext2_new_inode(dir, mode, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	inode->i_op = &ext2_file_inode_operations;
+	if (ext2_use_xip(inode->i_sb)) {
+		inode->i_mapping->a_ops = &ext2_aops_xip;
+		inode->i_fop = &ext2_xip_file_operations;
+	} else if (test_opt(inode->i_sb, NOBH)) {
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+		inode->i_fop = &ext2_file_operations;
+	} else {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_file_operations;
+	}
+	mark_inode_dirty(inode);
+	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
+	return 0;
+}
+
 static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev)
 {
 	struct inode * inode;
@@ -398,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
 #endif
 	.setattr	= ext2_setattr,
 	.get_acl	= ext2_get_acl,
+	.tmpfile	= ext2_tmpfile,
 };
 
 const struct inode_operations ext2_special_inode_operations = {
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 0db73d9dd668..cd950e2331b6 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -54,6 +54,18 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode,
 	return error;
 }
 
+static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int error;
+	struct inode *inode = minix_new_inode(dir, mode, &error);
+	if (inode) {
+		minix_set_inode(inode, 0);
+		mark_inode_dirty(inode);
+		d_tmpfile(dentry, inode);
+	}
+	return error;
+}
+
 static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		bool excl)
 {
@@ -254,4 +266,5 @@ const struct inode_operations minix_dir_inode_operations = {
 	.mknod		= minix_mknod,
 	.rename		= minix_rename,
 	.getattr	= minix_getattr,
+	.tmpfile	= minix_tmpfile,
 };
diff --git a/fs/namei.c b/fs/namei.c
index 402eda351d07..778e253e3d48 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2902,6 +2902,61 @@ stale_open:
 	goto retry_lookup;
 }
 
+static int do_tmpfile(int dfd, struct filename *pathname,
+		struct nameidata *nd, int flags,
+		const struct open_flags *op,
+		struct file *file, int *opened)
+{
+	static const struct qstr name = QSTR_INIT("/", 1);
+	struct dentry *dentry, *child;
+	struct inode *dir;
+	int error = path_lookupat(dfd, pathname->name,
+				  flags | LOOKUP_DIRECTORY, nd);
+	if (unlikely(error))
+		return error;
+	error = mnt_want_write(nd->path.mnt);
+	if (unlikely(error))
+		goto out;
+	/* we want directory to be writable */
+	error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC);
+	if (error)
+		goto out2;
+	dentry = nd->path.dentry;
+	dir = dentry->d_inode;
+	if (!dir->i_op->tmpfile) {
+		error = -EOPNOTSUPP;
+		goto out2;
+	}
+	child = d_alloc(dentry, &name);
+	if (unlikely(!child)) {
+		error = -ENOMEM;
+		goto out2;
+	}
+	nd->flags &= ~LOOKUP_DIRECTORY;
+	nd->flags |= op->intent;
+	dput(nd->path.dentry);
+	nd->path.dentry = child;
+	error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode);
+	if (error)
+		goto out2;
+	audit_inode(pathname, nd->path.dentry, 0);
+	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	if (error)
+		goto out2;
+	file->f_path.mnt = nd->path.mnt;
+	error = finish_open(file, nd->path.dentry, NULL, opened);
+	if (error)
+		goto out2;
+	error = open_check_o_direct(file);
+	if (error)
+		fput(file);
+out2:
+	mnt_drop_write(nd->path.mnt);
+out:
+	path_put(&nd->path);
+	return error;
+}
+
 static struct file *path_openat(int dfd, struct filename *pathname,
 		struct nameidata *nd, const struct open_flags *op, int flags)
 {
@@ -2917,6 +2972,11 @@ static struct file *path_openat(int dfd, struct filename *pathname,
 
 	file->f_flags = op->open_flag;
 
+	if (unlikely(file->f_flags & O_TMPFILE)) {
+		error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened);
+		goto out;
+	}
+
 	error = path_init(dfd, pathname->name, flags | LOOKUP_PARENT, nd, &base);
 	if (unlikely(error))
 		goto out;
diff --git a/fs/open.c b/fs/open.c
index 5a40a4a51757..fca72c4d3f17 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -840,11 +840,15 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 	if (flags & __O_SYNC)
 		flags |= O_DSYNC;
 
-	/*
-	 * If we have O_PATH in the open flag. Then we
-	 * cannot have anything other than the below set of flags
-	 */
-	if (flags & O_PATH) {
+	if (flags & O_TMPFILE) {
+		if (!(flags & O_CREAT))
+			return -EINVAL;
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+	} else if (flags & O_PATH) {
+		/*
+		 * If we have O_PATH in the open flag. Then we
+		 * cannot have anything other than the below set of flags
+		 */
 		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
 		acc_mode = 0;
 	} else {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 1a6bb81f0fe5..86da7595ba31 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -246,6 +246,8 @@ extern struct dentry * d_make_root(struct inode *);
 /* <clickety>-<click> the ramfs-type tree */
 extern void d_genocide(struct dentry *);
 
+extern void d_tmpfile(struct dentry *, struct inode *);
+
 extern struct dentry *d_find_alias(struct inode *);
 extern void d_prune_aliases(struct inode *);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7c30e3a62baf..dd6615f0fd13 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1580,6 +1580,7 @@ struct inode_operations {
 	int (*atomic_open)(struct inode *, struct dentry *,
 			   struct file *, unsigned open_flag,
 			   umode_t create_mode, int *opened);
+	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
 } ____cacheline_aligned;
 
 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index a48937d4a5ea..06632beaa6d5 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -84,6 +84,10 @@
 #define O_PATH		010000000
 #endif
 
+#ifndef O_TMPFILE
+#define O_TMPFILE	020000000
+#endif
+
 #ifndef O_NDELAY
 #define O_NDELAY	O_NONBLOCK
 #endif
diff --git a/mm/shmem.c b/mm/shmem.c
index 5e6a8422658b..f887358dabc5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1965,6 +1965,37 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 	return error;
 }
 
+static int
+shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	struct inode *inode;
+	int error = -ENOSPC;
+
+	inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
+	if (inode) {
+		error = security_inode_init_security(inode, dir,
+						     NULL,
+						     shmem_initxattrs, NULL);
+		if (error) {
+			if (error != -EOPNOTSUPP) {
+				iput(inode);
+				return error;
+			}
+		}
+#ifdef CONFIG_TMPFS_POSIX_ACL
+		error = generic_acl_init(inode, dir);
+		if (error) {
+			iput(inode);
+			return error;
+		}
+#else
+		error = 0;
+#endif
+		d_tmpfile(dentry, inode);
+	}
+	return error;
+}
+
 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int error;
@@ -2723,6 +2754,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
 	.rmdir		= shmem_rmdir,
 	.mknod		= shmem_mknod,
 	.rename		= shmem_rename,
+	.tmpfile	= shmem_tmpfile,
 #endif
 #ifdef CONFIG_TMPFS_XATTR
 	.setxattr	= shmem_setxattr,
-- 
cgit v1.3-7-g2ca7


From f4e0c30c191f87851c4a53454abb55ee276f4a7e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 11 Jun 2013 08:34:36 +0400
Subject: allow the temp files created by open() to be linked to

O_TMPFILE | O_CREAT => linkat() with AT_SYMLINK_FOLLOW and /proc/self/fd/<n>
as oldpath (i.e. flink()) will create a link
O_TMPFILE | O_CREAT | O_EXCL => ENOENT on attempt to link those guys

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/inode.c         |  4 +++-
 fs/namei.c         | 16 ++++++++++++++--
 include/linux/fs.h |  1 +
 3 files changed, 18 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 00d5fc3b86e1..d6dfb09c8280 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -333,8 +333,10 @@ EXPORT_SYMBOL(set_nlink);
  */
 void inc_nlink(struct inode *inode)
 {
-	if (WARN_ON(inode->i_nlink == 0))
+	if (unlikely(inode->i_nlink == 0)) {
+		WARN_ON(!(inode->i_state & I_LINKABLE));
 		atomic_long_dec(&inode->i_sb->s_remove_count);
+	}
 
 	inode->__i_nlink++;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 778e253e3d48..66998b06d822 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2948,8 +2948,14 @@ static int do_tmpfile(int dfd, struct filename *pathname,
 	if (error)
 		goto out2;
 	error = open_check_o_direct(file);
-	if (error)
+	if (error) {
 		fput(file);
+	} else if (!(op->open_flag & O_EXCL)) {
+		struct inode *inode = file_inode(file);
+		spin_lock(&inode->i_lock);
+		inode->i_state |= I_LINKABLE;
+		spin_unlock(&inode->i_lock);
+	}
 out2:
 	mnt_drop_write(nd->path.mnt);
 out:
@@ -3628,12 +3634,18 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 
 	mutex_lock(&inode->i_mutex);
 	/* Make sure we don't allow creating hardlink to an unlinked file */
-	if (inode->i_nlink == 0)
+	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
 		error =  -ENOENT;
 	else if (max_links && inode->i_nlink >= max_links)
 		error = -EMLINK;
 	else
 		error = dir->i_op->link(old_dentry, dir, new_dentry);
+
+	if (!error && (inode->i_state & I_LINKABLE)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_LINKABLE;
+		spin_unlock(&inode->i_lock);
+	}
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
 		fsnotify_link(dir, inode, new_dentry);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd6615f0fd13..ab11c44b0697 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1744,6 +1744,7 @@ struct super_operations {
 #define I_REFERENCED		(1 << 8)
 #define __I_DIO_WAKEUP		9
 #define I_DIO_WAKEUP		(1 << I_DIO_WAKEUP)
+#define I_LINKABLE		(1 << 10)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
-- 
cgit v1.3-7-g2ca7


From c77cecee52e9b599da1f8ffd9170d4374c99a345 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 13 Jun 2013 23:37:49 +0100
Subject: Replace a bunch of file->dentry->d_inode refs with file_inode()

Replace a bunch of file->dentry->d_inode refs with file_inode().

In __fput(), use file->f_inode instead so as not to be affected by any tricks
that file_inode() might grow.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c                   | 2 +-
 include/linux/fsnotify.h          | 8 ++++----
 security/integrity/ima/ima_main.c | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index 485dc0eddd67..08e719b884ca 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -227,7 +227,7 @@ static void __fput(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct vfsmount *mnt = file->f_path.mnt;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file->f_inode;
 
 	might_sleep();
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index a78680a92dba..1c804b057fb1 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -38,7 +38,7 @@ static inline int fsnotify_parent(struct path *path, struct dentry *dentry, __u3
 static inline int fsnotify_perm(struct file *file, int mask)
 {
 	struct path *path = &file->f_path;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	__u32 fsnotify_mask = 0;
 	int ret;
 
@@ -192,7 +192,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 static inline void fsnotify_access(struct file *file)
 {
 	struct path *path = &file->f_path;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	__u32 mask = FS_ACCESS;
 
 	if (S_ISDIR(inode->i_mode))
@@ -210,7 +210,7 @@ static inline void fsnotify_access(struct file *file)
 static inline void fsnotify_modify(struct file *file)
 {
 	struct path *path = &file->f_path;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	__u32 mask = FS_MODIFY;
 
 	if (S_ISDIR(inode->i_mode))
@@ -228,7 +228,7 @@ static inline void fsnotify_modify(struct file *file)
 static inline void fsnotify_open(struct file *file)
 {
 	struct path *path = &file->f_path;
-	struct inode *inode = path->dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 6c491a63128e..e9508d5bbfcf 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -57,7 +57,7 @@ __setup("ima_hash=", hash_setup);
 static void ima_rdwr_violation_check(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = file_inode(file);
 	fmode_t mode = file->f_mode;
 	int must_measure;
 	bool send_tomtou = false, send_writers = false;
-- 
cgit v1.3-7-g2ca7


From 0b3fca1fd1499f0f5a7486d494f96538f2b7e5b9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 15 Jun 2013 11:37:47 +0400
Subject: kill find_inode_number()

the only remaining caller (in ncpfs) is guaranteed to return 0 -
we only hit it if we'd just checked that there's no dentry with
such name.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c        | 29 -----------------------------
 fs/ncpfs/dir.c     |  2 --
 include/linux/fs.h |  1 -
 3 files changed, 32 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index b7f049c31526..b692c7e097c5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2984,35 +2984,6 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode)
 }
 EXPORT_SYMBOL(d_tmpfile);
 
-/**
- * find_inode_number - check for dentry with name
- * @dir: directory to check
- * @name: Name to find.
- *
- * Check whether a dentry already exists for the given name,
- * and return the inode number if it has an inode. Otherwise
- * 0 is returned.
- *
- * This routine is used to post-process directory listings for
- * filesystems using synthetic inode numbers, and is necessary
- * to keep getcwd() working.
- */
- 
-ino_t find_inode_number(struct dentry *dir, struct qstr *name)
-{
-	struct dentry * dentry;
-	ino_t ino = 0;
-
-	dentry = d_hash_and_lookup(dir, name);
-	if (!IS_ERR_OR_NULL(dentry)) {
-		if (dentry->d_inode)
-			ino = dentry->d_inode->i_ino;
-		dput(dentry);
-	}
-	return ino;
-}
-EXPORT_SYMBOL(find_inode_number);
-
 static __initdata unsigned long dhash_entries;
 static int __init set_dhash_entries(char *str)
 {
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index e5d488530580..3bc105d36f10 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -659,8 +659,6 @@ end_advance:
 	if (!valid)
 		ctl.valid = 0;
 	if (!ctl.filled && (ctl.fpos == ctx->pos)) {
-		if (!ino)
-			ino = find_inode_number(dentry, &qname);
 		if (!ino)
 			ino = iunique(dir->i_sb, 2);
 		ctl.filled = !dir_emit(ctx, qname.name, qname.len,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ab11c44b0697..1db01c13ddce 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2311,7 +2311,6 @@ extern struct file * open_exec(const char *);
 /* fs/dcache.c -- generic fs support functions */
 extern int is_subdir(struct dentry *, struct dentry *);
 extern int path_is_under(struct path *, struct path *);
-extern ino_t find_inode_number(struct dentry *, struct qstr *);
 
 #include <linux/err.h>
 
-- 
cgit v1.3-7-g2ca7


From 1bf9d14dff4a2c4de6152c6f751bdaf6896b68bb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 16 Jun 2013 20:27:42 +0400
Subject: new helper: fixed_size_llseek()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/read_write.c    | 20 ++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 22 insertions(+)

(limited to 'include/linux')

diff --git a/fs/read_write.c b/fs/read_write.c
index f646c8b565b9..782dfc3acebc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -144,6 +144,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
 }
 EXPORT_SYMBOL(generic_file_llseek);
 
+/**
+ * fixed_size_llseek - llseek implementation for fixed-sized devices
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @whence:	type of seek
+ * @size:	size of the file
+ *
+ */
+loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
+{
+	switch (whence) {
+	case SEEK_SET: case SEEK_CUR: case SEEK_END:
+		return generic_file_llseek_size(file, offset, whence,
+						size, size);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(fixed_size_llseek);
+
 /**
  * noop_llseek - No Operation Performed llseek implementation
  * @file:	file structure to seek on
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1db01c13ddce..803b7fa2520a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2428,6 +2428,8 @@ extern loff_t no_llseek(struct file *file, loff_t offset, int whence);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
 		int whence, loff_t maxsize, loff_t eof);
+extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
+		int whence, loff_t size);
 extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
 
-- 
cgit v1.3-7-g2ca7


From 68d70d03f8f5bd10a0e7337210b13f536fd4aeb9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 19 Jun 2013 15:26:04 +0400
Subject: constify rw_verify_area()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c           | 2 ++
 fs/internal.h      | 1 +
 fs/read_write.c    | 2 +-
 include/linux/fs.h | 1 -
 4 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/aio.c b/fs/aio.c
index 2bbcacf74d0c..a8ecc8313fb0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -39,6 +39,8 @@
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
 
+#include "internal.h"
+
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
diff --git a/fs/internal.h b/fs/internal.h
index f6ad34362823..7c5f01cf619d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -131,6 +131,7 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
  * read_write.c
  */
 extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
+extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
 
 /*
  * splice.c
diff --git a/fs/read_write.c b/fs/read_write.c
index 782dfc3acebc..fd72b592aa1b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -316,7 +316,7 @@ out_putf:
  * them to something that fits in "int" so that others
  * won't have to do range checks all the time.
  */
-int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
+int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
 {
 	struct inode *inode;
 	loff_t pos;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803b7fa2520a..68f10204ab29 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1898,7 +1898,6 @@ extern int current_umask(void);
 extern struct kobject *fs_kobj;
 
 #define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
-extern int rw_verify_area(int, struct file *, loff_t *, size_t);
 
 #define FLOCK_VERIFY_READ  1
 #define FLOCK_VERIFY_WRITE 2
-- 
cgit v1.3-7-g2ca7


From da53be12bbb4fabbe2e9f6f908de0cf478b5161d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 21 May 2013 15:22:44 -0700
Subject: Don't pass inode to ->d_hash() and ->d_compare()

Instances either don't look at it at all (the majority of cases) or
only want it to find the superblock (which can be had as dentry->d_sb).
A few cases that want more are actually safe with dentry->d_inode -
the only precaution needed is the check that it hadn't been replaced with
NULL by rmdir() or by overwriting rename(), which case should be simply
treated as cache miss.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking |  6 ++---
 Documentation/filesystems/vfs.txt | 19 +++++++---------
 fs/adfs/dir.c                     |  6 ++---
 fs/affs/namei.c                   | 26 +++++++--------------
 fs/cifs/dir.c                     |  9 +++-----
 fs/dcache.c                       | 27 ++++++++--------------
 fs/efivarfs/super.c               |  9 ++++----
 fs/fat/namei_msdos.c              |  6 ++---
 fs/fat/namei_vfat.c               | 12 ++++------
 fs/gfs2/dentry.c                  |  3 +--
 fs/hfs/hfs_fs.h                   |  7 ++----
 fs/hfs/string.c                   |  6 ++---
 fs/hfsplus/hfsplus_fs.h           |  7 ++----
 fs/hfsplus/unicode.c              |  7 ++----
 fs/hpfs/dentry.c                  |  7 ++----
 fs/isofs/inode.c                  | 48 +++++++++++++--------------------------
 fs/isofs/namei.c                  |  3 +--
 fs/jfs/namei.c                    |  7 ++----
 fs/namei.c                        |  7 +++---
 fs/ncpfs/dir.c                    | 32 +++++++++++++++++++-------
 fs/proc/proc_sysctl.c             |  7 +++---
 fs/sysv/namei.c                   |  3 +--
 include/linux/dcache.h            |  9 +++-----
 23 files changed, 108 insertions(+), 165 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index bdd82b2339d9..f94a362f408e 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -11,10 +11,8 @@ be able to use diff(1).
 prototypes:
 	int (*d_revalidate)(struct dentry *, unsigned int);
 	int (*d_weak_revalidate)(struct dentry *, unsigned int);
-	int (*d_hash)(const struct dentry *, const struct inode *,
-			struct qstr *);
-	int (*d_compare)(const struct dentry *, const struct inode *,
-			const struct dentry *, const struct inode *,
+	int (*d_hash)(const struct dentry *, struct qstr *);
+	int (*d_compare)(const struct dentry *, const struct dentry *,
 			unsigned int, const char *, const struct qstr *);
 	int (*d_delete)(struct dentry *);
 	void (*d_release)(struct dentry *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 4a35f6614a66..51ba44e3fc40 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -901,10 +901,8 @@ defined:
 struct dentry_operations {
 	int (*d_revalidate)(struct dentry *, unsigned int);
 	int (*d_weak_revalidate)(struct dentry *, unsigned int);
-	int (*d_hash)(const struct dentry *, const struct inode *,
-			struct qstr *);
-	int (*d_compare)(const struct dentry *, const struct inode *,
-			const struct dentry *, const struct inode *,
+	int (*d_hash)(const struct dentry *, struct qstr *);
+	int (*d_compare)(const struct dentry *, const struct dentry *,
 			unsigned int, const char *, const struct qstr *);
 	int (*d_delete)(const struct dentry *);
 	void (*d_release)(struct dentry *);
@@ -949,25 +947,24 @@ struct dentry_operations {
 
   d_hash: called when the VFS adds a dentry to the hash table. The first
 	dentry passed to d_hash is the parent directory that the name is
-	to be hashed into. The inode is the dentry's inode.
+	to be hashed into.
 
 	Same locking and synchronisation rules as d_compare regarding
 	what is safe to dereference etc.
 
   d_compare: called to compare a dentry name with a given name. The first
 	dentry is the parent of the dentry to be compared, the second is
-	the parent's inode, then the dentry and inode (may be NULL) of the
-	child dentry. len and name string are properties of the dentry to be
-	compared. qstr is the name to compare it with.
+	the child dentry. len and name string are properties of the dentry
+	to be compared. qstr is the name to compare it with.
 
 	Must be constant and idempotent, and should not take locks if
-	possible, and should not or store into the dentry or inodes.
-	Should not dereference pointers outside the dentry or inodes without
+	possible, and should not or store into the dentry.
+	Should not dereference pointers outside the dentry without
 	lots of care (eg.  d_parent, d_inode, d_name should not be used).
 
 	However, our vfsmount is pinned, and RCU held, so the dentries and
 	inodes won't disappear, neither will our sb or filesystem module.
-	->i_sb and ->d_sb may be used.
+	->d_sb may be used.
 
 	It is a tricky calling convention because it needs to be called under
 	"rcu-walk", ie. without any locks or references on things.
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index ade28bb058e3..0d138c0de293 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -191,8 +191,7 @@ const struct file_operations adfs_dir_operations = {
 };
 
 static int
-adfs_hash(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr)
+adfs_hash(const struct dentry *parent, struct qstr *qstr)
 {
 	const unsigned int name_len = ADFS_SB(parent->d_sb)->s_namelen;
 	const unsigned char *name;
@@ -228,8 +227,7 @@ adfs_hash(const struct dentry *parent, const struct inode *inode,
  * requirements of the underlying filesystem.
  */
 static int
-adfs_compare(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+adfs_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index ff65884a7839..c36cbb4537a2 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -13,18 +13,12 @@
 typedef int (*toupper_t)(int);
 
 static int	 affs_toupper(int ch);
-static int	 affs_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int       affs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int	 affs_hash_dentry(const struct dentry *, struct qstr *);
+static int       affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int	 affs_intl_toupper(int ch);
-static int	 affs_intl_hash_dentry(const struct dentry *,
-		const struct inode *, struct qstr *);
-static int       affs_intl_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int	 affs_intl_hash_dentry(const struct dentry *, struct qstr *);
+static int       affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 const struct dentry_operations affs_dentry_operations = {
@@ -86,14 +80,12 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
 }
 
 static int
-affs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_toupper);
 }
 static int
-affs_intl_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	return __affs_hash_dentry(qstr, affs_intl_toupper);
 }
@@ -131,15 +123,13 @@ static inline int __affs_compare_dentry(unsigned int len,
 }
 
 static int
-affs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_toupper);
 }
 static int
-affs_intl_compare_dentry(const struct dentry *parent,const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return __affs_compare_dentry(len, str, name, affs_intl_toupper);
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5699b5036ed8..5175aebf6737 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -822,8 +822,7 @@ const struct dentry_operations cifs_dentry_ops = {
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
 
-static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *q)
+static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
@@ -838,12 +837,10 @@ static int cifs_ci_hash(const struct dentry *dentry, const struct inode *inode,
 	return 0;
 }
 
-static int cifs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int cifs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
-	struct nls_table *codepage = CIFS_SB(pinode->i_sb)->local_nls;
+	struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
 
 	if ((name->len == len) &&
 	    (nls_strnicmp(codepage, name->name, str, len) == 0))
diff --git a/fs/dcache.c b/fs/dcache.c
index b692c7e097c5..3199fe6863a8 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1723,7 +1723,7 @@ EXPORT_SYMBOL(d_add_ci);
  * Do the slow-case of the dentry name compare.
  *
  * Unlike the dentry_cmp() function, we need to atomically
- * load the name, length and inode information, so that the
+ * load the name and length information, so that the
  * filesystem can rely on them, and can use the 'name' and
  * 'len' information without worrying about walking off the
  * end of memory etc.
@@ -1741,22 +1741,18 @@ enum slow_d_compare {
 
 static noinline enum slow_d_compare slow_dentry_cmp(
 		const struct dentry *parent,
-		struct inode *inode,
 		struct dentry *dentry,
 		unsigned int seq,
 		const struct qstr *name)
 {
 	int tlen = dentry->d_name.len;
 	const char *tname = dentry->d_name.name;
-	struct inode *i = dentry->d_inode;
 
 	if (read_seqcount_retry(&dentry->d_seq, seq)) {
 		cpu_relax();
 		return D_COMP_SEQRETRY;
 	}
-	if (parent->d_op->d_compare(parent, inode,
-				dentry, i,
-				tlen, tname, name))
+	if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
 		return D_COMP_NOMATCH;
 	return D_COMP_OK;
 }
@@ -1766,7 +1762,6 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  * @parent: parent dentry
  * @name: qstr of name we wish to find
  * @seqp: returns d_seq value at the point where the dentry was found
- * @inode: returns dentry->d_inode when the inode was found valid.
  * Returns: dentry, or NULL
  *
  * __d_lookup_rcu is the dcache lookup function for rcu-walk name
@@ -1793,7 +1788,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  */
 struct dentry *__d_lookup_rcu(const struct dentry *parent,
 				const struct qstr *name,
-				unsigned *seqp, struct inode *inode)
+				unsigned *seqp)
 {
 	u64 hashlen = name->hash_len;
 	const unsigned char *str = name->name;
@@ -1827,11 +1822,10 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
 seqretry:
 		/*
 		 * The dentry sequence count protects us from concurrent
-		 * renames, and thus protects inode, parent and name fields.
+		 * renames, and thus protects parent and name fields.
 		 *
 		 * The caller must perform a seqcount check in order
-		 * to do anything useful with the returned dentry,
-		 * including using the 'd_inode' pointer.
+		 * to do anything useful with the returned dentry.
 		 *
 		 * NOTE! We do a "raw" seqcount_begin here. That means that
 		 * we don't wait for the sequence count to stabilize if it
@@ -1845,12 +1839,12 @@ seqretry:
 			continue;
 		if (d_unhashed(dentry))
 			continue;
-		*seqp = seq;
 
 		if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
 			if (dentry->d_name.hash != hashlen_hash(hashlen))
 				continue;
-			switch (slow_dentry_cmp(parent, inode, dentry, seq, name)) {
+			*seqp = seq;
+			switch (slow_dentry_cmp(parent, dentry, seq, name)) {
 			case D_COMP_OK:
 				return dentry;
 			case D_COMP_NOMATCH:
@@ -1862,6 +1856,7 @@ seqretry:
 
 		if (dentry->d_name.hash_len != hashlen)
 			continue;
+		*seqp = seq;
 		if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
 			return dentry;
 	}
@@ -1959,9 +1954,7 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
 		if (parent->d_flags & DCACHE_OP_COMPARE) {
 			int tlen = dentry->d_name.len;
 			const char *tname = dentry->d_name.name;
-			if (parent->d_op->d_compare(parent, parent->d_inode,
-						dentry, dentry->d_inode,
-						tlen, tname, name))
+			if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
 				goto next;
 		} else {
 			if (dentry->d_name.len != len)
@@ -1998,7 +1991,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 	 */
 	name->hash = full_name_hash(name->name, name->len);
 	if (dir->d_flags & DCACHE_OP_HASH) {
-		int err = dir->d_op->d_hash(dir, dir->d_inode, name);
+		int err = dir->d_op->d_hash(dir, name);
 		if (unlikely(err < 0))
 			return ERR_PTR(err);
 	}
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 141aee31884f..a8766b880c07 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -45,8 +45,8 @@ static struct super_block *efivarfs_sb;
  * So we need to perform a case-sensitive match on part 1 and a
  * case-insensitive match on part 2.
  */
-static int efivarfs_d_compare(const struct dentry *parent, const struct inode *pinode,
-			      const struct dentry *dentry, const struct inode *inode,
+static int efivarfs_d_compare(const struct dentry *parent,
+			      const struct dentry *dentry,
 			      unsigned int len, const char *str,
 			      const struct qstr *name)
 {
@@ -63,8 +63,7 @@ static int efivarfs_d_compare(const struct dentry *parent, const struct inode *p
 	return strncasecmp(name->name + guid, str + guid, EFI_VARIABLE_GUID_LEN);
 }
 
-static int efivarfs_d_hash(const struct dentry *dentry,
-			   const struct inode *inode, struct qstr *qstr)
+static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	unsigned long hash = init_name_hash();
 	const unsigned char *s = qstr->name;
@@ -108,7 +107,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
 	q.name = name;
 	q.len = strlen(name);
 
-	err = efivarfs_d_hash(NULL, NULL, &q);
+	err = efivarfs_d_hash(NULL, &q);
 	if (err)
 		return ERR_PTR(err);
 
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 081b759cff83..a783b0e1272a 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -148,8 +148,7 @@ static int msdos_find(struct inode *dir, const unsigned char *name, int len,
  * that the existing dentry can be used. The msdos fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
-	       struct qstr *qstr)
+static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	struct fat_mount_options *options = &MSDOS_SB(dentry->d_sb)->options;
 	unsigned char msdos_name[MSDOS_NAME];
@@ -165,8 +164,7 @@ static int msdos_hash(const struct dentry *dentry, const struct inode *inode,
  * Compare two msdos names. If either of the names are invalid,
  * we fall back to doing the standard name comparison.
  */
-static int msdos_cmp(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int msdos_cmp(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct fat_mount_options *options = &MSDOS_SB(parent->d_sb)->options;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 2da952036a3d..6df8d3d885e5 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,8 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
  * that the existing dentry can be used. The vfat fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
 	return 0;
@@ -120,8 +119,7 @@ static int vfat_hash(const struct dentry *dentry, const struct inode *inode,
  * that the existing dentry can be used. The vfat fs routines will
  * return ENOENT or EINVAL as appropriate.
  */
-static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
 {
 	struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
 	const unsigned char *name;
@@ -142,8 +140,7 @@ static int vfat_hashi(const struct dentry *dentry, const struct inode *inode,
 /*
  * Case insensitive compare of two vfat names.
  */
-static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int vfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct nls_table *t = MSDOS_SB(parent->d_sb)->nls_io;
@@ -162,8 +159,7 @@ static int vfat_cmpi(const struct dentry *parent, const struct inode *pinode,
 /*
  * Case sensitive compare of two vfat names.
  */
-static int vfat_cmp(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int vfat_cmp(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	unsigned int alen, blen;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4fddb3c22d25..f2448ab2aac5 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -109,8 +109,7 @@ fail:
 	return 0;
 }
 
-static int gfs2_dhash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *str)
+static int gfs2_dhash(const struct dentry *dentry, struct qstr *str)
 {
 	str->hash = gfs2_disk_hash(str->name, str->len);
 	return 0;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index a73b11839a41..0524cda47a6e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -229,13 +229,10 @@ extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 /* string.c */
 extern const struct dentry_operations hfs_dentry_operations;
 
-extern int hfs_hash_dentry(const struct dentry *, const struct inode *,
-		struct qstr *);
+extern int hfs_hash_dentry(const struct dentry *, struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
 		      const unsigned char *, unsigned int);
-extern int hfs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+extern int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 /* trans.c */
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 495a976a3cc9..85b610c3909f 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -51,8 +51,7 @@ static unsigned char caseorder[256] = {
 /*
  * Hash a string to an integer in a case-independent way
  */
-int hfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *this)
+int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
 {
 	const unsigned char *name = this->name;
 	unsigned int hash, len = this->len;
@@ -93,8 +92,7 @@ int hfs_strcmp(const unsigned char *s1, unsigned int len1,
  * Test for equality of two strings in the HFS filename character ordering.
  * return 1 on failure and 0 on success
  */
-int hfs_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+int hfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	const unsigned char *n1, *n2;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 60b0a3388b26..ede79317cfb8 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -495,11 +495,8 @@ int hfsplus_uni2asc(struct super_block *,
 		const struct hfsplus_unistr *, char *, int *);
 int hfsplus_asc2uni(struct super_block *,
 		struct hfsplus_unistr *, int, const char *, int);
-int hfsplus_hash_dentry(const struct dentry *dentry,
-		const struct inode *inode, struct qstr *str);
-int hfsplus_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
+int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 /* wrapper.c */
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 2c2e47dcfdd8..e8ef121a4d8b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -334,8 +334,7 @@ int hfsplus_asc2uni(struct super_block *sb,
  * Composed unicode characters are decomposed and case-folding is performed
  * if the appropriate bits are (un)set on the superblock.
  */
-int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *str)
+int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
 {
 	struct super_block *sb = dentry->d_sb;
 	const char *astr;
@@ -386,9 +385,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, const struct inode *inode,
  * Composed unicode characters are decomposed and case-folding is performed
  * if the appropriate bits are (un)set on the superblock.
  */
-int hfsplus_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct super_block *sb = parent->d_sb;
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 05d4816e4e77..fa27980f2229 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -12,8 +12,7 @@
  * Note: the dentry argument is the parent dentry.
  */
 
-static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
 {
 	unsigned long	 hash;
 	int		 i;
@@ -35,9 +34,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, const struct inode *ino
 	return 0;
 }
 
-static int hpfs_compare_dentry(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int hpfs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	unsigned al = len;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d9b8aebdeb22..c348d6d88624 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -28,31 +28,23 @@
 
 #define BEQUIET
 
-static int isofs_hashi(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr);
-static int isofs_hash(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr);
+static int isofs_hashi(const struct dentry *parent, struct qstr *qstr);
+static int isofs_hash(const struct dentry *parent, struct qstr *qstr);
 static int isofs_dentry_cmpi(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int isofs_dentry_cmp(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 
 #ifdef CONFIG_JOLIET
-static int isofs_hashi_ms(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr);
-static int isofs_hash_ms(const struct dentry *parent, const struct inode *inode,
-		struct qstr *qstr);
+static int isofs_hashi_ms(const struct dentry *parent, struct qstr *qstr);
+static int isofs_hash_ms(const struct dentry *parent, struct qstr *qstr);
 static int isofs_dentry_cmpi_ms(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 static int isofs_dentry_cmp_ms(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+		const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name);
 #endif
 
@@ -265,30 +257,26 @@ static int isofs_dentry_cmp_common(
 }
 
 static int
-isofs_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+isofs_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hash_common(dentry, qstr, 0);
 }
 
 static int
-isofs_hashi(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hashi_common(dentry, qstr, 0);
 }
 
 static int
-isofs_dentry_cmp(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmp(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return isofs_dentry_cmp_common(len, str, name, 0, 0);
 }
 
 static int
-isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return isofs_dentry_cmp_common(len, str, name, 0, 1);
@@ -296,30 +284,26 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct inode *pinode,
 
 #ifdef CONFIG_JOLIET
 static int
-isofs_hash_ms(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hash_common(dentry, qstr, 1);
 }
 
 static int
-isofs_hashi_ms(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
 {
 	return isofs_hashi_common(dentry, qstr, 1);
 }
 
 static int
-isofs_dentry_cmp_ms(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmp_ms(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return isofs_dentry_cmp_common(len, str, name, 1, 0);
 }
 
 static int
-isofs_dentry_cmpi_ms(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+isofs_dentry_cmpi_ms(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	return isofs_dentry_cmp_common(len, str, name, 1, 1);
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index c167028844ed..95295640d9c8 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -37,8 +37,7 @@ isofs_cmp(struct dentry *dentry, const char *compare, int dlen)
 
 	qstr.name = compare;
 	qstr.len = dlen;
-	return dentry->d_op->d_compare(NULL, NULL, NULL, NULL,
-			dentry->d_name.len, dentry->d_name.name, &qstr);
+	return dentry->d_op->d_compare(NULL, NULL, dentry->d_name.len, dentry->d_name.name, &qstr);
 }
 
 /*
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 89186b7b9002..8b19027291d6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1538,8 +1538,7 @@ const struct file_operations jfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
-		struct qstr *this)
+static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
 {
 	unsigned long hash;
 	int i;
@@ -1552,9 +1551,7 @@ static int jfs_ci_hash(const struct dentry *dir, const struct inode *inode,
 	return 0;
 }
 
-static int jfs_ci_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int jfs_ci_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	int i, result = 1;
diff --git a/fs/namei.c b/fs/namei.c
index 66998b06d822..b2beee7a733f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1352,7 +1352,7 @@ static int lookup_fast(struct nameidata *nd,
 	 */
 	if (nd->flags & LOOKUP_RCU) {
 		unsigned seq;
-		dentry = __d_lookup_rcu(parent, &nd->last, &seq, nd->inode);
+		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
 		if (!dentry)
 			goto unlazy;
 
@@ -1787,8 +1787,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			struct dentry *parent = nd->path.dentry;
 			nd->flags &= ~LOOKUP_JUMPED;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-				err = parent->d_op->d_hash(parent, nd->inode,
-							   &this);
+				err = parent->d_op->d_hash(parent, &this);
 				if (err < 0)
 					break;
 			}
@@ -2121,7 +2120,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	 * to use its own hash..
 	 */
 	if (base->d_flags & DCACHE_OP_HASH) {
-		int err = base->d_op->d_hash(base, base->d_inode, &this);
+		int err = base->d_op->d_hash(base, &this);
 		if (err < 0)
 			return ERR_PTR(err);
 	}
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 3bc105d36f10..3be047474bfc 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -73,10 +73,8 @@ const struct inode_operations ncp_dir_inode_operations =
  * Dentry operations routines
  */
 static int ncp_lookup_validate(struct dentry *, unsigned int);
-static int ncp_hash_dentry(const struct dentry *, const struct inode *,
-		struct qstr *);
-static int ncp_compare_dentry(const struct dentry *, const struct inode *,
-		const struct dentry *, const struct inode *,
+static int ncp_hash_dentry(const struct dentry *, struct qstr *);
+static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
 		unsigned int, const char *, const struct qstr *);
 static int ncp_delete_dentry(const struct dentry *);
 
@@ -119,11 +117,19 @@ static inline int ncp_case_sensitive(const struct inode *i)
 /*
  * Note: leave the hash unchanged if the directory
  * is case-sensitive.
+ *
+ * Accessing the parent inode can be racy under RCU pathwalking.
+ * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
+ * the callers will handle races.
  */
 static int 
-ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *this)
+ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
 {
+	struct inode *inode = ACCESS_ONCE(dentry->d_inode);
+
+	if (!inode)
+		return 0;
+
 	if (!ncp_case_sensitive(inode)) {
 		struct super_block *sb = dentry->d_sb;
 		struct nls_table *t;
@@ -140,14 +146,24 @@ ncp_hash_dentry(const struct dentry *dentry, const struct inode *inode,
 	return 0;
 }
 
+/*
+ * Accessing the parent inode can be racy under RCU pathwalking.
+ * Use ACCESS_ONCE() to make sure we use _one_ particular inode,
+ * the callers will handle races.
+ */
 static int
-ncp_compare_dentry(const struct dentry *parent, const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
+	struct inode *pinode;
+
 	if (len != name->len)
 		return 1;
 
+	pinode = ACCESS_ONCE(parent->d_inode);
+	if (!pinode)
+		return 1;
+
 	if (ncp_case_sensitive(pinode))
 		return strncmp(str, name->name, len);
 
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index f3a570e7c257..71290463a1d3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -796,15 +796,16 @@ static int sysctl_is_seen(struct ctl_table_header *p)
 	return res;
 }
 
-static int proc_sys_compare(const struct dentry *parent,
-		const struct inode *pinode,
-		const struct dentry *dentry, const struct inode *inode,
+static int proc_sys_compare(const struct dentry *parent, const struct dentry *dentry,
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct ctl_table_header *head;
+	struct inode *inode;
+
 	/* Although proc doesn't have negative dentries, rcu-walk means
 	 * that inode here can be NULL */
 	/* AV: can it, indeed? */
+	inode = ACCESS_ONCE(dentry->d_inode);
 	if (!inode)
 		return 1;
 	if (name->len != len)
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 1c0d5f264767..731b2bbcaab3 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -27,8 +27,7 @@ static int add_nondir(struct dentry *dentry, struct inode *inode)
 	return err;
 }
 
-static int sysv_hash(const struct dentry *dentry, const struct inode *inode,
-		struct qstr *qstr)
+static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
 {
 	/* Truncate the name in place, avoids having to define a compare
 	   function. */
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 86da7595ba31..f42dbe145479 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -146,10 +146,8 @@ enum dentry_d_lock_class
 struct dentry_operations {
 	int (*d_revalidate)(struct dentry *, unsigned int);
 	int (*d_weak_revalidate)(struct dentry *, unsigned int);
-	int (*d_hash)(const struct dentry *, const struct inode *,
-			struct qstr *);
-	int (*d_compare)(const struct dentry *, const struct inode *,
-			const struct dentry *, const struct inode *,
+	int (*d_hash)(const struct dentry *, struct qstr *);
+	int (*d_compare)(const struct dentry *, const struct dentry *,
 			unsigned int, const char *, const struct qstr *);
 	int (*d_delete)(const struct dentry *);
 	void (*d_release)(struct dentry *);
@@ -302,8 +300,7 @@ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);
 extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
 extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
-				const struct qstr *name,
-				unsigned *seq, struct inode *inode);
+				const struct qstr *name, unsigned *seq);
 
 /**
  * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok
-- 
cgit v1.3-7-g2ca7


From f891a29f46553a384edbaa0f6ac446b1d03bccac Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:09 -0400
Subject: locks: drop the unused filp argument to posix_unblock_lock

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/lockd/svclock.c | 2 +-
 fs/locks.c         | 4 +---
 include/linux/fs.h | 5 ++---
 3 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e703318c41df..a469098682c4 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -276,7 +276,7 @@ static int nlmsvc_unlink_block(struct nlm_block *block)
 	dprintk("lockd: unlinking block %p...\n", block);
 
 	/* Remove block from list */
-	status = posix_unblock_lock(block->b_file->f_file, &block->b_call->a_args.lock.fl);
+	status = posix_unblock_lock(&block->b_call->a_args.lock.fl);
 	nlmsvc_remove_block(block);
 	return status;
 }
diff --git a/fs/locks.c b/fs/locks.c
index cb424a4fed71..72fb2b722211 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2099,13 +2099,12 @@ void locks_remove_flock(struct file *filp)
 
 /**
  *	posix_unblock_lock - stop waiting for a file lock
- *      @filp:   how the file was opened
  *	@waiter: the lock which was waiting
  *
  *	lockd needs to block waiting for locks.
  */
 int
-posix_unblock_lock(struct file *filp, struct file_lock *waiter)
+posix_unblock_lock(struct file_lock *waiter)
 {
 	int status = 0;
 
@@ -2117,7 +2116,6 @@ posix_unblock_lock(struct file *filp, struct file_lock *waiter)
 	unlock_flocks();
 	return status;
 }
-
 EXPORT_SYMBOL(posix_unblock_lock);
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68f10204ab29..172303655702 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -994,7 +994,7 @@ extern void locks_release_private(struct file_lock *);
 extern void posix_test_lock(struct file *, struct file_lock *);
 extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
 extern int posix_lock_file_wait(struct file *, struct file_lock *);
-extern int posix_unblock_lock(struct file *, struct file_lock *);
+extern int posix_unblock_lock(struct file_lock *);
 extern int vfs_test_lock(struct file *, struct file_lock *);
 extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
 extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
@@ -1084,8 +1084,7 @@ static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl)
 	return -ENOLCK;
 }
 
-static inline int posix_unblock_lock(struct file *filp,
-				     struct file_lock *waiter)
+static inline int posix_unblock_lock(struct file_lock *waiter)
 {
 	return -ENOENT;
 }
-- 
cgit v1.3-7-g2ca7


From 1a9e64a7118c5ad13dd5119da18375a5bd45b330 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:10 -0400
Subject: cifs: use posix_unblock_lock instead of locks_delete_block

commit 66189be74 (CIFS: Fix VFS lock usage for oplocked files) exported
the locks_delete_block symbol. There's already an exported helper
function that provides this capability however, so make cifs use that
instead and turn locks_delete_block back into a static function.

Note that if fl->fl_next == NULL then this lock has already been through
locks_delete_block(), so we should be OK to ignore an ENOENT error here
and simply not retry the lock.

Cc: Pavel Shilovsky <piastryyy@gmail.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cifs/file.c     | 2 +-
 fs/locks.c         | 3 +--
 include/linux/fs.h | 5 -----
 3 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 48b29d24c9f4..1686e4085646 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -999,7 +999,7 @@ try_again:
 		rc = wait_event_interruptible(flock->fl_wait, !flock->fl_next);
 		if (!rc)
 			goto try_again;
-		locks_delete_block(flock);
+		posix_unblock_lock(flock);
 	}
 	return rc;
 }
diff --git a/fs/locks.c b/fs/locks.c
index 72fb2b722211..d732e2226f17 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -496,13 +496,12 @@ static void __locks_delete_block(struct file_lock *waiter)
 
 /*
  */
-void locks_delete_block(struct file_lock *waiter)
+static void locks_delete_block(struct file_lock *waiter)
 {
 	lock_flocks();
 	__locks_delete_block(waiter);
 	unlock_flocks();
 }
-EXPORT_SYMBOL(locks_delete_block);
 
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 172303655702..6cfc9a29a783 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1006,7 +1006,6 @@ extern int vfs_setlease(struct file *, long, struct file_lock **);
 extern int lease_modify(struct file_lock **, int);
 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
 extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
-extern void locks_delete_block(struct file_lock *waiter);
 extern void lock_flocks(void);
 extern void unlock_flocks(void);
 #else /* !CONFIG_FILE_LOCKING */
@@ -1150,10 +1149,6 @@ static inline int lock_may_write(struct inode *inode, loff_t start,
 	return 1;
 }
 
-static inline void locks_delete_block(struct file_lock *waiter)
-{
-}
-
 static inline void lock_flocks(void)
 {
 }
-- 
cgit v1.3-7-g2ca7


From 1cb360125966cb6cb594e414ea80a0154617b846 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:12 -0400
Subject: locks: comment cleanups and clarifications

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/locks.c         | 21 +++++++++++++--------
 include/linux/fs.h | 18 ++++++++++++++++++
 2 files changed, 31 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 804bb9e01a65..ddeab49fe2be 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -518,9 +518,10 @@ static void locks_insert_block(struct file_lock *blocker,
 		list_add(&waiter->fl_link, &blocked_list);
 }
 
-/* Wake up processes blocked waiting for blocker.
- * If told to wait then schedule the processes until the block list
- * is empty, otherwise empty the block list ourselves.
+/*
+ * Wake up processes blocked waiting for blocker.
+ *
+ * Must be called with the file_lock_lock held!
  */
 static void locks_wake_up_blocks(struct file_lock *blocker)
 {
@@ -806,6 +807,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	}
 
 	lock_flocks();
+	/*
+	 * New lock request. Walk all POSIX locks and look for conflicts. If
+	 * there are any, either return error or put the request on the
+	 * blocker's list of waiters and the global blocked_list.
+	 */
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
 			fl = *before;
@@ -844,7 +850,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		before = &fl->fl_next;
 	}
 
-	/* Process locks with this owner.  */
+	/* Process locks with this owner. */
 	while ((fl = *before) && posix_same_owner(request, fl)) {
 		/* Detect adjacent or overlapping regions (if same lock type)
 		 */
@@ -930,10 +936,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	}
 
 	/*
-	 * The above code only modifies existing locks in case of
-	 * merging or replacing.  If new lock(s) need to be inserted
-	 * all modifications are done bellow this, so it's safe yet to
-	 * bail out.
+	 * The above code only modifies existing locks in case of merging or
+	 * replacing. If new lock(s) need to be inserted all modifications are
+	 * done below this, so it's safe yet to bail out.
 	 */
 	error = -ENOLCK; /* "no luck" */
 	if (right && left == right && !new_fl2)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6cfc9a29a783..ed9fdaaf3223 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -926,6 +926,24 @@ int locks_in_grace(struct net *);
 /* that will die - we need it for nfs_lock_info */
 #include <linux/nfs_fs_i.h>
 
+/*
+ * struct file_lock represents a generic "file lock". It's used to represent
+ * POSIX byte range locks, BSD (flock) locks, and leases. It's important to
+ * note that the same struct is used to represent both a request for a lock and
+ * the lock itself, but the same object is never used for both.
+ *
+ * FIXME: should we create a separate "struct lock_request" to help distinguish
+ * these two uses?
+ *
+ * The i_flock list is ordered by:
+ *
+ * 1) lock type -- FL_LEASEs first, then FL_FLOCK, and finally FL_POSIX
+ * 2) lock owner
+ * 3) lock range start
+ * 4) lock range end
+ *
+ * Obviously, the last two criteria only matter for POSIX locks.
+ */
 struct file_lock {
 	struct file_lock *fl_next;	/* singly linked list for this inode  */
 	struct list_head fl_link;	/* doubly linked list of all locks */
-- 
cgit v1.3-7-g2ca7


From 1c8c601a8c0dc59fe64907dcd9d512a3d181ddc7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:15 -0400
Subject: locks: protect most of the file_lock handling with i_lock

Having a global lock that protects all of this code is a clear
scalability problem. Instead of doing that, move most of the code to be
protected by the i_lock instead. The exceptions are the global lists
that the ->fl_link sits on, and the ->fl_block list.

->fl_link is what connects these structures to the
global lists, so we must ensure that we hold those locks when iterating
over or updating these lists.

Furthermore, sound deadlock detection requires that we hold the
blocked_list state steady while checking for loops. We also must ensure
that the search and update to the list are atomic.

For the checking and insertion side of the blocked_list, push the
acquisition of the global lock into __posix_lock_file and ensure that
checking and update of the  blocked_list is done without dropping the
lock in between.

On the removal side, when waking up blocked lock waiters, take the
global lock before walking the blocked list and dequeue the waiters from
the global list prior to removal from the fl_block list.

With this, deadlock detection should be race free while we minimize
excessive file_lock_lock thrashing.

Finally, in order to avoid a lock inversion problem when handling
/proc/locks output we must ensure that manipulations of the fl_block
list are also protected by the file_lock_lock.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking |  21 +++--
 fs/afs/flock.c                    |   7 +-
 fs/ceph/locks.c                   |   2 +-
 fs/ceph/mds_client.c              |   8 +-
 fs/cifs/cifsfs.c                  |   2 +-
 fs/cifs/file.c                    |  13 +--
 fs/gfs2/file.c                    |   2 +-
 fs/lockd/svcsubs.c                |  12 +--
 fs/locks.c                        | 164 ++++++++++++++++++++++++--------------
 fs/nfs/delegation.c               |  10 +--
 fs/nfs/nfs4state.c                |   8 +-
 fs/nfsd/nfs4state.c               |   8 +-
 include/linux/fs.h                |  11 ---
 13 files changed, 155 insertions(+), 113 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f94a362f408e..c2963a74fbc3 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -342,7 +342,7 @@ prototypes:
 
 
 locking rules:
-			file_lock_lock	may block
+			inode->i_lock	may block
 fl_copy_lock:		yes		no
 fl_release_private:	maybe		no
 
@@ -355,12 +355,19 @@ prototypes:
 	int (*lm_change)(struct file_lock **, int);
 
 locking rules:
-			file_lock_lock	may block
-lm_compare_owner:	yes		no
-lm_notify:		yes		no
-lm_grant:		no		no
-lm_break:		yes		no
-lm_change		yes		no
+
+			inode->i_lock	file_lock_lock	may block
+lm_compare_owner:	yes[1]		maybe		no
+lm_notify:		yes		yes		no
+lm_grant:		no		no		no
+lm_break:		yes		no		no
+lm_change		yes		no		no
+
+[1]:	->lm_compare_owner is generally called with *an* inode->i_lock held. It
+may not be the i_lock of the inode for either file_lock being compared! This is
+the case with deadlock detection, since the code has to chase down the owners
+of locks that may be entirely unrelated to the one on which the lock is being
+acquired. When doing a search for deadlocks, the file_lock_lock is also held.
 
 --------------------------- buffer_head -----------------------------------
 prototypes:
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 2497bf306c70..a8cf2cff836c 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -252,7 +252,8 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
  */
 static int afs_do_setlk(struct file *file, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
+	struct inode *inode = file_inode(file);
+	struct afs_vnode *vnode = AFS_FS_I(inode);
 	afs_lock_type_t type;
 	struct key *key = file->private_data;
 	int ret;
@@ -273,7 +274,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	/* make sure we've got a callback on this file and that our view of the
 	 * data version is up to date */
@@ -420,7 +421,7 @@ given_lock:
 	afs_vnode_fetch_status(vnode, NULL, key);
 
 error:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	_leave(" = %d", ret);
 	return ret;
 
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ebbf680378e2..690f73f42425 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -192,7 +192,7 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 
 /**
  * Encode the flock and fcntl locks for the given inode into the ceph_filelock
- * array. Must be called with lock_flocks() already held.
+ * array. Must be called with inode->i_lock already held.
  * If we encounter more of a specific lock type than expected, return -ENOSPC.
  */
 int ceph_encode_locks_to_buffer(struct inode *inode,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4d2920304be8..74fd2898b2ab 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2481,20 +2481,20 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		struct ceph_filelock *flocks;
 
 encode_again:
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
 				 sizeof(struct ceph_filelock), GFP_NOFS);
 		if (!flocks) {
 			err = -ENOMEM;
 			goto out_free;
 		}
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 		err = ceph_encode_locks_to_buffer(inode, flocks,
 						  num_fcntl_locks,
 						  num_flock_locks);
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		if (err) {
 			kfree(flocks);
 			if (err == -ENOSPC)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 540c1ccfcdb2..a445e71746fa 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -765,7 +765,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with lock_flocks held
+	/* note that this is called by vfs setlease with i_lock held
 	   to protect *lease from going away */
 	struct inode *inode = file_inode(file);
 	struct cifsFileInfo *cfile = file->private_data;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1686e4085646..0630710a9c3f 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1092,6 +1092,7 @@ struct lock_to_push {
 static int
 cifs_push_posix_locks(struct cifsFileInfo *cfile)
 {
+	struct inode *inode = cfile->dentry->d_inode;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	struct file_lock *flock, **before;
 	unsigned int count = 0, i = 0;
@@ -1102,12 +1103,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 
 	xid = get_xid();
 
-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		if ((*before)->fl_flags & FL_POSIX)
 			count++;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	INIT_LIST_HEAD(&locks_to_send);
 
@@ -1126,8 +1127,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 	}
 
 	el = locks_to_send.next;
-	lock_flocks();
-	cifs_for_each_lock(cfile->dentry->d_inode, before) {
+	spin_lock(&inode->i_lock);
+	cifs_for_each_lock(inode, before) {
 		flock = *before;
 		if ((flock->fl_flags & FL_POSIX) == 0)
 			continue;
@@ -1152,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 		lck->offset = flock->fl_start;
 		el = el->next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
 		int stored_rc;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b3333371aebb..cebfd404c1d9 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -889,7 +889,7 @@ out_uninit:
  * cluster; until we do, disable leases (by just returning -EINVAL),
  * unless the administrator has requested purely local locking.
  *
- * Locking: called under lock_flocks
+ * Locking: called under i_lock
  *
  * Returns: errno
  */
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index 97e87415b145..dc5c75930f0f 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -169,7 +169,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 
 again:
 	file->f_locks = 0;
-	lock_flocks(); /* protects i_flock list */
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
@@ -181,7 +181,7 @@ again:
 		if (match(lockhost, host)) {
 			struct file_lock lock = *fl;
 
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			lock.fl_type  = F_UNLCK;
 			lock.fl_start = 0;
 			lock.fl_end   = OFFSET_MAX;
@@ -193,7 +193,7 @@ again:
 			goto again;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return 0;
 }
@@ -228,14 +228,14 @@ nlm_file_inuse(struct nlm_file *file)
 	if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
 		return 1;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl; fl = fl->fl_next) {
 		if (fl->fl_lmops == &nlmsvc_lock_operations) {
-			unlock_flocks();
+			spin_unlock(&inode->i_lock);
 			return 1;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	file->f_locks = 0;
 	return 0;
 }
diff --git a/fs/locks.c b/fs/locks.c
index 89d898bce166..ce302d43822b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -153,27 +153,37 @@ int lease_break_time = 45;
 #define for_each_lock(inode, lockp) \
 	for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
 
-/* The global file_lock_list is only used for displaying /proc/locks. */
+/*
+ * The global file_lock_list is only used for displaying /proc/locks. Protected
+ * by the file_lock_lock.
+ */
 static LIST_HEAD(file_lock_list);
 
-/* The blocked_list is used to find POSIX lock loops for deadlock detection. */
+/*
+ * The blocked_list is used to find POSIX lock loops for deadlock detection.
+ * Protected by file_lock_lock.
+ */
 static LIST_HEAD(blocked_list);
 
-/* Protects the two list heads above, plus the inode->i_flock list */
+/*
+ * This lock protects the blocked_list, and the file_lock_list. Generally, if
+ * you're accessing one of those lists, you want to be holding this lock.
+ *
+ * In addition, it also protects the fl->fl_block list, and the fl->fl_next
+ * pointer for file_lock structures that are acting as lock requests (in
+ * contrast to those that are acting as records of acquired locks).
+ *
+ * Note that when we acquire this lock in order to change the above fields,
+ * we often hold the i_lock as well. In certain cases, when reading the fields
+ * protected by this lock, we can skip acquiring it iff we already hold the
+ * i_lock.
+ *
+ * In particular, adding an entry to the fl_block list requires that you hold
+ * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
+ * an entry from the list however only requires the file_lock_lock.
+ */
 static DEFINE_SPINLOCK(file_lock_lock);
 
-void lock_flocks(void)
-{
-	spin_lock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(lock_flocks);
-
-void unlock_flocks(void)
-{
-	spin_unlock(&file_lock_lock);
-}
-EXPORT_SYMBOL_GPL(unlock_flocks);
-
 static struct kmem_cache *filelock_cache __read_mostly;
 
 static void locks_init_lock_heads(struct file_lock *fl)
@@ -489,13 +499,17 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 static inline void
 locks_insert_global_locks(struct file_lock *fl)
 {
+	spin_lock(&file_lock_lock);
 	list_add_tail(&fl->fl_link, &file_lock_list);
+	spin_unlock(&file_lock_lock);
 }
 
 static inline void
 locks_delete_global_locks(struct file_lock *fl)
 {
+	spin_lock(&file_lock_lock);
 	list_del_init(&fl->fl_link);
+	spin_unlock(&file_lock_lock);
 }
 
 static inline void
@@ -512,6 +526,8 @@ locks_delete_global_blocked(struct file_lock *waiter)
 
 /* Remove waiter from blocker's block list.
  * When blocker ends up pointing to itself then the list is empty.
+ *
+ * Must be called with file_lock_lock held.
  */
 static void __locks_delete_block(struct file_lock *waiter)
 {
@@ -520,37 +536,47 @@ static void __locks_delete_block(struct file_lock *waiter)
 	waiter->fl_next = NULL;
 }
 
-/*
- */
 static void locks_delete_block(struct file_lock *waiter)
 {
-	lock_flocks();
+	spin_lock(&file_lock_lock);
 	__locks_delete_block(waiter);
-	unlock_flocks();
+	spin_unlock(&file_lock_lock);
 }
 
 /* Insert waiter into blocker's block list.
  * We use a circular list so that processes can be easily woken up in
  * the order they blocked. The documentation doesn't require this but
  * it seems like the reasonable thing to do.
+ *
+ * Must be called with file_lock_lock held!
  */
-static void locks_insert_block(struct file_lock *blocker, 
-			       struct file_lock *waiter)
+static void __locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
 {
 	BUG_ON(!list_empty(&waiter->fl_block));
 	waiter->fl_next = blocker;
 	list_add_tail(&waiter->fl_block, &blocker->fl_block);
 	if (IS_POSIX(blocker))
-		locks_insert_global_blocked(request);
+		locks_insert_global_blocked(waiter);
+}
+
+/* Must be called with i_lock held. */
+static void locks_insert_block(struct file_lock *blocker,
+					struct file_lock *waiter)
+{
+	spin_lock(&file_lock_lock);
+	__locks_insert_block(blocker, waiter);
+	spin_unlock(&file_lock_lock);
 }
 
 /*
  * Wake up processes blocked waiting for blocker.
  *
- * Must be called with the file_lock_lock held!
+ * Must be called with the inode->i_lock held!
  */
 static void locks_wake_up_blocks(struct file_lock *blocker)
 {
+	spin_lock(&file_lock_lock);
 	while (!list_empty(&blocker->fl_block)) {
 		struct file_lock *waiter;
 
@@ -562,10 +588,13 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
 		else
 			wake_up(&waiter->fl_wait);
 	}
+	spin_unlock(&file_lock_lock);
 }
 
 /* Insert file lock fl into an inode's lock list at the position indicated
  * by pos. At the same time add the lock to the global file lock list.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
 {
@@ -583,6 +612,8 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
  * Wake up processes that are blocked waiting for this lock,
  * notify the FS that the lock has been cleared and
  * finally free the lock.
+ *
+ * Must be called with the i_lock held!
  */
 static void locks_delete_lock(struct file_lock **thisfl_p)
 {
@@ -652,8 +683,9 @@ void
 posix_test_lock(struct file *filp, struct file_lock *fl)
 {
 	struct file_lock *cfl;
+	struct inode *inode = file_inode(filp);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
@@ -666,7 +698,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 			fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
 		fl->fl_type = F_UNLCK;
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return;
 }
 EXPORT_SYMBOL(posix_test_lock);
@@ -710,6 +742,7 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
 	return NULL;
 }
 
+/* Must be called with the file_lock_lock held! */
 static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
@@ -745,7 +778,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 			return -ENOMEM;
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
 
@@ -775,9 +808,9 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	 * give it the opportunity to lock the file.
 	 */
 	if (found) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		cond_resched();
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
 
 find_conflict:
@@ -804,7 +837,7 @@ find_conflict:
 	error = 0;
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	if (new_fl)
 		locks_free_lock(new_fl);
 	return error;
@@ -834,7 +867,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		new_fl2 = locks_alloc_lock();
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	/*
 	 * New lock request. Walk all POSIX locks and look for conflicts. If
 	 * there are any, either return error or put the request on the
@@ -852,11 +885,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			error = -EAGAIN;
 			if (!(request->fl_flags & FL_SLEEP))
 				goto out;
+			/*
+			 * Deadlock detection and insertion into the blocked
+			 * locks list must be done while holding the same lock!
+			 */
 			error = -EDEADLK;
-			if (posix_locks_deadlock(request, fl))
-				goto out;
-			error = FILE_LOCK_DEFERRED;
-			locks_insert_block(fl, request);
+			spin_lock(&file_lock_lock);
+			if (likely(!posix_locks_deadlock(request, fl))) {
+				error = FILE_LOCK_DEFERRED;
+				__locks_insert_block(fl, request);
+			}
+			spin_unlock(&file_lock_lock);
 			goto out;
   		}
   	}
@@ -1006,7 +1045,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_wake_up_blocks(left);
 	}
  out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	/*
 	 * Free any unused locks.
 	 */
@@ -1081,14 +1120,14 @@ int locks_mandatory_locked(struct inode *inode)
 	/*
 	 * Search the lock list for this inode for any POSIX locks.
 	 */
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
 		if (fl->fl_owner != owner)
 			break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return fl ? -EAGAIN : 0;
 }
 
@@ -1231,7 +1270,7 @@ int __break_lease(struct inode *inode, unsigned int mode)
 	if (IS_ERR(new_fl))
 		return PTR_ERR(new_fl);
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 
 	time_out_leases(inode);
 
@@ -1281,11 +1320,11 @@ restart:
 			break_time++;
 	}
 	locks_insert_block(flock, new_fl);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
-	lock_flocks();
-	__locks_delete_block(new_fl);
+	spin_lock(&inode->i_lock);
+	locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
 			time_out_leases(inode);
@@ -1302,7 +1341,7 @@ restart:
 	}
 
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	locks_free_lock(new_fl);
 	return error;
 }
@@ -1355,9 +1394,10 @@ EXPORT_SYMBOL(lease_get_mtime);
 int fcntl_getlease(struct file *filp)
 {
 	struct file_lock *fl;
+	struct inode *inode = file_inode(filp);
 	int type = F_UNLCK;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	time_out_leases(file_inode(filp));
 	for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
@@ -1366,7 +1406,7 @@ int fcntl_getlease(struct file *filp)
 			break;
 		}
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return type;
 }
 
@@ -1460,7 +1500,7 @@ static int generic_delete_lease(struct file *filp, struct file_lock **flp)
  *	The (input) flp->fl_lmops->lm_break function is required
  *	by break_lease().
  *
- *	Called with file_lock_lock held.
+ *	Called with inode->i_lock held.
  */
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
@@ -1529,11 +1569,12 @@ static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 
 int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
 {
+	struct inode *inode = file_inode(filp);
 	int error;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, lease);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 	return error;
 }
@@ -1551,6 +1592,7 @@ static int do_fcntl_delete_lease(struct file *filp)
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
 	struct file_lock *fl, *ret;
+	struct inode *inode = file_inode(filp);
 	struct fasync_struct *new;
 	int error;
 
@@ -1564,10 +1606,10 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		return -ENOMEM;
 	}
 	ret = fl;
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	error = __vfs_setlease(filp, arg, &ret);
 	if (error) {
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		locks_free_lock(fl);
 		goto out_free_fasync;
 	}
@@ -1584,7 +1626,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		new = NULL;
 
 	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 
 out_free_fasync:
 	if (new)
@@ -2108,7 +2150,7 @@ void locks_remove_flock(struct file *filp)
 			fl.fl_ops->fl_release_private(&fl);
 	}
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	before = &inode->i_flock;
 
 	while ((fl = *before) != NULL) {
@@ -2126,7 +2168,7 @@ void locks_remove_flock(struct file *filp)
  		}
 		before = &fl->fl_next;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 }
 
 /**
@@ -2140,12 +2182,12 @@ posix_unblock_lock(struct file_lock *waiter)
 {
 	int status = 0;
 
-	lock_flocks();
+	spin_lock(&file_lock_lock);
 	if (waiter->fl_next)
 		__locks_delete_block(waiter);
 	else
 		status = -ENOENT;
-	unlock_flocks();
+	spin_unlock(&file_lock_lock);
 	return status;
 }
 EXPORT_SYMBOL(posix_unblock_lock);
@@ -2259,7 +2301,7 @@ static void *locks_start(struct seq_file *f, loff_t *pos)
 {
 	loff_t *p = f->private;
 
-	lock_flocks();
+	spin_lock(&file_lock_lock);
 	*p = (*pos + 1);
 	return seq_list_start(&file_lock_list, *pos);
 }
@@ -2273,7 +2315,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 
 static void locks_stop(struct seq_file *f, void *v)
 {
-	unlock_flocks();
+	spin_unlock(&file_lock_lock);
 }
 
 static const struct seq_operations locks_seq_operations = {
@@ -2320,7 +2362,8 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_flocks();
+
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if (fl->fl_type == F_RDLCK)
@@ -2337,7 +2380,7 @@ int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return result;
 }
 
@@ -2360,7 +2403,8 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 {
 	struct file_lock *fl;
 	int result = 1;
-	lock_flocks();
+
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (IS_POSIX(fl)) {
 			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
@@ -2375,7 +2419,7 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
 		result = 0;
 		break;
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return result;
 }
 
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 57db3244f4d9..7ec4814e298d 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -73,20 +73,20 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
 	if (inode->i_flock == NULL)
 		goto out;
 
-	/* Protect inode->i_flock using the file locks lock */
-	lock_flocks();
+	/* Protect inode->i_flock using the i_lock */
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file) != ctx)
 			continue;
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		status = nfs4_lock_delegation_recall(fl, state, stateid);
 		if (status < 0)
 			goto out;
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 out:
 	return status;
 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1fab140764c4..ff10b4aa534c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1373,13 +1373,13 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 	/* Guard against delegation returns and new lock/unlock calls */
 	down_write(&nfsi->rwsem);
 	/* Protect inode->i_flock using the BKL */
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
 			continue;
 		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
-		unlock_flocks();
+		spin_unlock(&inode->i_lock);
 		status = ops->recover_lock(state, fl);
 		switch (status) {
 			case 0:
@@ -1406,9 +1406,9 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 				/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 				status = 0;
 		}
-		lock_flocks();
+		spin_lock(&inode->i_lock);
 	}
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 out:
 	up_write(&nfsi->rwsem);
 	return status;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 316ec843dec2..f17051838b41 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2645,13 +2645,13 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 
 	list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
 
-	/* only place dl_time is set. protected by lock_flocks*/
+	/* Only place dl_time is set; protected by i_lock: */
 	dp->dl_time = get_seconds();
 
 	nfsd4_cb_recall(dp);
 }
 
-/* Called from break_lease() with lock_flocks() held. */
+/* Called from break_lease() with i_lock held. */
 static void nfsd_break_deleg_cb(struct file_lock *fl)
 {
 	struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
@@ -4520,7 +4520,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
 	struct inode *inode = filp->fi_inode;
 	int status = 0;
 
-	lock_flocks();
+	spin_lock(&inode->i_lock);
 	for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
 		if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
 			status = 1;
@@ -4528,7 +4528,7 @@ check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
 		}
 	}
 out:
-	unlock_flocks();
+	spin_unlock(&inode->i_lock);
 	return status;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed9fdaaf3223..24fe998795e1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1024,8 +1024,6 @@ extern int vfs_setlease(struct file *, long, struct file_lock **);
 extern int lease_modify(struct file_lock **, int);
 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
 extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
-extern void lock_flocks(void);
-extern void unlock_flocks(void);
 #else /* !CONFIG_FILE_LOCKING */
 static inline int fcntl_getlk(struct file *file, struct flock __user *user)
 {
@@ -1166,15 +1164,6 @@ static inline int lock_may_write(struct inode *inode, loff_t start,
 {
 	return 1;
 }
-
-static inline void lock_flocks(void)
-{
-}
-
-static inline void unlock_flocks(void)
-{
-}
-
 #endif /* !CONFIG_FILE_LOCKING */
 
 
-- 
cgit v1.3-7-g2ca7


From 139ca04ee572fea6c0c105e88aba3a534efcd7c4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:17 -0400
Subject: locks: convert fl_link to a hlist_node

Testing has shown that iterating over the blocked_list for deadlock
detection turns out to be a bottleneck. In order to alleviate that,
begin the process of turning it into a hashtable. We start by turning
the fl_link into a hlist_node and the global lists into hlists. A later
patch will do the conversion of the blocked_list to a hashtable.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/locks.c         | 24 ++++++++++++------------
 include/linux/fs.h |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/locks.c b/fs/locks.c
index 84e269fc4c69..941b7146b6be 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -157,13 +157,13 @@ int lease_break_time = 45;
  * The global file_lock_list is only used for displaying /proc/locks. Protected
  * by the file_lock_lock.
  */
-static LIST_HEAD(file_lock_list);
+static HLIST_HEAD(file_lock_list);
 
 /*
  * The blocked_list is used to find POSIX lock loops for deadlock detection.
  * Protected by file_lock_lock.
  */
-static LIST_HEAD(blocked_list);
+static HLIST_HEAD(blocked_list);
 
 /*
  * This lock protects the blocked_list, and the file_lock_list. Generally, if
@@ -188,7 +188,7 @@ static struct kmem_cache *filelock_cache __read_mostly;
 
 static void locks_init_lock_heads(struct file_lock *fl)
 {
-	INIT_LIST_HEAD(&fl->fl_link);
+	INIT_HLIST_NODE(&fl->fl_link);
 	INIT_LIST_HEAD(&fl->fl_block);
 	init_waitqueue_head(&fl->fl_wait);
 }
@@ -222,7 +222,7 @@ void locks_free_lock(struct file_lock *fl)
 {
 	BUG_ON(waitqueue_active(&fl->fl_wait));
 	BUG_ON(!list_empty(&fl->fl_block));
-	BUG_ON(!list_empty(&fl->fl_link));
+	BUG_ON(!hlist_unhashed(&fl->fl_link));
 
 	locks_release_private(fl);
 	kmem_cache_free(filelock_cache, fl);
@@ -500,7 +500,7 @@ static inline void
 locks_insert_global_locks(struct file_lock *fl)
 {
 	spin_lock(&file_lock_lock);
-	list_add_tail(&fl->fl_link, &file_lock_list);
+	hlist_add_head(&fl->fl_link, &file_lock_list);
 	spin_unlock(&file_lock_lock);
 }
 
@@ -508,20 +508,20 @@ static inline void
 locks_delete_global_locks(struct file_lock *fl)
 {
 	spin_lock(&file_lock_lock);
-	list_del_init(&fl->fl_link);
+	hlist_del_init(&fl->fl_link);
 	spin_unlock(&file_lock_lock);
 }
 
 static inline void
 locks_insert_global_blocked(struct file_lock *waiter)
 {
-	list_add(&waiter->fl_link, &blocked_list);
+	hlist_add_head(&waiter->fl_link, &blocked_list);
 }
 
 static inline void
 locks_delete_global_blocked(struct file_lock *waiter)
 {
-	list_del_init(&waiter->fl_link);
+	hlist_del_init(&waiter->fl_link);
 }
 
 /* Remove waiter from blocker's block list.
@@ -748,7 +748,7 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
 {
 	struct file_lock *fl;
 
-	list_for_each_entry(fl, &blocked_list, fl_link) {
+	hlist_for_each_entry(fl, &blocked_list, fl_link) {
 		if (posix_same_owner(fl, block_fl))
 			return fl->fl_next;
 	}
@@ -2300,7 +2300,7 @@ static int locks_show(struct seq_file *f, void *v)
 {
 	struct file_lock *fl, *bfl;
 
-	fl = list_entry(v, struct file_lock, fl_link);
+	fl = hlist_entry(v, struct file_lock, fl_link);
 
 	lock_get_status(f, fl, *((loff_t *)f->private), "");
 
@@ -2316,14 +2316,14 @@ static void *locks_start(struct seq_file *f, loff_t *pos)
 
 	spin_lock(&file_lock_lock);
 	*p = (*pos + 1);
-	return seq_list_start(&file_lock_list, *pos);
+	return seq_hlist_start(&file_lock_list, *pos);
 }
 
 static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
 {
 	loff_t *p = f->private;
 	++*p;
-	return seq_list_next(v, &file_lock_list, pos);
+	return seq_hlist_next(v, &file_lock_list, pos);
 }
 
 static void locks_stop(struct seq_file *f, void *v)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 24fe998795e1..fab064a3b65f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -946,7 +946,7 @@ int locks_in_grace(struct net *);
  */
 struct file_lock {
 	struct file_lock *fl_next;	/* singly linked list for this inode  */
-	struct list_head fl_link;	/* doubly linked list of all locks */
+	struct hlist_node fl_link;	/* node in global lists */
 	struct list_head fl_block;	/* circular list of blocked processes */
 	fl_owner_t fl_owner;
 	unsigned int fl_flags;
-- 
cgit v1.3-7-g2ca7


From 3999e49364193f7dbbba66e2be655fe91ba1fced Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 21 Jun 2013 08:58:19 -0400
Subject: locks: add a new "lm_owner_key" lock operation

Currently, the hashing that the locking code uses to add these values
to the blocked_hash is simply calculated using fl_owner field. That's
valid in most cases except for server-side lockd, which validates the
owner of a lock based on fl_owner and fl_pid.

In the case where you have a small number of NFS clients doing a lot
of locking between different processes, you could end up with all
the blocked requests sitting in a very small number of hash buckets.

Add a new lm_owner_key operation to the lock_manager_operations that
will generate an unsigned long to use as the key in the hashtable.
That function is only implemented for server-side lockd, and simply
XORs the fl_owner and fl_pid.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Acked-by: J. Bruce Fields <bfields@fieldses.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking | 16 +++++++++++-----
 fs/lockd/svclock.c                | 12 ++++++++++++
 fs/locks.c                        | 12 ++++++++++--
 include/linux/fs.h                |  1 +
 4 files changed, 34 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index c2963a74fbc3..2db7c9e492e9 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -349,6 +349,7 @@ fl_release_private:	maybe		no
 ----------------------- lock_manager_operations ---------------------------
 prototypes:
 	int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
+	unsigned long (*lm_owner_key)(struct file_lock *);
 	void (*lm_notify)(struct file_lock *);  /* unblock callback */
 	int (*lm_grant)(struct file_lock *, struct file_lock *, int);
 	void (*lm_break)(struct file_lock *); /* break_lease callback */
@@ -358,16 +359,21 @@ locking rules:
 
 			inode->i_lock	file_lock_lock	may block
 lm_compare_owner:	yes[1]		maybe		no
+lm_owner_key		yes[1]		yes		no
 lm_notify:		yes		yes		no
 lm_grant:		no		no		no
 lm_break:		yes		no		no
 lm_change		yes		no		no
 
-[1]:	->lm_compare_owner is generally called with *an* inode->i_lock held. It
-may not be the i_lock of the inode for either file_lock being compared! This is
-the case with deadlock detection, since the code has to chase down the owners
-of locks that may be entirely unrelated to the one on which the lock is being
-acquired. When doing a search for deadlocks, the file_lock_lock is also held.
+[1]:	->lm_compare_owner and ->lm_owner_key are generally called with
+*an* inode->i_lock held. It may not be the i_lock of the inode
+associated with either file_lock argument! This is the case with deadlock
+detection, since the code has to chase down the owners of locks that may
+be entirely unrelated to the one on which the lock is being acquired.
+For deadlock detection however, the file_lock_lock is also held. The
+fact that these locks are held ensures that the file_locks do not
+disappear out from under you while doing the comparison or generating an
+owner key.
 
 --------------------------- buffer_head -----------------------------------
 prototypes:
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index a469098682c4..067778b0ccc9 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -744,8 +744,20 @@ static int nlmsvc_same_owner(struct file_lock *fl1, struct file_lock *fl2)
 	return fl1->fl_owner == fl2->fl_owner && fl1->fl_pid == fl2->fl_pid;
 }
 
+/*
+ * Since NLM uses two "keys" for tracking locks, we need to hash them down
+ * to one for the blocked_hash. Here, we're just xor'ing the host address
+ * with the pid in order to create a key value for picking a hash bucket.
+ */
+static unsigned long
+nlmsvc_owner_key(struct file_lock *fl)
+{
+	return (unsigned long)fl->fl_owner ^ (unsigned long)fl->fl_pid;
+}
+
 const struct lock_manager_operations nlmsvc_lock_operations = {
 	.lm_compare_owner = nlmsvc_same_owner,
+	.lm_owner_key = nlmsvc_owner_key,
 	.lm_notify = nlmsvc_notify_blocked,
 	.lm_grant = nlmsvc_grant_deferred,
 };
diff --git a/fs/locks.c b/fs/locks.c
index 71d847cbbb63..6242e0b1c69c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -521,10 +521,18 @@ locks_delete_global_locks(struct file_lock *fl)
 	spin_unlock(&file_lock_lock);
 }
 
+static unsigned long
+posix_owner_key(struct file_lock *fl)
+{
+	if (fl->fl_lmops && fl->fl_lmops->lm_owner_key)
+		return fl->fl_lmops->lm_owner_key(fl);
+	return (unsigned long)fl->fl_owner;
+}
+
 static inline void
 locks_insert_global_blocked(struct file_lock *waiter)
 {
-	hash_add(blocked_hash, &waiter->fl_link, (unsigned long)waiter->fl_owner);
+	hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
 }
 
 static inline void
@@ -757,7 +765,7 @@ static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
 {
 	struct file_lock *fl;
 
-	hash_for_each_possible(blocked_hash, fl, fl_link, (unsigned long)block_fl->fl_owner) {
+	hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
 		if (posix_same_owner(fl, block_fl))
 			return fl->fl_next;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fab064a3b65f..a137a73fc1fe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -908,6 +908,7 @@ struct file_lock_operations {
 
 struct lock_manager_operations {
 	int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
+	unsigned long (*lm_owner_key)(struct file_lock *);
 	void (*lm_notify)(struct file_lock *);	/* unblock callback */
 	int (*lm_grant)(struct file_lock *, struct file_lock *, int);
 	void (*lm_break)(struct file_lock *);
-- 
cgit v1.3-7-g2ca7


From 46a1c2c7ae53de2a5676754b54a73c591a3951d2 Mon Sep 17 00:00:00 2001
From: Jie Liu <jeff.liu@oracle.com>
Date: Tue, 25 Jun 2013 12:02:13 +0800
Subject: vfs: export lseek_execute() to modules

For those file systems(btrfs/ext4/ocfs2/tmpfs) that support
SEEK_DATA/SEEK_HOLE functions, we end up handling the similar
matter in lseek_execute() to update the current file offset
to the desired offset if it is valid, ceph also does the
simliar things at ceph_llseek().

To reduce the duplications, this patch make lseek_execute()
public accessible so that we can call it directly from the
underlying file systems.

Thanks Dave Chinner for this suggestion.

[AV: call it vfs_setpos(), don't bring the removed 'inode' argument back]

v2->v1:
- Add kernel-doc comments for lseek_execute()
- Call lseek_execute() in ceph->llseek()

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chris Mason <chris.mason@fusionio.com>
Cc: Josef Bacik <jbacik@fusionio.com>
Cc: Ben Myers <bpm@sgi.com>
Cc: Ted Tso <tytso@mit.edu>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Sage Weil <sage@inktank.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/file.c    | 15 +--------------
 fs/ceph/file.c     | 11 +----------
 fs/ext4/file.c     | 24 ++----------------------
 fs/ocfs2/file.c    | 12 +-----------
 fs/read_write.c    | 19 ++++++++++++++++---
 fs/xfs/xfs_file.c  |  6 ++----
 include/linux/fs.h |  1 +
 mm/shmem.c         |  5 +----
 8 files changed, 25 insertions(+), 68 deletions(-)

(limited to 'include/linux')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4205ba752d40..89da56a58b63 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2425,20 +2425,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 		}
 	}
 
-	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
-		offset = -EINVAL;
-		goto out;
-	}
-	if (offset > inode->i_sb->s_maxbytes) {
-		offset = -EINVAL;
-		goto out;
-	}
-
-	/* Special lock needed here? */
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return offset;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 656e16907430..16c989d3e23c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -866,16 +866,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 		break;
 	}
 
-	if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
-		offset = -EINVAL;
-		goto out;
-	}
-
-	/* Special lock needed here? */
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1b4d51b5d86..469361dbe619 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -494,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 	if (dataoff > isize)
 		return -ENXIO;
 
-	if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (dataoff > maxsize)
-		return -EINVAL;
-
-	if (dataoff != file->f_pos) {
-		file->f_pos = dataoff;
-		file->f_version = 0;
-	}
-
-	return dataoff;
+	return vfs_setpos(file, dataoff, maxsize);
 }
 
 /*
@@ -580,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 	if (holeoff > isize)
 		holeoff = isize;
 
-	if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		return -EINVAL;
-	if (holeoff > maxsize)
-		return -EINVAL;
-
-	if (holeoff != file->f_pos) {
-		file->f_pos = holeoff;
-		file->f_version = 0;
-	}
-
-	return holeoff;
+	return vfs_setpos(file, holeoff, maxsize);
 }
 
 /*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8a38714f1d92..41000f223ca4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2646,17 +2646,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
 		goto out;
 	}
 
-	if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-		ret = -EINVAL;
-	if (!ret && offset > inode->i_sb->s_maxbytes)
-		ret = -EINVAL;
-	if (ret)
-		goto out;
-
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
 	mutex_unlock(&inode->i_mutex);
diff --git a/fs/read_write.c b/fs/read_write.c
index 37d16e82b575..122a3846d9e1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -41,7 +41,19 @@ static inline int unsigned_offsets(struct file *file)
 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
 }
 
-static loff_t lseek_execute(struct file *file, loff_t offset, loff_t maxsize)
+/**
+ * vfs_setpos - update the file offset for lseek
+ * @file:	file structure in question
+ * @offset:	file offset to seek to
+ * @maxsize:	maximum file size
+ *
+ * This is a low-level filesystem helper for updating the file offset to
+ * the value specified by @offset if the given offset is valid and it is
+ * not equal to the current file offset.
+ *
+ * Return the specified offset on success and -EINVAL on invalid offset.
+ */
+loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
 {
 	if (offset < 0 && !unsigned_offsets(file))
 		return -EINVAL;
@@ -54,6 +66,7 @@ static loff_t lseek_execute(struct file *file, loff_t offset, loff_t maxsize)
 	}
 	return offset;
 }
+EXPORT_SYMBOL(vfs_setpos);
 
 /**
  * generic_file_llseek_size - generic llseek implementation for regular files
@@ -94,7 +107,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
 		 * like SEEK_SET.
 		 */
 		spin_lock(&file->f_lock);
-		offset = lseek_execute(file, file->f_pos + offset, maxsize);
+		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
 		spin_unlock(&file->f_lock);
 		return offset;
 	case SEEK_DATA:
@@ -116,7 +129,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
 		break;
 	}
 
-	return lseek_execute(file, offset, maxsize);
+	return vfs_setpos(file, offset, maxsize);
 }
 EXPORT_SYMBOL(generic_file_llseek_size);
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0ad2b95fca12..de3dc98f4e8f 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1268,8 +1268,7 @@ xfs_seek_data(
 	}
 
 out:
-	if (offset != file->f_pos)
-		file->f_pos = offset;
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
 	xfs_iunlock_map_shared(ip, lock);
@@ -1377,8 +1376,7 @@ out:
 	 * situation in particular.
 	 */
 	offset = min_t(loff_t, offset, isize);
-	if (offset != file->f_pos)
-		file->f_pos = offset;
+	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
 	xfs_iunlock_map_shared(ip, lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a137a73fc1fe..bccb1924ec93 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2426,6 +2426,7 @@ extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
 extern loff_t no_llseek(struct file *file, loff_t offset, int whence);
+extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
 		int whence, loff_t maxsize, loff_t eof);
diff --git a/mm/shmem.c b/mm/shmem.c
index f887358dabc5..118dfa4952f4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1798,10 +1798,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 		}
 	}
 
-	if (offset >= 0 && offset != file->f_pos) {
-		file->f_pos = offset;
-		file->f_version = 0;
-	}
+	offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
 	mutex_unlock(&inode->i_mutex);
 	return offset;
 }
-- 
cgit v1.3-7-g2ca7


From c846ef7deba2d4f75138cf6a4b137b7e0e7659af Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 3 Jul 2013 15:00:41 -0700
Subject: include/linux/smp.h:on_each_cpu(): switch back to a macro

Commit f21afc25f9ed ("smp.h: Use local_irq_{save,restore}() in !SMP
version of on_each_cpu()") converted on_each_cpu() to a C function.

This required inclusion of irqflags.h, which broke ia64 and mn10300 (at
least) due to header ordering hell.

Switch on_each_cpu() back to a macro to fix this.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: David Daney <david.daney@cavium.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: <stable@vger.kernel.org>	[3.10.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/smp.h | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index c8488763277f..c181399f2c20 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -11,7 +11,6 @@
 #include <linux/list.h>
 #include <linux/cpumask.h>
 #include <linux/init.h>
-#include <linux/irqflags.h>
 
 extern void cpu_idle(void);
 
@@ -140,17 +139,14 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info)
 }
 #define smp_call_function(func, info, wait) \
 			(up_smp_call_function(func, info))
-
-static inline int on_each_cpu(smp_call_func_t func, void *info, int wait)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	func(info);
-	local_irq_restore(flags);
-	return 0;
-}
-
+#define on_each_cpu(func, info, wait)		\
+	({					\
+		unsigned long __flags;		\
+		local_irq_save(__flags);	\
+		func(info);			\
+		local_irq_restore(__flags);	\
+		0;				\
+	})
 /*
  * Note we still need to test the mask even for UP
  * because we actually can get an empty mask from
-- 
cgit v1.3-7-g2ca7


From ffbdccf5e1facd18b54429a749667fb185c10f20 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Wed, 3 Jul 2013 15:01:23 -0700
Subject: mm, memcg: don't take task_lock in task_in_mem_cgroup

For processes that have detached their mm's, task_in_mem_cgroup()
unnecessarily takes task_lock() when rcu_read_lock() is all that is
necessary to call mem_cgroup_from_task().

While we're here, switch task_in_mem_cgroup() to return bool.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  9 +++++----
 mm/memcontrol.c            | 11 ++++++-----
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index d6183f06d8c1..7b4d9d79570b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page);
 
 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 				  struct mem_cgroup *memcg);
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
+bool task_in_mem_cgroup(struct task_struct *task,
+			const struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
@@ -273,10 +274,10 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 	return true;
 }
 
-static inline int task_in_mem_cgroup(struct task_struct *task,
-				     const struct mem_cgroup *memcg)
+static inline bool task_in_mem_cgroup(struct task_struct *task,
+				      const struct mem_cgroup *memcg)
 {
-	return 1;
+	return true;
 }
 
 static inline struct cgroup_subsys_state
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..4748966b1511 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 	return ret;
 }
 
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task,
+			const struct mem_cgroup *memcg)
 {
-	int ret;
 	struct mem_cgroup *curr = NULL;
 	struct task_struct *p;
+	bool ret;
 
 	p = find_lock_task_mm(task);
 	if (p) {
@@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 		 * killer still needs to detect if they have already been oom
 		 * killed to prevent needlessly killing additional tasks.
 		 */
-		task_lock(task);
+		rcu_read_lock();
 		curr = mem_cgroup_from_task(task);
 		if (curr)
 			css_get(&curr->css);
-		task_unlock(task);
+		rcu_read_unlock();
 	}
 	if (!curr)
-		return 0;
+		return false;
 	/*
 	 * We should check use_hierarchy of "memcg" not "curr". Because checking
 	 * use_hierarchy of "curr" here make this function true if hierarchy is
-- 
cgit v1.3-7-g2ca7


From d43006d503ac921c7df4f94d13c17db6f13c9d26 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:01:50 -0700
Subject: mm: vmscan: have kswapd writeback pages based on dirty pages
 encountered, not priority

Currently kswapd queues dirty pages for writeback if scanning at an
elevated priority but the priority kswapd scans at is not related to the
number of unqueued dirty encountered.  Since commit "mm: vmscan: Flatten
kswapd priority loop", the priority is related to the size of the LRU
and the zone watermark which is no indication as to whether kswapd
should write pages or not.

This patch tracks if an excessive number of unqueued dirty pages are
being encountered at the end of the LRU.  If so, it indicates that dirty
pages are being recycled before flusher threads can clean them and flags
the zone so that kswapd will start writing pages until the zone is
balanced.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Tested-by: Zlatko Calusic <zcalusic@bitsync.net>
Cc: dormando <dormando@rydia.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  9 +++++++++
 mm/vmscan.c            | 31 +++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5c76737d836b..2aaf72f7e345 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -495,6 +495,10 @@ typedef enum {
 	ZONE_CONGESTED,			/* zone has many dirty pages backed by
 					 * a congested BDI
 					 */
+	ZONE_TAIL_LRU_DIRTY,		/* reclaim scanning has recently found
+					 * many dirty file pages at the tail
+					 * of the LRU.
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -517,6 +521,11 @@ static inline int zone_is_reclaim_congested(const struct zone *zone)
 	return test_bit(ZONE_CONGESTED, &zone->flags);
 }
 
+static inline int zone_is_reclaim_dirty(const struct zone *zone)
+{
+	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1505c573719d..d6c916d808ba 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -676,13 +676,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct zone *zone,
 				      struct scan_control *sc,
 				      enum ttu_flags ttu_flags,
-				      unsigned long *ret_nr_dirty,
+				      unsigned long *ret_nr_unqueued_dirty,
 				      unsigned long *ret_nr_writeback,
 				      bool force_reclaim)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
+	unsigned long nr_unqueued_dirty = 0;
 	unsigned long nr_dirty = 0;
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
@@ -808,14 +809,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PageDirty(page)) {
 			nr_dirty++;
 
+			if (!PageWriteback(page))
+				nr_unqueued_dirty++;
+
 			/*
 			 * Only kswapd can writeback filesystem pages to
-			 * avoid risk of stack overflow but do not writeback
-			 * unless under significant pressure.
+			 * avoid risk of stack overflow but only writeback
+			 * if many dirty pages have been encountered.
 			 */
 			if (page_is_file_cache(page) &&
 					(!current_is_kswapd() ||
-					 sc->priority >= DEF_PRIORITY - 2)) {
+					 !zone_is_reclaim_dirty(zone))) {
 				/*
 				 * Immediately reclaim when written back.
 				 * Similar in principal to deactivate_page()
@@ -960,7 +964,7 @@ keep:
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
 	mem_cgroup_uncharge_end();
-	*ret_nr_dirty += nr_dirty;
+	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
 	*ret_nr_writeback += nr_writeback;
 	return nr_reclaimed;
 }
@@ -1373,6 +1377,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 			(nr_taken >> (DEF_PRIORITY - sc->priority)))
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 
+	/*
+	 * Similarly, if many dirty pages are encountered that are not
+	 * currently being written then flag that kswapd should start
+	 * writing back pages.
+	 */
+	if (global_reclaim(sc) && nr_dirty &&
+			nr_dirty >= (nr_taken >> (DEF_PRIORITY - sc->priority)))
+		zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+
 	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
 		zone_idx(zone),
 		nr_scanned, nr_reclaimed,
@@ -2769,8 +2782,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 				end_zone = i;
 				break;
 			} else {
-				/* If balanced, clear the congested flag */
+				/*
+				 * If balanced, clear the dirty and congested
+				 * flags
+				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
+				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
 			}
 		}
 
@@ -2888,8 +2905,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 				 * possible there are dirty pages backed by
 				 * congested BDIs but as pressure is relieved,
 				 * speculatively avoid congestion waits
+				 * or writing pages from kswapd context.
 				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
+				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
 		}
 
 		/*
-- 
cgit v1.3-7-g2ca7


From 283aba9f9e0e4882bf09bd37a2983379a6fae805 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:01:51 -0700
Subject: mm: vmscan: block kswapd if it is encountering pages under writeback

Historically, kswapd used to congestion_wait() at higher priorities if
it was not making forward progress.  This made no sense as the failure
to make progress could be completely independent of IO.  It was later
replaced by wait_iff_congested() and removed entirely by commit 258401a6
(mm: don't wait on congested zones in balance_pgdat()) as it was
duplicating logic in shrink_inactive_list().

This is problematic.  If kswapd encounters many pages under writeback
and it continues to scan until it reaches the high watermark then it
will quickly skip over the pages under writeback and reclaim clean young
pages or push applications out to swap.

The use of wait_iff_congested() is not suited to kswapd as it will only
stall if the underlying BDI is really congested or a direct reclaimer
was unable to write to the underlying BDI.  kswapd bypasses the BDI
congestion as it sets PF_SWAPWRITE but even if this was taken into
account then it would cause direct reclaimers to stall on writeback
which is not desirable.

This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is
encountering too many pages under writeback.  If this flag is set and
kswapd encounters a PageReclaim page under writeback then it'll assume
that the LRU lists are being recycled too quickly before IO can complete
and block waiting for some IO to complete.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Tested-by: Zlatko Calusic <zcalusic@bitsync.net>
Cc: dormando <dormando@rydia.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |  8 +++++
 mm/vmscan.c            | 82 ++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 68 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2aaf72f7e345..fce64afba042 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -499,6 +499,9 @@ typedef enum {
 					 * many dirty file pages at the tail
 					 * of the LRU.
 					 */
+	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+					 * many pages under writeback
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone)
 	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
 }
 
+static inline int zone_is_reclaim_writeback(const struct zone *zone)
+{
+	return test_bit(ZONE_WRITEBACK, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d6c916d808ba..1109de0c35bf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete. Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    block for HZ/10 or until some IO completes then clear the
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked  because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 *                     isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
 
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
-- 
cgit v1.3-7-g2ca7


From b45972265f823ed01eae0867a176320071665787 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:02:05 -0700
Subject: mm: vmscan: take page buffers dirty and locked state into account

Page reclaim keeps track of dirty and under writeback pages and uses it
to determine if wait_iff_congested() should stall or if kswapd should
begin writing back pages.  This fails to account for buffer pages that
can be under writeback but not PageWriteback which is the case for
filesystems like ext3 ordered mode.  Furthermore, PageDirty buffer pages
can have all the buffers clean and writepage does no IO so it should not
be accounted as congested.

This patch adds an address_space operation that filesystems may
optionally use to check if a page is really dirty or really under
writeback.  An implementation is provided for for buffer_heads is added
and used for block operations and ext3 in ordered mode.  By default the
page flags are obeyed.

Credit goes to Jan Kara for identifying that the page flags alone are
not sufficient for ext3 and sanity checking a number of ideas on how the
problem could be addressed.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Cc: Zlatko Calusic <zcalusic@bitsync.net>
Cc: dormando <dormando@rydia.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c              |  1 +
 fs/buffer.c                 | 34 ++++++++++++++++++++++++++++++++++
 fs/ext3/inode.c             |  1 +
 include/linux/buffer_head.h |  3 +++
 include/linux/fs.h          |  1 +
 mm/vmscan.c                 | 10 ++++++++++
 6 files changed, 50 insertions(+)

(limited to 'include/linux')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 431b6a04ebfd..bb43ce081d6e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1562,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = {
 	.writepages	= generic_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
+	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
 const struct file_operations def_blk_fops = {
diff --git a/fs/buffer.c b/fs/buffer.c
index f93392e2df12..4d7433534f5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -82,6 +82,40 @@ void unlock_buffer(struct buffer_head *bh)
 }
 EXPORT_SYMBOL(unlock_buffer);
 
+/*
+ * Returns if the page has dirty or writeback buffers. If all the buffers
+ * are unlocked and clean then the PageDirty information is stale. If
+ * any of the pages are locked, it is assumed they are locked for IO.
+ */
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback)
+{
+	struct buffer_head *head, *bh;
+	*dirty = false;
+	*writeback = false;
+
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		return;
+
+	if (PageWriteback(page))
+		*writeback = true;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (buffer_locked(bh))
+			*writeback = true;
+
+		if (buffer_dirty(bh))
+			*dirty = true;
+
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(buffer_check_dirty_writeback);
+
 /*
  * Block until a buffer comes unlocked.  This doesn't stop it
  * from becoming locked again - you have to lock it yourself
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f67668f724ba..2bd85486b879 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1985,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = {
 	.direct_IO		= ext3_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
+	.is_dirty_writeback	= buffer_check_dirty_writeback,
 	.error_remove_page	= generic_error_remove_page,
 };
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index f5a3b838ddb0..91fa9a94ae92 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -139,6 +139,9 @@ BUFFER_FNS(Prio, prio)
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
+void buffer_check_dirty_writeback(struct page *page,
+				     bool *dirty, bool *writeback);
+
 /*
  * Declarations
  */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2b82c8041490..99be011e00de 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -380,6 +380,7 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 	int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
 					unsigned long);
+	void (*is_dirty_writeback) (struct page *, bool *, bool *);
 	int (*error_remove_page)(struct address_space *, struct page *);
 
 	/* swapfile support */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bf4778479e3a..c85794399848 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -673,6 +673,8 @@ static enum page_references page_check_references(struct page *page,
 static void page_check_dirty_writeback(struct page *page,
 				       bool *dirty, bool *writeback)
 {
+	struct address_space *mapping;
+
 	/*
 	 * Anonymous pages are not handled by flushers and must be written
 	 * from reclaim context. Do not stall reclaim based on them
@@ -686,6 +688,14 @@ static void page_check_dirty_writeback(struct page *page,
 	/* By default assume that the page flags are accurate */
 	*dirty = PageDirty(page);
 	*writeback = PageWriteback(page);
+
+	/* Verify dirty/writeback state if the filesystem supports it */
+	if (!page_has_private(page))
+		return;
+
+	mapping = page_mapping(page);
+	if (mapping && mapping->a_ops->is_dirty_writeback)
+		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
 /*
-- 
cgit v1.3-7-g2ca7


From 72c3b51bda557ab38d6c5b2af750c27cba15f828 Mon Sep 17 00:00:00 2001
From: Cody P Schafer <cody@linux.vnet.ibm.com>
Date: Wed, 3 Jul 2013 15:02:08 -0700
Subject: mm: fix comment referring to non-existent size_seqlock, change to
 span_seqlock

Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fce64afba042..9ec7cffcae21 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -733,7 +733,7 @@ typedef struct pglist_data {
 	 * or node_spanned_pages stay constant.  Holding this will also
 	 * guarantee that any pfn_valid() stays that way.
 	 *
-	 * Nests above zone->lock and zone->size_seqlock.
+	 * Nests above zone->lock and zone->span_seqlock
 	 */
 	spinlock_t node_size_lock;
 #endif
-- 
cgit v1.3-7-g2ca7


From 114d4b79f7e561fd76fb98eba79e18df7cdd60f0 Mon Sep 17 00:00:00 2001
From: Cody P Schafer <cody@linux.vnet.ibm.com>
Date: Wed, 3 Jul 2013 15:02:09 -0700
Subject: mmzone: note that node_size_lock should be manipulated via
 pgdat_resize_lock()

Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9ec7cffcae21..e511f9429f1e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -733,6 +733,9 @@ typedef struct pglist_data {
 	 * or node_spanned_pages stay constant.  Holding this will also
 	 * guarantee that any pfn_valid() stays that way.
 	 *
+	 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
+	 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
+	 *
 	 * Nests above zone->lock and zone->span_seqlock
 	 */
 	spinlock_t node_size_lock;
-- 
cgit v1.3-7-g2ca7


From 0fa73b86ef0797ca4fde5334117ca0b330f08030 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 3 Jul 2013 15:02:11 -0700
Subject: include/linux/mm.h: add PAGE_ALIGNED() helper

To test whether an address is aligned to PAGE_SIZE.

Cc: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>,
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 66d881f1d576..949bd7035895 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -52,6 +52,9 @@ extern unsigned long sysctl_admin_reserve_kbytes;
 /* to align the pointer to the (next) page boundary */
 #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)
 
+/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
+#define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)addr, PAGE_SIZE)
+
 /*
  * Linux kernel virtual memory manager primitives.
  * The idea being to have a "virtual" mm in the same way
-- 
cgit v1.3-7-g2ca7


From e69e9d4aee712a22665f008ae0550bb3d7c7f7c1 Mon Sep 17 00:00:00 2001
From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Date: Wed, 3 Jul 2013 15:02:18 -0700
Subject: vmalloc: introduce remap_vmalloc_range_partial

We want to allocate ELF note segment buffer on the 2nd kernel in vmalloc
space and remap it to user-space in order to reduce the risk that memory
allocation fails on system with huge number of CPUs and so with huge ELF
note segment that exceeds 11-order block size.

Although there's already remap_vmalloc_range for the purpose of
remapping vmalloc memory to user-space, we need to specify user-space
range via vma.
 Mmap on /proc/vmcore needs to remap range across multiple objects, so
the interface that requires vma to cover full range is problematic.

This patch introduces remap_vmalloc_range_partial that receives user-space
range as a pair of base address and size and can be used for mmap on
/proc/vmcore case.

remap_vmalloc_range is rewritten using remap_vmalloc_range_partial.

[akpm@linux-foundation.org: use PAGE_ALIGNED()]
Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp>
Cc: Lisa Mitchell <lisa.mitchell@hp.com>
Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h |  4 +++
 mm/vmalloc.c            | 67 +++++++++++++++++++++++++++++++++----------------
 2 files changed, 49 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 7d5773a99f20..dd0a2c810529 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -82,6 +82,10 @@ extern void *vmap(struct page **pages, unsigned int count,
 			unsigned long flags, pgprot_t prot);
 extern void vunmap(const void *addr);
 
+extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
+				       unsigned long uaddr, void *kaddr,
+				       unsigned long size);
+
 extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 							unsigned long pgoff);
 void vmalloc_sync_all(void);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3875fa2f0f60..b7259906a806 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1476,10 +1476,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	if (!addr)
 		return;
 
-	if ((PAGE_SIZE-1) & (unsigned long)addr) {
-		WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
+	if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
+			addr));
 		return;
-	}
 
 	area = remove_vm_area(addr);
 	if (unlikely(!area)) {
@@ -2148,42 +2147,43 @@ finished:
 }
 
 /**
- *	remap_vmalloc_range  -  map vmalloc pages to userspace
- *	@vma:		vma to cover (map full range of vma)
- *	@addr:		vmalloc memory
- *	@pgoff:		number of pages into addr before first page to map
+ *	remap_vmalloc_range_partial  -  map vmalloc pages to userspace
+ *	@vma:		vma to cover
+ *	@uaddr:		target user address to start at
+ *	@kaddr:		virtual address of vmalloc kernel memory
+ *	@size:		size of map area
  *
  *	Returns:	0 for success, -Exxx on failure
  *
- *	This function checks that addr is a valid vmalloc'ed area, and
- *	that it is big enough to cover the vma. Will return failure if
- *	that criteria isn't met.
+ *	This function checks that @kaddr is a valid vmalloc'ed area,
+ *	and that it is big enough to cover the range starting at
+ *	@uaddr in @vma. Will return failure if that criteria isn't
+ *	met.
  *
  *	Similar to remap_pfn_range() (see mm/memory.c)
  */
-int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
-						unsigned long pgoff)
+int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
+				void *kaddr, unsigned long size)
 {
 	struct vm_struct *area;
-	unsigned long uaddr = vma->vm_start;
-	unsigned long usize = vma->vm_end - vma->vm_start;
 
-	if ((PAGE_SIZE-1) & (unsigned long)addr)
+	size = PAGE_ALIGN(size);
+
+	if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
 		return -EINVAL;
 
-	area = find_vm_area(addr);
+	area = find_vm_area(kaddr);
 	if (!area)
 		return -EINVAL;
 
 	if (!(area->flags & VM_USERMAP))
 		return -EINVAL;
 
-	if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
+	if (kaddr + size > area->addr + area->size)
 		return -EINVAL;
 
-	addr += pgoff << PAGE_SHIFT;
 	do {
-		struct page *page = vmalloc_to_page(addr);
+		struct page *page = vmalloc_to_page(kaddr);
 		int ret;
 
 		ret = vm_insert_page(vma, uaddr, page);
@@ -2191,14 +2191,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 			return ret;
 
 		uaddr += PAGE_SIZE;
-		addr += PAGE_SIZE;
-		usize -= PAGE_SIZE;
-	} while (usize > 0);
+		kaddr += PAGE_SIZE;
+		size -= PAGE_SIZE;
+	} while (size > 0);
 
 	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
 	return 0;
 }
+EXPORT_SYMBOL(remap_vmalloc_range_partial);
+
+/**
+ *	remap_vmalloc_range  -  map vmalloc pages to userspace
+ *	@vma:		vma to cover (map full range of vma)
+ *	@addr:		vmalloc memory
+ *	@pgoff:		number of pages into addr before first page to map
+ *
+ *	Returns:	0 for success, -Exxx on failure
+ *
+ *	This function checks that addr is a valid vmalloc'ed area, and
+ *	that it is big enough to cover the vma. Will return failure if
+ *	that criteria isn't met.
+ *
+ *	Similar to remap_pfn_range() (see mm/memory.c)
+ */
+int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
+						unsigned long pgoff)
+{
+	return remap_vmalloc_range_partial(vma, vma->vm_start,
+					   addr + (pgoff << PAGE_SHIFT),
+					   vma->vm_end - vma->vm_start);
+}
 EXPORT_SYMBOL(remap_vmalloc_range);
 
 /*
-- 
cgit v1.3-7-g2ca7


From a0b8cab3b9b2efadabdcff264c450ca515e2619c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:02:32 -0700
Subject: mm: remove lru parameter from __pagevec_lru_add and remove parts of
 pagevec API

Now that the LRU to add a page to is decided at LRU-add time, remove the
misleading lru parameter from __pagevec_lru_add.  A consequence of this
is that the pagevec_lru_add_file, pagevec_lru_add_anon and similar
helpers are misleading as the caller no longer has direct control over
what LRU the page is added to.  Unused helpers are removed by this patch
and existing users of pagevec_lru_add_file() are converted to use
lru_cache_add_file() directly and use the per-cpu pagevecs instead of
creating their own pagevec.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Alexey Lyahkov <alexey.lyashkov@gmail.com>
Cc: Andrew Perepechko <anserper@ya.ru>
Cc: Robin Dong <sanbai@taobao.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Bernd Schubert <bernd.schubert@fastmail.fm>
Cc: David Howells <dhowells@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/cachefiles/rdwr.c    | 30 +++++++-----------------------
 fs/nfs/dir.c            |  7 ++-----
 include/linux/pagevec.h | 34 +---------------------------------
 mm/swap.c               | 12 ++++--------
 4 files changed, 14 insertions(+), 69 deletions(-)

(limited to 'include/linux')

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 317f9ee9c991..ebaff368120d 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -12,6 +12,7 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/file.h>
+#include <linux/swap.h>
 #include "internal.h"
 
 /*
@@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
  */
 static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 					    struct fscache_retrieval *op,
-					    struct page *netpage,
-					    struct pagevec *pagevec)
+					    struct page *netpage)
 {
 	struct cachefiles_one_read *monitor;
 	struct address_space *bmapping;
@@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 
 	_enter("");
 
-	pagevec_reinit(pagevec);
-
 	_debug("read back %p{%lu,%d}",
 	       netpage, netpage->index, page_count(netpage));
 
@@ -283,9 +281,7 @@ installed_new_backing_page:
 	backpage = newpage;
 	newpage = NULL;
 
-	page_cache_get(backpage);
-	pagevec_add(pagevec, backpage);
-	__pagevec_lru_add_file(pagevec);
+	lru_cache_add_file(backpage);
 
 read_backing_page:
 	ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	if (block) {
 		/* submit the apparently valid page to the backing fs to be
 		 * read from disk */
-		ret = cachefiles_read_backing_file_one(object, op, page,
-						       &pagevec);
+		ret = cachefiles_read_backing_file_one(object, op, page);
 	} else if (cachefiles_has_space(cache, 0, 1) == 0) {
 		/* there's space in the cache we can use */
 		fscache_mark_page_cached(op, page);
@@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 {
 	struct cachefiles_one_read *monitor = NULL;
 	struct address_space *bmapping = object->backer->d_inode->i_mapping;
-	struct pagevec lru_pvec;
 	struct page *newpage = NULL, *netpage, *_n, *backpage = NULL;
 	int ret = 0;
 
 	_enter("");
 
-	pagevec_init(&lru_pvec, 0);
-
 	list_for_each_entry_safe(netpage, _n, list, lru) {
 		list_del(&netpage->lru);
 
@@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 		backpage = newpage;
 		newpage = NULL;
 
-		page_cache_get(backpage);
-		if (!pagevec_add(&lru_pvec, backpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(backpage);
 
 	reread_backing_page:
 		ret = bmapping->a_ops->readpage(NULL, backpage);
@@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 			goto nomem;
 		}
 
-		page_cache_get(netpage);
-		if (!pagevec_add(&lru_pvec, netpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(netpage);
 
 		/* install a monitor */
 		page_cache_get(netpage);
@@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 		fscache_mark_page_cached(op, netpage);
 
-		page_cache_get(netpage);
-		if (!pagevec_add(&lru_pvec, netpage))
-			__pagevec_lru_add_file(&lru_pvec);
+		lru_cache_add_file(netpage);
 
 		/* the netpage is unlocked and marked up to date here */
 		fscache_end_io(op, netpage, 0);
@@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 
 out:
 	/* tidy up */
-	pagevec_lru_add_file(&lru_pvec);
-
 	if (newpage)
 		page_cache_release(newpage);
 	if (netpage)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5d051419527b..d7ed697133f0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -33,6 +33,7 @@
 #include <linux/pagevec.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/swap.h>
 #include <linux/sched.h>
 #include <linux/kmemleak.h>
 #include <linux/xattr.h>
@@ -1758,7 +1759,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink);
  */
 int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
-	struct pagevec lru_pvec;
 	struct page *page;
 	char *kaddr;
 	struct iattr attr;
@@ -1798,11 +1798,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	 * No big deal if we can't add this page to the page cache here.
 	 * READLINK will get the missing page from the server if needed.
 	 */
-	pagevec_init(&lru_pvec, 0);
-	if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0,
+	if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0,
 							GFP_KERNEL)) {
-		pagevec_add(&lru_pvec, page);
-		pagevec_lru_add_file(&lru_pvec);
 		SetPageUptodate(page);
 		unlock_page(page);
 	} else
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 2aa12b8499c0..e4dbfab37729 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -21,7 +21,7 @@ struct pagevec {
 };
 
 void __pagevec_release(struct pagevec *pvec);
-void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru);
+void __pagevec_lru_add(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
 unsigned pagevec_lookup_tag(struct pagevec *pvec,
@@ -64,36 +64,4 @@ static inline void pagevec_release(struct pagevec *pvec)
 		__pagevec_release(pvec);
 }
 
-static inline void __pagevec_lru_add_anon(struct pagevec *pvec)
-{
-	__pagevec_lru_add(pvec, LRU_INACTIVE_ANON);
-}
-
-static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec)
-{
-	__pagevec_lru_add(pvec, LRU_ACTIVE_ANON);
-}
-
-static inline void __pagevec_lru_add_file(struct pagevec *pvec)
-{
-	__pagevec_lru_add(pvec, LRU_INACTIVE_FILE);
-}
-
-static inline void __pagevec_lru_add_active_file(struct pagevec *pvec)
-{
-	__pagevec_lru_add(pvec, LRU_ACTIVE_FILE);
-}
-
-static inline void pagevec_lru_add_file(struct pagevec *pvec)
-{
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_file(pvec);
-}
-
-static inline void pagevec_lru_add_anon(struct pagevec *pvec)
-{
-	if (pagevec_count(pvec))
-		__pagevec_lru_add_anon(pvec);
-}
-
 #endif /* _LINUX_PAGEVEC_H */
diff --git a/mm/swap.c b/mm/swap.c
index c53d161fc76d..6a9d0c43924a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -505,7 +505,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
 
 	page_cache_get(page);
 	if (!pagevec_space(pvec))
-		__pagevec_lru_add(pvec, lru);
+		__pagevec_lru_add(pvec);
 	pagevec_add(pvec, page);
 	put_cpu_var(lru_add_pvec);
 }
@@ -628,7 +628,7 @@ void lru_add_drain_cpu(int cpu)
 	struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
 
 	if (pagevec_count(pvec))
-		__pagevec_lru_add(pvec, NR_LRU_LISTS);
+		__pagevec_lru_add(pvec);
 
 	pvec = &per_cpu(lru_rotate_pvecs, cpu);
 	if (pagevec_count(pvec)) {
@@ -832,12 +832,10 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 				 void *arg)
 {
-	enum lru_list requested_lru = (enum lru_list)arg;
 	int file = page_is_file_cache(page);
 	int active = PageActive(page);
 	enum lru_list lru = page_lru(page);
 
-	WARN_ON_ONCE(requested_lru < NR_LRU_LISTS && requested_lru != lru);
 	VM_BUG_ON(PageUnevictable(page));
 	VM_BUG_ON(PageLRU(page));
 
@@ -851,11 +849,9 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
  * Add the passed pages to the LRU, then drop the caller's refcount
  * on them.  Reinitialises the caller's pagevec.
  */
-void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
+void __pagevec_lru_add(struct pagevec *pvec)
 {
-	VM_BUG_ON(is_unevictable_lru(lru));
-
-	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
+	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
 }
 EXPORT_SYMBOL(__pagevec_lru_add);
 
-- 
cgit v1.3-7-g2ca7


From c53954a092d07c5684d31ea1fc813d262cff08a5 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 3 Jul 2013 15:02:34 -0700
Subject: mm: remove lru parameter from __lru_cache_add and lru_cache_add_lru

Similar to __pagevec_lru_add, this patch removes the LRU parameter from
__lru_cache_add and lru_cache_add_lru as the caller does not control the
exact LRU the page gets added to.  lru_cache_add_lru gets renamed to
lru_cache_add the name is silly without the lru parameter.  With the
parameter removed, it is required that the caller indicate if they want
the page added to the active or inactive list by setting or clearing
PageActive respectively.

[akpm@linux-foundation.org: Suggested the patch]
[gang.chen@asianux.com: fix used-unintialized warning]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Chen Gang <gang.chen@asianux.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Alexey Lyahkov <alexey.lyashkov@gmail.com>
Cc: Andrew Perepechko <anserper@ya.ru>
Cc: Robin Dong <sanbai@taobao.com>
Cc: Theodore Tso <tytso@mit.edu>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Bernd Schubert <bernd.schubert@fastmail.fm>
Cc: David Howells <dhowells@redhat.com>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 11 +++++++----
 mm/rmap.c            |  7 ++++---
 mm/swap.c            | 17 +++++++----------
 mm/vmscan.c          |  5 ++---
 4 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 1701ce4be746..85d74373002c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -10,6 +10,7 @@
 #include <linux/node.h>
 #include <linux/fs.h>
 #include <linux/atomic.h>
+#include <linux/page-flags.h>
 #include <asm/page.h>
 
 struct notifier_block;
@@ -233,8 +234,8 @@ extern unsigned long nr_free_pagecache_pages(void);
 
 
 /* linux/mm/swap.c */
-extern void __lru_cache_add(struct page *, enum lru_list lru);
-extern void lru_cache_add_lru(struct page *, enum lru_list lru);
+extern void __lru_cache_add(struct page *);
+extern void lru_cache_add(struct page *);
 extern void lru_add_page_tail(struct page *page, struct page *page_tail,
 			 struct lruvec *lruvec, struct list_head *head);
 extern void activate_page(struct page *);
@@ -254,12 +255,14 @@ extern void add_page_to_unevictable_list(struct page *page);
  */
 static inline void lru_cache_add_anon(struct page *page)
 {
-	__lru_cache_add(page, LRU_INACTIVE_ANON);
+	ClearPageActive(page);
+	__lru_cache_add(page);
 }
 
 static inline void lru_cache_add_file(struct page *page)
 {
-	__lru_cache_add(page, LRU_INACTIVE_FILE);
+	ClearPageActive(page);
+	__lru_cache_add(page);
 }
 
 /* linux/mm/vmscan.c */
diff --git a/mm/rmap.c b/mm/rmap.c
index 6280da86b5d6..e22ceeb6e5ec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page,
 	else
 		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
 	__page_set_anon_rmap(page, vma, address, 1);
-	if (!mlocked_vma_newpage(vma, page))
-		lru_cache_add_lru(page, LRU_ACTIVE_ANON);
-	else
+	if (!mlocked_vma_newpage(vma, page)) {
+		SetPageActive(page);
+		lru_cache_add(page);
+	} else
 		add_page_to_unevictable_list(page);
 }
 
diff --git a/mm/swap.c b/mm/swap.c
index 6a9d0c43924a..4a1d0d2c52fa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -494,15 +494,10 @@ EXPORT_SYMBOL(mark_page_accessed);
  * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
  * have the page added to the active list using mark_page_accessed().
  */
-void __lru_cache_add(struct page *page, enum lru_list lru)
+void __lru_cache_add(struct page *page)
 {
 	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 
-	if (is_active_lru(lru))
-		SetPageActive(page);
-	else
-		ClearPageActive(page);
-
 	page_cache_get(page);
 	if (!pagevec_space(pvec))
 		__pagevec_lru_add(pvec);
@@ -512,11 +507,10 @@ void __lru_cache_add(struct page *page, enum lru_list lru)
 EXPORT_SYMBOL(__lru_cache_add);
 
 /**
- * lru_cache_add_lru - add a page to a page list
+ * lru_cache_add - add a page to a page list
  * @page: the page to be added to the LRU.
- * @lru: the LRU list to which the page is added.
  */
-void lru_cache_add_lru(struct page *page, enum lru_list lru)
+void lru_cache_add(struct page *page)
 {
 	if (PageActive(page)) {
 		VM_BUG_ON(PageUnevictable(page));
@@ -525,7 +519,7 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru)
 	}
 
 	VM_BUG_ON(PageLRU(page));
-	__lru_cache_add(page, lru);
+	__lru_cache_add(page);
 }
 
 /**
@@ -745,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold)
 			del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		}
 
+		/* Clear Active bit in case of parallel mark_page_accessed */
+		ClearPageActive(page);
+
 		list_add(&page->lru, &pages_to_free);
 	}
 	if (zone)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c85794399848..99b3ac7771ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 void putback_lru_page(struct page *page)
 {
 	int lru;
-	int active = !!TestClearPageActive(page);
 	int was_unevictable = PageUnevictable(page);
 
 	VM_BUG_ON(PageLRU(page));
@@ -561,8 +560,8 @@ redo:
 		 * unevictable page on [in]active list.
 		 * We know how to handle that.
 		 */
-		lru = active + page_lru_base_type(page);
-		lru_cache_add_lru(page, lru);
+		lru = page_lru_base_type(page);
+		lru_cache_add(page);
 	} else {
 		/*
 		 * Put unevictable pages directly on zone's unevictable
-- 
cgit v1.3-7-g2ca7


From 4c42efa266030f32227f35fabbdd986fcf8f2ec4 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Wed, 3 Jul 2013 15:02:41 -0700
Subject: mm/pageblock: remove get/set_pageblock_flags

get_pageblock_flags and set_pageblock_flags are not used any more, this
patch removes them.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pageblock-flags.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h
index be655e4a2a75..2ee8cd2466b5 100644
--- a/include/linux/pageblock-flags.h
+++ b/include/linux/pageblock-flags.h
@@ -80,10 +80,4 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 							PB_migrate_skip)
 #endif /* CONFIG_COMPACTION */
 
-#define get_pageblock_flags(page) \
-			get_pageblock_flags_group(page, 0, PB_migrate_end)
-#define set_pageblock_flags(page, flags) \
-			set_pageblock_flags_group(page, flags,	\
-						  0, PB_migrate_end)
-
 #endif	/* PAGEBLOCK_FLAGS_H */
-- 
cgit v1.3-7-g2ca7


From 5f1e31d2f5977d910d0b2f5018173e99241d1940 Mon Sep 17 00:00:00 2001
From: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Date: Wed, 3 Jul 2013 15:02:42 -0700
Subject: mm/hugetlb: remove hugetlb_prefault

hugetlb_prefault() is not used any more, this patch removes it.

Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 89d4fbf681e7..c2b1801a160b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -55,7 +55,6 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 				unsigned long start, unsigned long end,
 				struct page *ref_page);
-int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(int, char *);
 void hugetlb_show_meminfo(void);
@@ -114,7 +113,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define follow_hugetlb_page(m,v,p,vs,a,b,i,w)	({ BUG(); 0; })
 #define follow_huge_addr(mm, addr, write)	ERR_PTR(-EINVAL)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
-#define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
 static inline void hugetlb_report_meminfo(struct seq_file *m)
 {
 }
-- 
cgit v1.3-7-g2ca7


From 917d9290af749fac9c4d90bacf18699c9d8ba28d Mon Sep 17 00:00:00 2001
From: Tim Chen <tim.c.chen@linux.intel.com>
Date: Wed, 3 Jul 2013 15:02:44 -0700
Subject: mm: tune vm_committed_as percpu_counter batching size

Currently the per cpu counter's batch size for memory accounting is
configured as twice the number of cpus in the system.  However, for
system with very large memory, it is more appropriate to make it
proportional to the memory size per cpu in the system.

For example, for a x86_64 system with 64 cpus and 128 GB of memory, the
batch size is only 2*64 pages (0.5 MB).  So any memory accounting
changes of more than 0.5MB will overflow the per cpu counter into the
global counter.  Instead, for the new scheme, the batch size is
configured to be 0.4% of the memory/cpu = 8MB (128 GB/64 /256), which is
more inline with the memory size.

I've done a repeated brk test of 800KB (from will-it-scale test suite)
with 80 concurrent processes on a 4 socket Westmere machine with a total
of 40 cores.  Without the patch, about 80% of cpu is spent on spin-lock
contention within the vm_committed_as counter.  With the patch, there's
a 73x speedup on the benchmark and the lock contention drops off almost
entirely.

[akpm@linux-foundation.org: fix section mismatch]
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mman.h |  8 +++++++-
 mm/mm_init.c         | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mman.h b/include/linux/mman.h
index 9aa863da287f..92dc257251e4 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -11,11 +11,17 @@ extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern struct percpu_counter vm_committed_as;
 
+#ifdef CONFIG_SMP
+extern s32 vm_committed_as_batch;
+#else
+#define vm_committed_as_batch 0
+#endif
+
 unsigned long vm_memory_committed(void);
 
 static inline void vm_acct_memory(long pages)
 {
-	percpu_counter_add(&vm_committed_as, pages);
+	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
 }
 
 static inline void vm_unacct_memory(long pages)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02ea11e..633c08863fd8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
 #include <linux/init.h>
 #include <linux/kobject.h>
 #include <linux/export.h>
+#include <linux/memory.h>
+#include <linux/notifier.h>
 #include "internal.h"
 
 #ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
 struct kobject *mm_kobj;
 EXPORT_SYMBOL_GPL(mm_kobj);
 
+#ifdef CONFIG_SMP
+s32 vm_committed_as_batch = 32;
+
+static void __meminit mm_compute_batch(void)
+{
+	u64 memsized_batch;
+	s32 nr = num_present_cpus();
+	s32 batch = max_t(s32, nr*2, 32);
+
+	/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
+	memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
+
+	vm_committed_as_batch = max_t(s32, memsized_batch, batch);
+}
+
+static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
+					unsigned long action, void *arg)
+{
+	switch (action) {
+	case MEM_ONLINE:
+	case MEM_OFFLINE:
+		mm_compute_batch();
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block compute_batch_nb __meminitdata = {
+	.notifier_call = mm_compute_batch_notifier,
+	.priority = IPC_CALLBACK_PRI, /* use lowest priority */
+};
+
+static int __init mm_compute_batch_init(void)
+{
+	mm_compute_batch();
+	register_hotmemory_notifier(&compute_batch_nb);
+
+	return 0;
+}
+
+__initcall(mm_compute_batch_init);
+
+#endif
+
 static int __init mm_sysfs_init(void)
 {
 	mm_kobj = kobject_create_and_add("mm", kernel_kobj);
-- 
cgit v1.3-7-g2ca7


From dcf6b7ddd7df8965727746f89c59229b23180e5a Mon Sep 17 00:00:00 2001
From: Rafael Aquini <aquini@redhat.com>
Date: Wed, 3 Jul 2013 15:02:46 -0700
Subject: swap: discard while swapping only if SWAP_FLAG_DISCARD_PAGES

Considering the use cases where the swap device supports discard:
a) and can do it quickly;
b) but it's slow to do in small granularities (or concurrent with other
   I/O);
c) but the implementation is so horrendous that you don't even want to
   send one down;

And assuming that the sysadmin considers it useful to send the discards down
at all, we would (probably) want the following solutions:

  i. do the fine-grained discards for freed swap pages, if device is
     capable of doing so optimally;
 ii. do single-time (batched) swap area discards, either at swapon
     or via something like fstrim (not implemented yet);
iii. allow doing both single-time and fine-grained discards; or
 iv. turn it off completely (default behavior)

As implemented today, one can only enable/disable discards for swap, but
one cannot select, for instance, solution (ii) on a swap device like (b)
even though the single-time discard is regarded to be interesting, or
necessary to the workload because it would imply (1), and the device is
not capable of performing it optimally.

This patch addresses the scenario depicted above by introducing a way to
ensure the (probably) wanted solutions (i, ii, iii and iv) can be flexibly
flagged through swapon(8) to allow a sysadmin to select the best suitable
swap discard policy accordingly to system constraints.

This patch introduces SWAP_FLAG_DISCARD_PAGES and SWAP_FLAG_DISCARD_ONCE
new flags to allow more flexibe swap discard policies being flagged
through swapon(8).  The default behavior is to keep both single-time, or
batched, area discards (SWAP_FLAG_DISCARD_ONCE) and fine-grained discards
for page-clusters (SWAP_FLAG_DISCARD_PAGES) enabled, in order to keep
consistentcy with older kernel behavior, as well as maintain compatibility
with older swapon(8).  However, through the new introduced flags the best
suitable discard policy can be selected accordingly to any given swap
device constraint.

[akpm@linux-foundation.org: tweak comments]
Signed-off-by: Rafael Aquini <aquini@redhat.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Shaohua Li <shli@kernel.org>
Cc: Karel Zak <kzak@redhat.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h | 13 +++++++++----
 mm/swapfile.c        | 55 +++++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 59 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 85d74373002c..d95cde5e257d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -20,10 +20,13 @@ struct bio;
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
 #define SWAP_FLAG_PRIO_SHIFT	0
-#define SWAP_FLAG_DISCARD	0x10000 /* discard swap cluster after use */
+#define SWAP_FLAG_DISCARD	0x10000 /* enable discard for swap */
+#define SWAP_FLAG_DISCARD_ONCE	0x20000 /* discard swap area at swapon-time */
+#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */
 
 #define SWAP_FLAGS_VALID	(SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
-				 SWAP_FLAG_DISCARD)
+				 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
+				 SWAP_FLAG_DISCARD_PAGES)
 
 static inline int current_is_kswapd(void)
 {
@@ -147,14 +150,16 @@ struct swap_extent {
 enum {
 	SWP_USED	= (1 << 0),	/* is slot in swap_info[] used? */
 	SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap?	*/
-	SWP_DISCARDABLE = (1 << 2),	/* swapon+blkdev support discard */
+	SWP_DISCARDABLE = (1 << 2),	/* blkdev support discard */
 	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
 	SWP_SOLIDSTATE	= (1 << 4),	/* blkdev seeks are cheap */
 	SWP_CONTINUED	= (1 << 5),	/* swap_map has count continuation */
 	SWP_BLKDEV	= (1 << 6),	/* its a block device */
 	SWP_FILE	= (1 << 7),	/* set after swap_activate success */
+	SWP_AREA_DISCARD = (1 << 8),	/* single-time swap area discards */
+	SWP_PAGE_DISCARD = (1 << 9),	/* freed swap page-cluster discards */
 					/* add others here before... */
-	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
+	SWP_SCANNING	= (1 << 10),	/* refcount in scan_swap_map */
 };
 
 #define SWAP_CLUSTER_MAX 32UL
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 746af55b8455..36af6eeaa67e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
 			si->cluster_nr = SWAPFILE_CLUSTER - 1;
 			goto checks;
 		}
-		if (si->flags & SWP_DISCARDABLE) {
+		if (si->flags & SWP_PAGE_DISCARD) {
 			/*
 			 * Start range check on racing allocations, in case
 			 * they overlap the cluster we eventually decide on
@@ -322,7 +322,7 @@ checks:
 
 	if (si->lowest_alloc) {
 		/*
-		 * Only set when SWP_DISCARDABLE, and there's a scan
+		 * Only set when SWP_PAGE_DISCARD, and there's a scan
 		 * for a free cluster in progress or just completed.
 		 */
 		if (found_free_cluster) {
@@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	return nr_extents;
 }
 
+/*
+ * Helper to sys_swapon determining if a given swap
+ * backing device queue supports DISCARD operations.
+ */
+static bool swap_discardable(struct swap_info_struct *si)
+{
+	struct request_queue *q = bdev_get_queue(si->bdev);
+
+	if (!q || !blk_queue_discard(q))
+		return false;
+
+	return true;
+}
+
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
 	struct swap_info_struct *p;
@@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 			p->flags |= SWP_SOLIDSTATE;
 			p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
 		}
-		if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
-			p->flags |= SWP_DISCARDABLE;
+
+		if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+			/*
+			 * When discard is enabled for swap with no particular
+			 * policy flagged, we set all swap discard flags here in
+			 * order to sustain backward compatibility with older
+			 * swapon(8) releases.
+			 */
+			p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
+				     SWP_PAGE_DISCARD);
+
+			/*
+			 * By flagging sys_swapon, a sysadmin can tell us to
+			 * either do single-time area discards only, or to just
+			 * perform discards for released swap page-clusters.
+			 * Now it's time to adjust the p->flags accordingly.
+			 */
+			if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
+				p->flags &= ~SWP_PAGE_DISCARD;
+			else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
+				p->flags &= ~SWP_AREA_DISCARD;
+
+			/* issue a swapon-time discard if it's still required */
+			if (p->flags & SWP_AREA_DISCARD) {
+				int err = discard_swap(p);
+				if (unlikely(err))
+					printk(KERN_ERR
+					       "swapon: discard_swap(%p): %d\n",
+						p, err);
+			}
+		}
 	}
 
 	mutex_lock(&swapon_mutex);
@@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	enable_swap_info(p, prio, swap_map, frontswap_map);
 
 	printk(KERN_INFO "Adding %uk swap on %s.  "
-			"Priority:%d extents:%d across:%lluk %s%s%s\n",
+			"Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
 		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
 		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
 		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
 		(p->flags & SWP_DISCARDABLE) ? "D" : "",
+		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
+		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
 		(frontswap_map) ? "FS" : "");
 
 	mutex_unlock(&swapon_mutex);
-- 
cgit v1.3-7-g2ca7


From 11199692d83dd3fe1511203024fb9853d176ec4c Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:02:48 -0700
Subject: mm: change signature of free_reserved_area() to fix building warnings

Change signature of free_reserved_area() according to Russell King's
suggestion to fix following build warnings:

  arch/arm/mm/init.c: In function 'mem_init':
  arch/arm/mm/init.c:603:2: warning: passing argument 1 of 'free_reserved_area' makes integer from pointer without a cast [enabled by default]
    free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL);
    ^
  In file included from include/linux/mman.h:4:0,
                   from arch/arm/mm/init.c:15:
  include/linux/mm.h:1301:22: note: expected 'long unsigned int' but argument is of type 'void *'
   extern unsigned long free_reserved_area(unsigned long start, unsigned long end,

   mm/page_alloc.c: In function 'free_reserved_area':
>> mm/page_alloc.c:5134:3: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast [enabled by default]
   In file included from arch/mips/include/asm/page.h:49:0,
                    from include/linux/mmzone.h:20,
                    from include/linux/gfp.h:4,
                    from include/linux/mm.h:8,
                    from mm/page_alloc.c:18:
   arch/mips/include/asm/io.h:119:29: note: expected 'const volatile void *' but argument is of type 'long unsigned int'
   mm/page_alloc.c: In function 'free_area_init_nodes':
   mm/page_alloc.c:5030:34: warning: array subscript is below array bounds [-Warray-bounds]

Also address some minor code review comments.

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Reported-by: Arnd Bergmann <arnd@arndb.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: <sworddragon2@aol.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michel Lespinasse <walken@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/sys_nautilus.c |  4 ++--
 arch/alpha/mm/init.c             |  2 +-
 arch/arc/mm/init.c               |  2 +-
 arch/arm/mm/init.c               |  2 +-
 arch/arm64/mm/init.c             |  2 +-
 arch/avr32/mm/init.c             |  2 +-
 arch/blackfin/mm/init.c          |  2 +-
 arch/c6x/mm/init.c               |  2 +-
 arch/frv/mm/init.c               |  2 +-
 arch/h8300/mm/init.c             |  2 +-
 arch/ia64/mm/init.c              |  3 +--
 arch/m32r/mm/init.c              |  2 +-
 arch/m68k/mm/init.c              |  2 +-
 arch/metag/mm/init.c             |  3 ++-
 arch/microblaze/mm/init.c        |  2 +-
 arch/mips/mm/init.c              |  3 ++-
 arch/mn10300/mm/init.c           |  3 ++-
 arch/openrisc/mm/init.c          |  2 +-
 arch/parisc/mm/init.c            |  3 ++-
 arch/powerpc/kernel/kvm.c        |  2 +-
 arch/powerpc/mm/mem.c            |  2 +-
 arch/s390/mm/init.c              |  3 ++-
 arch/score/mm/init.c             |  3 ++-
 arch/sh/mm/init.c                |  2 +-
 arch/sparc/mm/init_32.c          |  4 ++--
 arch/sparc/mm/init_64.c          |  4 ++--
 arch/um/kernel/mem.c             |  2 +-
 arch/unicore32/mm/init.c         |  2 +-
 arch/xtensa/mm/init.c            |  2 +-
 include/linux/mm.h               |  5 ++---
 mm/page_alloc.c                  | 19 ++++++++++---------
 31 files changed, 50 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c
index 1d4aabfcf9a1..891bd274ccb5 100644
--- a/arch/alpha/kernel/sys_nautilus.c
+++ b/arch/alpha/kernel/sys_nautilus.c
@@ -238,8 +238,8 @@ nautilus_init_pci(void)
 	if (pci_mem < memtop)
 		memtop = pci_mem;
 	if (memtop > alpha_mv.min_mem_address) {
-		free_reserved_area((unsigned long)__va(alpha_mv.min_mem_address),
-				   (unsigned long)__va(memtop), 0, NULL);
+		free_reserved_area(__va(alpha_mv.min_mem_address),
+				   __va(memtop), 0, NULL);
 		printk("nautilus_init_pci: %ldk freed\n",
 			(memtop - alpha_mv.min_mem_address) >> 10);
 	}
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 0ba85ee4a466..d54848d4e464 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -326,6 +326,6 @@ free_initmem(void)
 void
 free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 4a177365b2c4..dce02e4716a1 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -152,7 +152,7 @@ void __init_refok free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 2ffee02d1d5c..7fae391caf86 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -745,7 +745,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (!keep_initrd) {
 		poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
-		free_reserved_area(start, end, 0, "initrd");
+		free_reserved_area((void *)start, (void *)end, 0, "initrd");
 	}
 }
 
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index f497ca77925a..6041e4008a83 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -398,7 +398,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (!keep_initrd) {
 		poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
-		free_reserved_area(start, end, 0, "initrd");
+		free_reserved_area((void *)start, (void *)end, 0, "initrd");
 	}
 }
 
diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c
index e66e8406f992..5a79fa08cb3c 100644
--- a/arch/avr32/mm/init.c
+++ b/arch/avr32/mm/init.c
@@ -154,6 +154,6 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c
index 82d01a71207f..8e9eab272811 100644
--- a/arch/blackfin/mm/init.c
+++ b/arch/blackfin/mm/init.c
@@ -133,7 +133,7 @@ void __init mem_init(void)
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
 #ifndef CONFIG_MPU
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 #endif
 }
 #endif
diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c
index b74ccb5a7690..07bfcc98a3b7 100644
--- a/arch/c6x/mm/init.c
+++ b/arch/c6x/mm/init.c
@@ -78,7 +78,7 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index dee354fa6b64..a67f3a5897b8 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -173,6 +173,6 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 } /* end free_initrd_mem() */
 #endif
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index ff349d70a29b..57e03c59861d 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -161,7 +161,7 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index d1fe4b402601..da568c2e839f 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -154,8 +154,7 @@ ia64_init_addr_space (void)
 void
 free_initmem (void)
 {
-	free_reserved_area((unsigned long)ia64_imva(__init_begin),
-			   (unsigned long)ia64_imva(__init_end),
+	free_reserved_area(ia64_imva(__init_begin), ia64_imva(__init_end),
 			   0, "unused kernel");
 }
 
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index ab4cbce91a9b..d80412d0c14e 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -191,6 +191,6 @@ void free_initmem(void)
  *======================================================================*/
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index 1af2ca3411f6..95de725534e9 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -202,6 +202,6 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index d05b8455c44c..5e2238dd72e0 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -414,7 +414,8 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
+	free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+			   "initrd");
 }
 #endif
 
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index b38ae3acfeb4..d7b8ada9345f 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -235,7 +235,7 @@ void __init setup_memory(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 9b973e0af9cb..268f2a94031b 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -440,7 +440,8 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
+	free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+			   "initrd");
 }
 #endif
 
diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c
index 5a8ace63a6b4..e19049d1f2b9 100644
--- a/arch/mn10300/mm/init.c
+++ b/arch/mn10300/mm/init.c
@@ -152,6 +152,7 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
+	free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+			   "initrd");
 }
 #endif
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index b3cbc6703837..ab113325dc4c 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -261,7 +261,7 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 505b56c6b9b9..3223d5e4a372 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -1101,6 +1101,7 @@ void flush_tlb_all(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	num_physpages += free_reserved_area(start, end, 0, "initrd");
+	num_physpages += free_reserved_area((void *)start, (void *)end, 0,
+					    "initrd");
 }
 #endif
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 6782221d49bd..5e4830a33c08 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -756,7 +756,7 @@ static __init void kvm_free_tmp(void)
 	end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
 
 	/* Free the tmp space we don't need */
-	free_reserved_area(start, end, 0, NULL);
+	free_reserved_area((void *)start, (void *)end, 0, NULL);
 }
 
 static int __init kvm_guest_init(void)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 0988a26e0413..347c5b1bbd62 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -407,7 +407,7 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 89ebae4008f2..0878c89fe7d2 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -172,7 +172,8 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
+	free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+			   "initrd");
 }
 #endif
 
diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c
index 0940682ab38b..f5dd61eb4544 100644
--- a/arch/score/mm/init.c
+++ b/arch/score/mm/init.c
@@ -108,7 +108,8 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd");
+	free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
+			   "initrd");
 }
 #endif
 
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 20f9ead650d3..b892a9b7d7e3 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -505,7 +505,7 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index af472cf7c69a..d5f9c023826f 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -372,8 +372,8 @@ void free_initmem (void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
-					    "initrd");
+	num_physpages += free_reserved_area((void *)start, (void *)end,
+					    POISON_FREE_INITMEM, "initrd");
 }
 #endif
 
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index 04fd55a6e461..8269deb84eda 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -2131,8 +2131,8 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM,
-					    "initrd");
+	num_physpages += free_reserved_area((void *)start, (void *)end,
+					    POISON_FREE_INITMEM, "initrd");
 }
 #endif
 
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 9df292b270a8..2aa7a2448d58 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -244,7 +244,7 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area(start, end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 63df12d71ce3..220755cc9700 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -486,7 +486,7 @@ static int keep_initrd;
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (!keep_initrd)
-		free_reserved_area(start, end, 0, "initrd");
+		free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 
 static int __init keepinitrd_setup(char *__unused)
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index bba125b4bb06..4d658efc3289 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -214,7 +214,7 @@ extern int initrd_is_mapped;
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (initrd_is_mapped)
-		free_reserved_area(start, end, 0, "initrd");
+		free_reserved_area((void *)start, (void *)end, 0, "initrd");
 }
 #endif
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 949bd7035895..be1b96ce0650 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1311,7 +1311,7 @@ extern void free_initmem(void);
  * "poison" if it's non-zero.
  * Return pages freed into the buddy system.
  */
-extern unsigned long free_reserved_area(unsigned long start, unsigned long end,
+extern unsigned long free_reserved_area(void *start, void *end,
 					int poison, char *s);
 #ifdef	CONFIG_HIGHMEM
 /*
@@ -1355,8 +1355,7 @@ static inline unsigned long free_initmem_default(int poison)
 {
 	extern char __init_begin[], __init_end[];
 
-	return free_reserved_area(PAGE_ALIGN((unsigned long)&__init_begin) ,
-				  ((unsigned long)&__init_end) & PAGE_MASK,
+	return free_reserved_area(&__init_begin, &__init_end,
 				  poison, "unused kernel");
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d711dcdda362..be18ccd017bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5206,25 +5206,26 @@ early_param("movablecore", cmdline_parse_movablecore);
 
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
-unsigned long free_reserved_area(unsigned long start, unsigned long end,
-				 int poison, char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
 {
-	unsigned long pages, pos;
+	void *pos;
+	unsigned long pages = 0;
 
-	pos = start = PAGE_ALIGN(start);
-	end &= PAGE_MASK;
-	for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) {
+	start = (void *)PAGE_ALIGN((unsigned long)start);
+	end = (void *)((unsigned long)end & PAGE_MASK);
+	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
 		if (poison)
-			memset((void *)pos, poison, PAGE_SIZE);
-		free_reserved_page(virt_to_page((void *)pos));
+			memset(pos, poison, PAGE_SIZE);
+		free_reserved_page(virt_to_page(pos));
 	}
 
 	if (pages && s)
-		pr_info("Freeing %s memory: %ldK (%lx - %lx)\n",
+		pr_info("Freeing %s memory: %ldK (%p - %p)\n",
 			s, pages << (PAGE_SHIFT - 10), start, end);
 
 	return pages;
 }
+EXPORT_SYMBOL(free_reserved_area);
 
 #ifdef	CONFIG_HIGHMEM
 void free_highmem_page(struct page *page)
-- 
cgit v1.3-7-g2ca7


From dbe67df4ba78c79db547c7864e1120981c144c97 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:02:51 -0700
Subject: mm: enhance free_reserved_area() to support poisoning memory with
 zero

Address more review comments from last round of code review.
1) Enhance free_reserved_area() to support poisoning freed memory with
   pattern '0'. This could be used to get rid of poison_init_mem()
   on ARM64.
2) A previous patch has disabled memory poison for initmem on s390
   by mistake, so restore to the original behavior.
3) Remove redundant PAGE_ALIGN() when calling free_reserved_area().

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: <sworddragon2@aol.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michel Lespinasse <walken@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/sys_nautilus.c | 2 +-
 arch/alpha/mm/init.c             | 4 ++--
 arch/arc/mm/init.c               | 4 ++--
 arch/arm/mm/init.c               | 8 ++++----
 arch/arm64/mm/init.c             | 4 ++--
 arch/avr32/mm/init.c             | 4 ++--
 arch/blackfin/mm/init.c          | 4 ++--
 arch/c6x/mm/init.c               | 4 ++--
 arch/cris/mm/init.c              | 2 +-
 arch/frv/mm/init.c               | 4 ++--
 arch/h8300/mm/init.c             | 4 ++--
 arch/ia64/mm/init.c              | 2 +-
 arch/m32r/mm/init.c              | 4 ++--
 arch/m68k/mm/init.c              | 4 ++--
 arch/microblaze/mm/init.c        | 4 ++--
 arch/openrisc/mm/init.c          | 4 ++--
 arch/parisc/mm/init.c            | 4 ++--
 arch/powerpc/kernel/kvm.c        | 9 ++-------
 arch/powerpc/mm/mem.c            | 2 +-
 arch/s390/mm/init.c              | 2 +-
 arch/sh/mm/init.c                | 4 ++--
 arch/um/kernel/mem.c             | 2 +-
 arch/unicore32/mm/init.c         | 4 ++--
 arch/xtensa/mm/init.c            | 4 ++--
 include/linux/mm.h               | 7 ++++---
 mm/page_alloc.c                  | 2 +-
 26 files changed, 49 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c
index 891bd274ccb5..837c0fa58317 100644
--- a/arch/alpha/kernel/sys_nautilus.c
+++ b/arch/alpha/kernel/sys_nautilus.c
@@ -239,7 +239,7 @@ nautilus_init_pci(void)
 		memtop = pci_mem;
 	if (memtop > alpha_mv.min_mem_address) {
 		free_reserved_area(__va(alpha_mv.min_mem_address),
-				   __va(memtop), 0, NULL);
+				   __va(memtop), -1, NULL);
 		printk("nautilus_init_pci: %ldk freed\n",
 			(memtop - alpha_mv.min_mem_address) >> 10);
 	}
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index d54848d4e464..218c29c14bb3 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -319,13 +319,13 @@ mem_init(void)
 void
 free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void
 free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index dce02e4716a1..f9c707712096 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -146,13 +146,13 @@ void __init mem_init(void)
  */
 void __init_refok free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 7fae391caf86..2070651c1bb4 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -601,7 +601,7 @@ void __init mem_init(void)
 
 #ifdef CONFIG_SA1111
 	/* now that our DMA memory is actually so designated, we can free it */
-	free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL);
+	free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, -1, NULL);
 #endif
 
 	free_highpages();
@@ -729,12 +729,12 @@ void free_initmem(void)
 	extern char __tcm_start, __tcm_end;
 
 	poison_init_mem(&__tcm_start, &__tcm_end - &__tcm_start);
-	free_reserved_area(&__tcm_start, &__tcm_end, 0, "TCM link");
+	free_reserved_area(&__tcm_start, &__tcm_end, -1, "TCM link");
 #endif
 
 	poison_init_mem(__init_begin, __init_end - __init_begin);
 	if (!machine_is_integrator() && !machine_is_cintegrator())
-		free_initmem_default(0);
+		free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -745,7 +745,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (!keep_initrd) {
 		poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
-		free_reserved_area((void *)start, (void *)end, 0, "initrd");
+		free_reserved_area((void *)start, (void *)end, -1, "initrd");
 	}
 }
 
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 6041e4008a83..997c6345cdd6 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -387,7 +387,7 @@ void __init mem_init(void)
 void free_initmem(void)
 {
 	poison_init_mem(__init_begin, __init_end - __init_begin);
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -398,7 +398,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (!keep_initrd) {
 		poison_init_mem((void *)start, PAGE_ALIGN(end) - start);
-		free_reserved_area((void *)start, (void *)end, 0, "initrd");
+		free_reserved_area((void *)start, (void *)end, -1, "initrd");
 	}
 }
 
diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c
index 5a79fa08cb3c..b079e04f6954 100644
--- a/arch/avr32/mm/init.c
+++ b/arch/avr32/mm/init.c
@@ -148,12 +148,12 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c
index 8e9eab272811..fa241f5a7dcf 100644
--- a/arch/blackfin/mm/init.c
+++ b/arch/blackfin/mm/init.c
@@ -133,7 +133,7 @@ void __init mem_init(void)
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
 #ifndef CONFIG_MPU
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 #endif
 }
 #endif
@@ -141,7 +141,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 void __init_refok free_initmem(void)
 {
 #if defined CONFIG_RAMKERNEL && !defined CONFIG_MPU
-	free_initmem_default(0);
+	free_initmem_default(-1);
 	if (memory_start == (unsigned long)(&__init_end))
 		memory_start = (unsigned long)(&__init_begin);
 #endif
diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c
index 07bfcc98a3b7..3987a20fdee6 100644
--- a/arch/c6x/mm/init.c
+++ b/arch/c6x/mm/init.c
@@ -78,11 +78,11 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
 void __init free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c
index 9ac80946dada..8fec26392ae7 100644
--- a/arch/cris/mm/init.c
+++ b/arch/cris/mm/init.c
@@ -65,5 +65,5 @@ mem_init(void)
 void 
 free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index a67f3a5897b8..8ba9d22d0d91 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -162,7 +162,7 @@ void __init mem_init(void)
 void free_initmem(void)
 {
 #if defined(CONFIG_RAMKERNEL) && !defined(CONFIG_PROTECT_KERNEL)
-	free_initmem_default(0);
+	free_initmem_default(-1);
 #endif
 } /* end free_initmem() */
 
@@ -173,6 +173,6 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 } /* end free_initrd_mem() */
 #endif
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 57e03c59861d..c831f1dba132 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -161,7 +161,7 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
@@ -169,7 +169,7 @@ void
 free_initmem(void)
 {
 #ifdef CONFIG_RAMKERNEL
-	free_initmem_default(0);
+	free_initmem_default(-1);
 #endif
 }
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index da568c2e839f..f8a4f38b0ad5 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -155,7 +155,7 @@ void
 free_initmem (void)
 {
 	free_reserved_area(ia64_imva(__init_begin), ia64_imva(__init_end),
-			   0, "unused kernel");
+			   -1, "unused kernel");
 }
 
 void __init
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index d80412d0c14e..cca87d918436 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -181,7 +181,7 @@ void __init mem_init(void)
  *======================================================================*/
 void free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -191,6 +191,6 @@ void free_initmem(void)
  *======================================================================*/
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index 95de725534e9..ab0b54ca5d85 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -110,7 +110,7 @@ void __init paging_init(void)
 void free_initmem(void)
 {
 #ifndef CONFIG_MMU_SUN3
-	free_initmem_default(0);
+	free_initmem_default(-1);
 #endif /* CONFIG_MMU_SUN3 */
 }
 
@@ -202,6 +202,6 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index d7b8ada9345f..d149e0ebb767 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -235,13 +235,13 @@ void __init setup_memory(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
 void free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 void __init mem_init(void)
diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c
index ab113325dc4c..c371e4a0fcac 100644
--- a/arch/openrisc/mm/init.c
+++ b/arch/openrisc/mm/init.c
@@ -261,11 +261,11 @@ void __init mem_init(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
 void free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 3223d5e4a372..ebac7bd76b56 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -532,7 +532,7 @@ void free_initmem(void)
 	 * pages are no-longer executable */
 	flush_icache_range(init_begin, init_end);
 	
-	num_physpages += free_initmem_default(0);
+	num_physpages += free_initmem_default(-1);
 
 	/* set up a new led state on systems shipped LED State panel */
 	pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE);
@@ -1101,7 +1101,7 @@ void flush_tlb_all(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	num_physpages += free_reserved_area((void *)start, (void *)end, 0,
+	num_physpages += free_reserved_area((void *)start, (void *)end, -1,
 					    "initrd");
 }
 #endif
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 5e4830a33c08..db28032e320e 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -750,13 +750,8 @@ EXPORT_SYMBOL_GPL(kvm_hypercall);
 
 static __init void kvm_free_tmp(void)
 {
-	unsigned long start, end;
-
-	start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK;
-	end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
-
-	/* Free the tmp space we don't need */
-	free_reserved_area((void *)start, (void *)end, 0, NULL);
+	free_reserved_area(&kvm_tmp[kvm_tmp_index],
+			   &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL);
 }
 
 static int __init kvm_guest_init(void)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 347c5b1bbd62..7f47a05f55af 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -407,7 +407,7 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void __init free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 0878c89fe7d2..bf01d18422ec 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -166,7 +166,7 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(POISON_FREE_INITMEM);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index b892a9b7d7e3..d3af56b7a098 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -499,13 +499,13 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 2aa7a2448d58..8ff0b7ae8ec0 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -244,7 +244,7 @@ void free_initmem(void)
 #ifdef CONFIG_BLK_DEV_INITRD
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
-	free_reserved_area((void *)start, (void *)end, 0, "initrd");
+	free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 220755cc9700..df9b8abcb6a5 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -476,7 +476,7 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
 
 #ifdef CONFIG_BLK_DEV_INITRD
@@ -486,7 +486,7 @@ static int keep_initrd;
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (!keep_initrd)
-		free_reserved_area((void *)start, (void *)end, 0, "initrd");
+		free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 
 static int __init keepinitrd_setup(char *__unused)
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 4d658efc3289..026d29bee30b 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -214,11 +214,11 @@ extern int initrd_is_mapped;
 void free_initrd_mem(unsigned long start, unsigned long end)
 {
 	if (initrd_is_mapped)
-		free_reserved_area((void *)start, (void *)end, 0, "initrd");
+		free_reserved_area((void *)start, (void *)end, -1, "initrd");
 }
 #endif
 
 void free_initmem(void)
 {
-	free_initmem_default(0);
+	free_initmem_default(-1);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index be1b96ce0650..083cc0ba2384 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1308,7 +1308,7 @@ extern void free_initmem(void);
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
  * into the buddy system. The freed pages will be poisoned with pattern
- * "poison" if it's non-zero.
+ * "poison" if it's within range [0, UCHAR_MAX].
  * Return pages freed into the buddy system.
  */
 extern unsigned long free_reserved_area(void *start, void *end,
@@ -1348,8 +1348,9 @@ static inline void mark_page_reserved(struct page *page)
 
 /*
  * Default method to free all the __init memory into the buddy system.
- * The freed pages will be poisoned with pattern "poison" if it is
- * non-zero. Return pages freed into the buddy system.
+ * The freed pages will be poisoned with pattern "poison" if it's within
+ * range [0, UCHAR_MAX].
+ * Return pages freed into the buddy system.
  */
 static inline unsigned long free_initmem_default(int poison)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index be18ccd017bb..6780b2e18aa1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5214,7 +5214,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
 	start = (void *)PAGE_ALIGN((unsigned long)start);
 	end = (void *)((unsigned long)end & PAGE_MASK);
 	for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
-		if (poison)
+		if ((unsigned int)poison <= 0xFF)
 			memset(pos, poison, PAGE_SIZE);
 		free_reserved_page(virt_to_page(pos));
 	}
-- 
cgit v1.3-7-g2ca7


From 7b4b2a0d6c8500350784beb83a6a55e60ea3bea3 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:03:11 -0700
Subject: mm: accurately calculate zone->managed_pages for highmem zones

Commit "mm: introduce new field 'managed_pages' to struct zone" assumes
that all highmem pages will be freed into the buddy system by function
mem_init().  But that's not always true, some architectures may reserve
some highmem pages during boot.  For example PPC may allocate highmem
pages for giagant HugeTLB pages, and several architectures have code to
check PageReserved flag to exclude highmem pages allocated during boot
when freeing highmem pages into the buddy system.

So treat highmem pages in the same way as normal pages, that is to:
1) reset zone->managed_pages to zero in mem_init().
2) recalculate managed_pages when freeing pages into the buddy system.

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: <sworddragon2@aol.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/metag/mm/init.c     |  6 ++++++
 arch/x86/mm/highmem_32.c |  6 ++++++
 include/linux/bootmem.h  |  1 +
 mm/bootmem.c             | 32 ++++++++++++++++++--------------
 mm/nobootmem.c           | 30 ++++++++++++++++--------------
 mm/page_alloc.c          |  1 +
 6 files changed, 48 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c
index 5e2238dd72e0..d7595f58fad5 100644
--- a/arch/metag/mm/init.c
+++ b/arch/metag/mm/init.c
@@ -380,6 +380,12 @@ void __init mem_init(void)
 
 #ifdef CONFIG_HIGHMEM
 	unsigned long tmp;
+
+	/*
+	 * Explicitly reset zone->managed_pages because highmem pages are
+	 * freed before calling free_all_bootmem_node();
+	 */
+	reset_all_zones_managed_pages();
 	for (tmp = highstart_pfn; tmp < highend_pfn; tmp++)
 		free_highmem_page(pfn_to_page(tmp));
 	num_physpages += totalhigh_pages;
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 252b8f5489ba..4500142bc4aa 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -1,6 +1,7 @@
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/swap.h> /* for totalram_pages */
+#include <linux/bootmem.h>
 
 void *kmap(struct page *page)
 {
@@ -121,6 +122,11 @@ void __init set_highmem_pages_init(void)
 	struct zone *zone;
 	int nid;
 
+	/*
+	 * Explicitly reset zone->managed_pages because set_highmem_pages_init()
+	 * is invoked before free_all_bootmem()
+	 */
+	reset_all_zones_managed_pages();
 	for_each_zone(zone) {
 		unsigned long zone_start_pfn, zone_end_pfn;
 
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 5f0b0e1f7c08..0e48c3221d82 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -46,6 +46,7 @@ extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
 
 extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
 extern unsigned long free_all_bootmem(void);
+extern void reset_all_zones_managed_pages(void);
 
 extern void free_bootmem_node(pg_data_t *pgdat,
 			      unsigned long addr,
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..eb792323187b 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,20 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	return count;
 }
 
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
 {
 	struct zone *z;
 
-	/*
-	 * In free_area_init_core(), highmem zone's managed_pages is set to
-	 * present_pages, and bootmem allocator doesn't allocate from highmem
-	 * zones. So there's no need to recalculate managed_pages because all
-	 * highmem pages will be managed by the buddy system. Here highmem
-	 * zone also includes highmem movable zone.
-	 */
+	if (reset_managed_pages_done)
+		return;
+
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
-		if (!is_highmem(z))
-			z->managed_pages = 0;
+		z->managed_pages = 0;
+}
+
+void __init reset_all_zones_managed_pages(void)
+{
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat)
+		reset_node_managed_pages(pgdat);
+	reset_managed_pages_done = 1;
 }
 
 /**
@@ -266,7 +272,7 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
 unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
 	register_page_bootmem_info_node(pgdat);
-	reset_node_lowmem_managed_pages(pgdat);
+	reset_node_managed_pages(pgdat);
 	return free_all_bootmem_core(pgdat->bdata);
 }
 
@@ -279,10 +285,8 @@ unsigned long __init free_all_bootmem(void)
 {
 	unsigned long total_pages = 0;
 	bootmem_data_t *bdata;
-	struct pglist_data *pgdat;
 
-	for_each_online_pgdat(pgdat)
-		reset_node_lowmem_managed_pages(pgdat);
+	reset_all_zones_managed_pages();
 
 	list_for_each_entry(bdata, &bdata_list, list)
 		total_pages += free_all_bootmem_core(bdata);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bdd3fa2fc73b..0ae8d91365af 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void)
 	return count;
 }
 
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
 {
 	struct zone *z;
 
-	/*
-	 * In free_area_init_core(), highmem zone's managed_pages is set to
-	 * present_pages, and bootmem allocator doesn't allocate from highmem
-	 * zones. So there's no need to recalculate managed_pages because all
-	 * highmem pages will be managed by the buddy system. Here highmem
-	 * zone also includes highmem movable zone.
-	 */
+	if (reset_managed_pages_done)
+		return;
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
-		if (!is_highmem(z))
-			z->managed_pages = 0;
+		z->managed_pages = 0;
+}
+
+void __init reset_all_zones_managed_pages(void)
+{
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat)
+		reset_node_managed_pages(pgdat);
+	reset_managed_pages_done = 1;
 }
 
 /**
@@ -160,10 +165,7 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
  */
 unsigned long __init free_all_bootmem(void)
 {
-	struct pglist_data *pgdat;
-
-	for_each_online_pgdat(pgdat)
-		reset_node_lowmem_managed_pages(pgdat);
+	reset_all_zones_managed_pages();
 
 	/*
 	 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f22542f6dc12..22438eba00b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5232,6 +5232,7 @@ void free_highmem_page(struct page *page)
 {
 	__free_reserved_page(page);
 	totalram_pages++;
+	page_zone(page)->managed_pages++;
 	totalhigh_pages++;
 }
 #endif
-- 
cgit v1.3-7-g2ca7


From c3d5f5f0c2bc4eabeaf49f1a21e1aeb965246cd2 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:03:14 -0700
Subject: mm: use a dedicated lock to protect totalram_pages and
 zone->managed_pages

Currently lock_memory_hotplug()/unlock_memory_hotplug() are used to
protect totalram_pages and zone->managed_pages.  Other than the memory
hotplug driver, totalram_pages and zone->managed_pages may also be
modified at runtime by other drivers, such as Xen balloon,
virtio_balloon etc.  For those cases, memory hotplug lock is a little
too heavy, so introduce a dedicated lock to protect totalram_pages and
zone->managed_pages.

Now we have a simplified locking rules totalram_pages and
zone->managed_pages as:

1) no locking for read accesses because they are unsigned long.
2) no locking for write accesses at boot time in single-threaded context.
3) serialize write accesses at runtime by acquiring the dedicated
   managed_page_count_lock.

Also adjust zone->managed_pages when freeing reserved pages into the
buddy system, to keep totalram_pages and zone->managed_pages in
consistence.

[akpm@linux-foundation.org: don't export adjust_managed_page_count to modules (for now)]
Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: <sworddragon2@aol.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Jianguo Wu <wujianguo@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h     |  6 ++----
 include/linux/mmzone.h | 14 ++++++++++----
 mm/page_alloc.c        | 11 +++++++++++
 3 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 083cc0ba2384..4310f80ce956 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1313,6 +1313,7 @@ extern void free_initmem(void);
  */
 extern unsigned long free_reserved_area(void *start, void *end,
 					int poison, char *s);
+
 #ifdef	CONFIG_HIGHMEM
 /*
  * Free a highmem page into the buddy system, adjusting totalhigh_pages
@@ -1321,10 +1322,7 @@ extern unsigned long free_reserved_area(void *start, void *end,
 extern void free_highmem_page(struct page *page);
 #endif
 
-static inline void adjust_managed_page_count(struct page *page, long count)
-{
-	totalram_pages += count;
-}
+extern void adjust_managed_page_count(struct page *page, long count);
 
 /* Free the reserved page into the buddy system, so it gets managed. */
 static inline void __free_reserved_page(struct page *page)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e511f9429f1e..09d381b71fd8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -474,10 +474,16 @@ struct zone {
 	 * frequently read in proximity to zone->lock.  It's good to
 	 * give them a chance of being in the same cacheline.
 	 *
-	 * Write access to present_pages and managed_pages at runtime should
-	 * be protected by lock_memory_hotplug()/unlock_memory_hotplug().
-	 * Any reader who can't tolerant drift of present_pages and
-	 * managed_pages should hold memory hotplug lock to get a stable value.
+	 * Write access to present_pages at runtime should be protected by
+	 * lock_memory_hotplug()/unlock_memory_hotplug().  Any reader who can't
+	 * tolerant drift of present_pages should hold memory hotplug lock to
+	 * get a stable value.
+	 *
+	 * Read access to managed_pages should be safe because it's unsigned
+	 * long. Write access to zone->managed_pages and totalram_pages are
+	 * protected by managed_page_count_lock at runtime. Idealy only
+	 * adjust_managed_page_count() should be used instead of directly
+	 * touching zone->managed_pages and totalram_pages.
 	 */
 	unsigned long		spanned_pages;
 	unsigned long		present_pages;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22438eba00b6..93f292a60cb0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -103,6 +103,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 };
 EXPORT_SYMBOL(node_states);
 
+/* Protect totalram_pages and zone->managed_pages */
+static DEFINE_SPINLOCK(managed_page_count_lock);
+
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 /*
@@ -5206,6 +5209,14 @@ early_param("movablecore", cmdline_parse_movablecore);
 
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 
+void adjust_managed_page_count(struct page *page, long count)
+{
+	spin_lock(&managed_page_count_lock);
+	page_zone(page)->managed_pages += count;
+	totalram_pages += count;
+	spin_unlock(&managed_page_count_lock);
+}
+
 unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
 {
 	void *pos;
-- 
cgit v1.3-7-g2ca7


From 7ee3d4e8cd560500192d80ca84d7f15d6dee0807 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:03:41 -0700
Subject: mm: introduce helper function mem_init_print_info() to simplify
 mem_init()

Introduce helper function mem_init_print_info() to simplify mem_init()
across different architectures, which also unifies the format and
information printed.

Function mem_init_print_info() calculates memory statistics information
without walking each page, so it should be a little faster on some
architectures.

Also introduce another helper get_num_physpages() to kill the global
variable num_physpages.

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 12 ++++++++++++
 mm/page_alloc.c    | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4310f80ce956..09c235301dbb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1323,6 +1323,7 @@ extern void free_highmem_page(struct page *page);
 #endif
 
 extern void adjust_managed_page_count(struct page *page, long count);
+extern void mem_init_print_info(const char *str);
 
 /* Free the reserved page into the buddy system, so it gets managed. */
 static inline void __free_reserved_page(struct page *page)
@@ -1358,6 +1359,17 @@ static inline unsigned long free_initmem_default(int poison)
 				  poison, "unused kernel");
 }
 
+static inline unsigned long get_num_physpages(void)
+{
+	int nid;
+	unsigned long phys_pages = 0;
+
+	for_each_online_node(nid)
+		phys_pages += node_present_pages(nid);
+
+	return phys_pages;
+}
+
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 /*
  * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d9445c4f5fd7..327516b7aee9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,6 +61,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 
+#include <asm/sections.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include "internal.h"
@@ -5246,6 +5247,57 @@ void free_highmem_page(struct page *page)
 }
 #endif
 
+
+void __init mem_init_print_info(const char *str)
+{
+	unsigned long physpages, codesize, datasize, rosize, bss_size;
+	unsigned long init_code_size, init_data_size;
+
+	physpages = get_num_physpages();
+	codesize = _etext - _stext;
+	datasize = _edata - _sdata;
+	rosize = __end_rodata - __start_rodata;
+	bss_size = __bss_stop - __bss_start;
+	init_data_size = __init_end - __init_begin;
+	init_code_size = _einittext - _sinittext;
+
+	/*
+	 * Detect special cases and adjust section sizes accordingly:
+	 * 1) .init.* may be embedded into .data sections
+	 * 2) .init.text.* may be out of [__init_begin, __init_end],
+	 *    please refer to arch/tile/kernel/vmlinux.lds.S.
+	 * 3) .rodata.* may be embedded into .text or .data sections.
+	 */
+#define adj_init_size(start, end, size, pos, adj) \
+	if (start <= pos && pos < end && size > adj) \
+		size -= adj;
+
+	adj_init_size(__init_begin, __init_end, init_data_size,
+		     _sinittext, init_code_size);
+	adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
+	adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
+	adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
+	adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
+
+#undef	adj_init_size
+
+	printk("Memory: %luK/%luK available "
+	       "(%luK kernel code, %luK rwdata, %luK rodata, "
+	       "%luK init, %luK bss, %luK reserved"
+#ifdef	CONFIG_HIGHMEM
+	       ", %luK highmem"
+#endif
+	       "%s%s)\n",
+	       nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
+	       codesize >> 10, datasize >> 10, rosize >> 10,
+	       (init_data_size + init_code_size) >> 10, bss_size >> 10,
+	       (physpages - totalram_pages) << (PAGE_SHIFT-10),
+#ifdef	CONFIG_HIGHMEM
+	       totalhigh_pages << (PAGE_SHIFT-10),
+#endif
+	       str ? ", " : "", str ? str : "");
+}
+
 /**
  * set_dma_reserve - set the specified number of pages reserved in the first zone
  * @new_dma_reserve: The number of pages to mark reserved
-- 
cgit v1.3-7-g2ca7


From 1895418189e08c1d1eec4fbdb5fb41d793f57ba5 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:04:21 -0700
Subject: mm: kill global variable num_physpages

Now all references to num_physpages have been removed, so kill it.

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Jiang Liu <jiang.liu@huawei.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 1 -
 mm/memory.c        | 2 --
 mm/nommu.c         | 2 --
 3 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 09c235301dbb..e550a1773a9e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -29,7 +29,6 @@ struct writeback_control;
 extern unsigned long max_mapnr;
 #endif
 
-extern unsigned long num_physpages;
 extern unsigned long totalram_pages;
 extern void * high_memory;
 extern int page_cluster;
diff --git a/mm/memory.c b/mm/memory.c
index 407533219673..b68812d682b6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr);
 EXPORT_SYMBOL(mem_map);
 #endif
 
-unsigned long num_physpages;
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
  * that high_memory defines the upper bound on direct map memory, then end
@@ -92,7 +91,6 @@ unsigned long num_physpages;
  */
 void * high_memory;
 
-EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 
 /*
diff --git a/mm/nommu.c b/mm/nommu.c
index 1898b2fe9da5..e44e6e0a125c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -56,7 +56,6 @@
 void *high_memory;
 struct page *mem_map;
 unsigned long max_mapnr;
-unsigned long num_physpages;
 unsigned long highest_memmap_pfn;
 struct percpu_counter vm_committed_as;
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void)
 EXPORT_SYMBOL_GPL(vm_memory_committed);
 
 EXPORT_SYMBOL(mem_map);
-EXPORT_SYMBOL(num_physpages);
 
 /* list of mapped, potentially shareable regions */
 static struct kmem_cache *vm_region_jar;
-- 
cgit v1.3-7-g2ca7


From fccc998771043db1613509b26c18b3d7f071e668 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:04:23 -0700
Subject: mm: introduce helper function set_max_mapnr()

Introduce a helper function set_max_mapnr() to set global variable
max_mapnr.

Also unify condition compilation for max_mapnr with
CONFIG_NEED_MULTIPLE_NODES instead of CONFIG_DISCONTIGMEM.

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mauro Carvalho Chehab <mchehab@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e550a1773a9e..b87681adf0ba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -25,8 +25,15 @@ struct file_ra_state;
 struct user_struct;
 struct writeback_control;
 
-#ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
+#ifndef CONFIG_NEED_MULTIPLE_NODES	/* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
+
+static inline void set_max_mapnr(unsigned long limit)
+{
+	max_mapnr = limit;
+}
+#else
+static inline void set_max_mapnr(unsigned long limit) { }
 #endif
 
 extern unsigned long totalram_pages;
-- 
cgit v1.3-7-g2ca7


From e1280be0d8614be94e5bef48b6c830dfa03e82a7 Mon Sep 17 00:00:00 2001
From: Jiang Liu <liuj97@gmail.com>
Date: Wed, 3 Jul 2013 15:04:34 -0700
Subject: mm: kill free_all_bootmem_node()

Now nobody makes use of free_all_bootmem_node(), kill it.

Signed-off-by: Jiang Liu <jiang.liu@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Yinghai Lu <yinghai@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h |  1 -
 mm/bootmem.c            | 18 ------------------
 2 files changed, 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 0e48c3221d82..f1f07d31a3af 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -44,7 +44,6 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat,
 				       unsigned long endpfn);
 extern unsigned long init_bootmem(unsigned long addr, unsigned long memend);
 
-extern unsigned long free_all_bootmem_node(pg_data_t *pgdat);
 extern unsigned long free_all_bootmem(void);
 extern void reset_all_zones_managed_pages(void);
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 58609bbf584e..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -263,24 +263,6 @@ void __init reset_all_zones_managed_pages(void)
 	reset_managed_pages_done = 1;
 }
 
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
-{
-	unsigned long pages;
-
-	register_page_bootmem_info_node(pgdat);
-	reset_node_managed_pages(pgdat);
-	pages = free_all_bootmem_core(pgdat->bdata);
-	totalram_pages += pages;
-
-	return pages;
-}
-
 /**
  * free_all_bootmem - release free pages to the buddy allocator
  *
-- 
cgit v1.3-7-g2ca7


From 55878e88c59221c3187e1c24ec3b15eb79c374c0 Mon Sep 17 00:00:00 2001
From: Cody P Schafer <cody@linux.vnet.ibm.com>
Date: Wed, 3 Jul 2013 15:04:44 -0700
Subject: sparsemem: add BUILD_BUG_ON when sizeof mem_section is non-power-of-2

Instead of leaving a hidden trap for the next person who comes along and
wants to add something to mem_section, add a big fat warning about it
needing to be a power-of-2, and insert a BUILD_BUG_ON() in sparse_init()
to catch mistakes.

Right now non-power-of-2 mem_sections cause a number of WARNs at boot
(which don't clearly point to the size of mem_section as an issue), but
the system limps on (temporarily, at least).

This is based upon Dave Hansen's earlier RFC where he ran into the same
issue:
	"sparsemem: fix boot when SECTIONS_PER_ROOT is not power-of-2"
	http://lkml.indiana.edu/hypermail/linux/kernel/1205.2/03077.html

Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jiang Liu <liuj97@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 4 ++++
 mm/sparse.c            | 3 +++
 2 files changed, 7 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 09d381b71fd8..ae19af5ec02c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1137,6 +1137,10 @@ struct mem_section {
 	struct page_cgroup *page_cgroup;
 	unsigned long pad;
 #endif
+	/*
+	 * WARNING: mem_section must be a power-of-2 in size for the
+	 * calculation and use of SECTION_ROOT_MASK to make sense.
+	 */
 };
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
diff --git a/mm/sparse.c b/mm/sparse.c
index 1c91f0d3f6ab..3194ec414728 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -481,6 +481,9 @@ void __init sparse_init(void)
 	struct page **map_map;
 #endif
 
+	/* see include/linux/mmzone.h 'struct mem_section' definition */
+	BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
+
 	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
 	set_pageblock_order();
 
-- 
cgit v1.3-7-g2ca7


From e7152b97f38f1f5b915c719e6a3040697a700a16 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 3 Jul 2013 15:04:54 -0700
Subject: err.h: IS_ERR() can accept __user pointers

Sparse generates a false positive when you pass a __user or __iomem
pointer to the IS_ERR() functions.

  drivers/rtc/rtc-ds1286.c:344:36: sparse: incorrect type in argument 1 (different address spaces)
  drivers/rtc/rtc-ds1286.c:344:36:    expected void const *ptr
  drivers/rtc/rtc-ds1286.c:344:36:    got unsigned int [noderef] [usertype] <asn:2>*rtcregs

We can silence these by adding a __force here and upgrading to Sparse
v0.4.5-rc1 or later.

This change has no effect when using current Sparse releases.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Christopher Li <sparse@chrisli.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/err.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/err.h b/include/linux/err.h
index f2edce25a76b..221fcfb676c4 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -24,17 +24,17 @@ static inline void * __must_check ERR_PTR(long error)
 	return (void *) error;
 }
 
-static inline long __must_check PTR_ERR(const void *ptr)
+static inline long __must_check PTR_ERR(__force const void *ptr)
 {
 	return (long) ptr;
 }
 
-static inline long __must_check IS_ERR(const void *ptr)
+static inline long __must_check IS_ERR(__force const void *ptr)
 {
 	return IS_ERR_VALUE((unsigned long)ptr);
 }
 
-static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
+static inline long __must_check IS_ERR_OR_NULL(__force const void *ptr)
 {
 	return !ptr || IS_ERR_VALUE((unsigned long)ptr);
 }
@@ -46,13 +46,13 @@ static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
  * Explicitly cast an error-valued pointer to another pointer type in such a
  * way as to make it clear that's what's going on.
  */
-static inline void * __must_check ERR_CAST(const void *ptr)
+static inline void * __must_check ERR_CAST(__force const void *ptr)
 {
 	/* cast away the const */
 	return (void *) ptr;
 }
 
-static inline int __must_check PTR_RET(const void *ptr)
+static inline int __must_check PTR_RET(__force const void *ptr)
 {
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
-- 
cgit v1.3-7-g2ca7


From d8537548c924db3c44afde7646b6e220c7beb79d Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 3 Jul 2013 15:04:57 -0700
Subject: drivers: avoid format strings in names passed to alloc_workqueue()

For the workqueue creation interfaces that do not expect format strings,
make sure they cannot accidently be parsed that way.  Additionally, clean
up calls made with a single parameter that would be handled as a format
string.  Many callers are passing potentially dynamic string content, so
use "%s" in those cases to avoid any potential accidents.

Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 crypto/pcrypt.c                         | 4 ++--
 drivers/media/pci/cx18/cx18-driver.c    | 2 +-
 drivers/message/i2o/driver.c            | 4 ++--
 drivers/net/wireless/rt2x00/rt2x00dev.c | 2 +-
 drivers/net/wireless/rtlwifi/base.c     | 2 +-
 drivers/pci/hotplug/pciehp_hpc.c        | 4 +---
 drivers/pci/hotplug/shpchp_core.c       | 3 +--
 drivers/scsi/be2iscsi/be_main.c         | 2 +-
 drivers/scsi/qla4xxx/ql4_os.c           | 4 ++--
 drivers/scsi/scsi_transport_fc.c        | 6 +++---
 include/linux/workqueue.h               | 7 ++++---
 net/bluetooth/hci_core.c                | 9 ++++-----
 net/mac80211/main.c                     | 2 +-
 13 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c
index b2c99dc1c5e2..f8c920cafe63 100644
--- a/crypto/pcrypt.c
+++ b/crypto/pcrypt.c
@@ -455,8 +455,8 @@ static int pcrypt_init_padata(struct padata_pcrypt *pcrypt,
 
 	get_online_cpus();
 
-	pcrypt->wq = alloc_workqueue(name,
-				     WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE, 1);
+	pcrypt->wq = alloc_workqueue("%s", WQ_MEM_RECLAIM | WQ_CPU_INTENSIVE,
+				     1, name);
 	if (!pcrypt->wq)
 		goto err;
 
diff --git a/drivers/media/pci/cx18/cx18-driver.c b/drivers/media/pci/cx18/cx18-driver.c
index 67b61cf3e03a..004d8ace5019 100644
--- a/drivers/media/pci/cx18/cx18-driver.c
+++ b/drivers/media/pci/cx18/cx18-driver.c
@@ -695,7 +695,7 @@ static int cx18_create_in_workq(struct cx18 *cx)
 {
 	snprintf(cx->in_workq_name, sizeof(cx->in_workq_name), "%s-in",
 		 cx->v4l2_dev.name);
-	cx->in_work_queue = alloc_ordered_workqueue(cx->in_workq_name, 0);
+	cx->in_work_queue = alloc_ordered_workqueue("%s", 0, cx->in_workq_name);
 	if (cx->in_work_queue == NULL) {
 		CX18_ERR("Unable to create incoming mailbox handler thread\n");
 		return -ENOMEM;
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 8a5b2d8f4daf..813eaa33fa14 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -84,8 +84,8 @@ int i2o_driver_register(struct i2o_driver *drv)
 	osm_debug("Register driver %s\n", drv->name);
 
 	if (drv->event) {
-		drv->event_queue = alloc_workqueue(drv->name,
-						   WQ_MEM_RECLAIM, 1);
+		drv->event_queue = alloc_workqueue("%s", WQ_MEM_RECLAIM, 1,
+						   drv->name);
 		if (!drv->event_queue) {
 			osm_err("Could not initialize event queue for driver "
 				"%s\n", drv->name);
diff --git a/drivers/net/wireless/rt2x00/rt2x00dev.c b/drivers/net/wireless/rt2x00/rt2x00dev.c
index 90dc14336980..c8b9ef0c21f8 100644
--- a/drivers/net/wireless/rt2x00/rt2x00dev.c
+++ b/drivers/net/wireless/rt2x00/rt2x00dev.c
@@ -1321,7 +1321,7 @@ int rt2x00lib_probe_dev(struct rt2x00_dev *rt2x00dev)
 	 * Initialize work.
 	 */
 	rt2x00dev->workqueue =
-	    alloc_ordered_workqueue(wiphy_name(rt2x00dev->hw->wiphy), 0);
+	    alloc_ordered_workqueue("%s", 0, wiphy_name(rt2x00dev->hw->wiphy));
 	if (!rt2x00dev->workqueue) {
 		retval = -ENOMEM;
 		goto exit;
diff --git a/drivers/net/wireless/rtlwifi/base.c b/drivers/net/wireless/rtlwifi/base.c
index af59dd5718e1..a5f223145b0f 100644
--- a/drivers/net/wireless/rtlwifi/base.c
+++ b/drivers/net/wireless/rtlwifi/base.c
@@ -380,7 +380,7 @@ static void _rtl_init_deferred_work(struct ieee80211_hw *hw)
 
 	/* <2> work queue */
 	rtlpriv->works.hw = hw;
-	rtlpriv->works.rtl_wq = alloc_workqueue(rtlpriv->cfg->name, 0, 0);
+	rtlpriv->works.rtl_wq = alloc_workqueue("%s", 0, 0, rtlpriv->cfg->name);
 	INIT_DELAYED_WORK(&rtlpriv->works.watchdog_wq,
 			  (void *)rtl_watchdog_wq_callback);
 	INIT_DELAYED_WORK(&rtlpriv->works.ips_nic_off_wq,
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 5127f3f41821..b2255736ac81 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -773,14 +773,12 @@ static void pcie_shutdown_notification(struct controller *ctrl)
 static int pcie_init_slot(struct controller *ctrl)
 {
 	struct slot *slot;
-	char name[32];
 
 	slot = kzalloc(sizeof(*slot), GFP_KERNEL);
 	if (!slot)
 		return -ENOMEM;
 
-	snprintf(name, sizeof(name), "pciehp-%u", PSN(ctrl));
-	slot->wq = alloc_workqueue(name, 0, 0);
+	slot->wq = alloc_workqueue("pciehp-%u", 0, 0, PSN(ctrl));
 	if (!slot->wq)
 		goto abort;
 
diff --git a/drivers/pci/hotplug/shpchp_core.c b/drivers/pci/hotplug/shpchp_core.c
index 3100c52c837c..d3f757df691c 100644
--- a/drivers/pci/hotplug/shpchp_core.c
+++ b/drivers/pci/hotplug/shpchp_core.c
@@ -128,8 +128,7 @@ static int init_slots(struct controller *ctrl)
 		slot->hpc_ops = ctrl->hpc_ops;
 		slot->number = ctrl->first_slot + (ctrl->slot_num_inc * i);
 
-		snprintf(name, sizeof(name), "shpchp-%d", slot->number);
-		slot->wq = alloc_workqueue(name, 0, 0);
+		slot->wq = alloc_workqueue("shpchp-%d", 0, 0, slot->number);
 		if (!slot->wq) {
 			retval = -ENOMEM;
 			goto error_info;
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index d24a2867bc21..a1f5ac7a9806 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -4996,7 +4996,7 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev,
 
 	snprintf(phba->wq_name, sizeof(phba->wq_name), "beiscsi_%02x_wq",
 		 phba->shost->host_no);
-	phba->wq = alloc_workqueue(phba->wq_name, WQ_MEM_RECLAIM, 1);
+	phba->wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, phba->wq_name);
 	if (!phba->wq) {
 		beiscsi_log(phba, KERN_ERR, BEISCSI_LOG_INIT,
 			    "BM_%d : beiscsi_dev_probe-"
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index 4d231c12463e..b246b3c26912 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -7060,8 +7060,8 @@ skip_retry_init:
 	}
 	INIT_WORK(&ha->dpc_work, qla4xxx_do_dpc);
 
-	sprintf(buf, "qla4xxx_%lu_task", ha->host_no);
-	ha->task_wq = alloc_workqueue(buf, WQ_MEM_RECLAIM, 1);
+	ha->task_wq = alloc_workqueue("qla4xxx_%lu_task", WQ_MEM_RECLAIM, 1,
+				      ha->host_no);
 	if (!ha->task_wq) {
 		ql4_printk(KERN_WARNING, ha, "Unable to start task thread!\n");
 		ret = -ENODEV;
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index e106c276aa00..4628fd5e0688 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -435,7 +435,7 @@ static int fc_host_setup(struct transport_container *tc, struct device *dev,
 
 	snprintf(fc_host->work_q_name, sizeof(fc_host->work_q_name),
 		 "fc_wq_%d", shost->host_no);
-	fc_host->work_q = alloc_workqueue(fc_host->work_q_name, 0, 0);
+	fc_host->work_q = alloc_workqueue("%s", 0, 0, fc_host->work_q_name);
 	if (!fc_host->work_q)
 		return -ENOMEM;
 
@@ -443,8 +443,8 @@ static int fc_host_setup(struct transport_container *tc, struct device *dev,
 	snprintf(fc_host->devloss_work_q_name,
 		 sizeof(fc_host->devloss_work_q_name),
 		 "fc_dl_%d", shost->host_no);
-	fc_host->devloss_work_q =
-			alloc_workqueue(fc_host->devloss_work_q_name, 0, 0);
+	fc_host->devloss_work_q = alloc_workqueue("%s", 0, 0,
+					fc_host->devloss_work_q_name);
 	if (!fc_host->devloss_work_q) {
 		destroy_workqueue(fc_host->work_q);
 		fc_host->work_q = NULL;
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index a9f4119c7e2e..a0ed78ab54d7 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -445,11 +445,12 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active,
 	alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args)
 
 #define create_workqueue(name)						\
-	alloc_workqueue((name), WQ_MEM_RECLAIM, 1)
+	alloc_workqueue("%s", WQ_MEM_RECLAIM, 1, (name))
 #define create_freezable_workqueue(name)				\
-	alloc_workqueue((name), WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, 1)
+	alloc_workqueue("%s", WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, \
+			1, (name))
 #define create_singlethread_workqueue(name)				\
-	alloc_workqueue((name), WQ_UNBOUND | WQ_MEM_RECLAIM, 1)
+	alloc_workqueue("%s", WQ_UNBOUND | WQ_MEM_RECLAIM, 1, (name))
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ace5e55fe5a3..db7de80b88a2 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2211,16 +2211,15 @@ int hci_register_dev(struct hci_dev *hdev)
 	list_add(&hdev->list, &hci_dev_list);
 	write_unlock(&hci_dev_list_lock);
 
-	hdev->workqueue = alloc_workqueue(hdev->name, WQ_HIGHPRI | WQ_UNBOUND |
-					  WQ_MEM_RECLAIM, 1);
+	hdev->workqueue = alloc_workqueue("%s", WQ_HIGHPRI | WQ_UNBOUND |
+					  WQ_MEM_RECLAIM, 1, hdev->name);
 	if (!hdev->workqueue) {
 		error = -ENOMEM;
 		goto err;
 	}
 
-	hdev->req_workqueue = alloc_workqueue(hdev->name,
-					      WQ_HIGHPRI | WQ_UNBOUND |
-					      WQ_MEM_RECLAIM, 1);
+	hdev->req_workqueue = alloc_workqueue("%s", WQ_HIGHPRI | WQ_UNBOUND |
+					      WQ_MEM_RECLAIM, 1, hdev->name);
 	if (!hdev->req_workqueue) {
 		destroy_workqueue(hdev->workqueue);
 		error = -ENOMEM;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 8a7bfc47d577..8eae74ac4e1e 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -921,7 +921,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
 		hw->queues = IEEE80211_MAX_QUEUES;
 
 	local->workqueue =
-		alloc_ordered_workqueue(wiphy_name(local->hw.wiphy), 0);
+		alloc_ordered_workqueue("%s", 0, wiphy_name(local->hw.wiphy));
 	if (!local->workqueue) {
 		result = -ENOMEM;
 		goto fail_workqueue;
-- 
cgit v1.3-7-g2ca7


From 5017b2851373ee15c7035151853bb1448800cae2 Mon Sep 17 00:00:00 2001
From: Jani Nikula <jani.nikula@intel.com>
Date: Wed, 3 Jul 2013 15:05:02 -0700
Subject: dmi: add support for exact DMI matches in addition to substring
 matching

dmi_match() considers a substring match to be a successful match.  This is
not always sufficient to distinguish between DMI data for different
systems.  Add support for exact string matching using strcmp() in addition
to the substring matching using strstr().

The specific use case in the i915 driver is to allow us to use an exact
match for D510MO, without also incorrectly matching D510MOV:

  {
	.ident = "Intel D510MO",
	.matches = {
		DMI_MATCH(DMI_BOARD_VENDOR, "Intel"),
		DMI_EXACT_MATCH(DMI_BOARD_NAME, "D510MO"),
	},
  }

Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Cc: <annndddrr@gmail.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Cornel Panceac <cpanceac@gmail.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/firmware/dmi_scan.c     | 12 +++++++++---
 include/linux/mod_devicetable.h |  6 ++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index b95159b33c39..eb760a218da4 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -551,9 +551,15 @@ static bool dmi_matches(const struct dmi_system_id *dmi)
 		int s = dmi->matches[i].slot;
 		if (s == DMI_NONE)
 			break;
-		if (dmi_ident[s]
-		    && strstr(dmi_ident[s], dmi->matches[i].substr))
-			continue;
+		if (dmi_ident[s]) {
+			if (!dmi->matches[i].exact_match &&
+			    strstr(dmi_ident[s], dmi->matches[i].substr))
+				continue;
+			else if (dmi->matches[i].exact_match &&
+				 !strcmp(dmi_ident[s], dmi->matches[i].substr))
+				continue;
+		}
+
 		/* No match */
 		return false;
 	}
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b508016fb76d..b3bd7e737e8b 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -456,7 +456,8 @@ enum dmi_field {
 };
 
 struct dmi_strmatch {
-	unsigned char slot;
+	unsigned char slot:7;
+	unsigned char exact_match:1;
 	char substr[79];
 };
 
@@ -474,7 +475,8 @@ struct dmi_system_id {
  */
 #define dmi_device_id dmi_system_id
 
-#define DMI_MATCH(a, b)	{ a, b }
+#define DMI_MATCH(a, b)	{ .slot = a, .substr = b }
+#define DMI_EXACT_MATCH(a, b)	{ .slot = a, .substr = b, .exact_match = 1 }
 
 #define PLATFORM_NAME_SIZE	20
 #define PLATFORM_MODULE_PREFIX	"platform:"
-- 
cgit v1.3-7-g2ca7


From 48a9db462d99494583dad829969616ac90a8df4e Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Date: Wed, 3 Jul 2013 15:05:06 -0700
Subject: drivers/dma: remove unused support for MEMSET operations

There have never been any real users of MEMSET operations since they
have been introduced in January 2007 by commit 7405f74badf4 ("dmaengine:
refactor dmaengine around dma_async_tx_descriptor").  Therefore remove
support for them for now, it can be always brought back when needed.

[sebastian.hesselbarth@gmail.com: fix drivers/dma/mv_xor]
Signed-off-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com>
Cc: Vinod Koul <vinod.koul@intel.com>
Acked-by: Dan Williams <djbw@fb.com>
Cc: Tomasz Figa <t.figa@samsung.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Olof Johansson <olof@lixom.net>
Cc: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/crypto/async-tx-api.txt |   1 -
 arch/arm/mach-iop13xx/setup.c         |   3 -
 arch/arm/plat-iop/adma.c              |   2 -
 arch/arm/plat-orion/common.c          |  10 ---
 crypto/async_tx/Kconfig               |   4 --
 crypto/async_tx/Makefile              |   1 -
 crypto/async_tx/async_memset.c        |  89 --------------------------
 drivers/dma/dmaengine.c               |   7 ---
 drivers/dma/ioat/dma.c                |   3 +-
 drivers/dma/ioat/dma_v2.h             |   1 -
 drivers/dma/ioat/dma_v3.c             | 114 +---------------------------------
 drivers/dma/ioat/hw.h                 |  27 --------
 drivers/dma/iop-adma.c                |  66 +-------------------
 drivers/dma/mv_xor.c                  |  85 ++-----------------------
 drivers/dma/mv_xor.h                  |   1 -
 drivers/dma/ppc4xx/adma.c             |  47 --------------
 include/linux/async_tx.h              |   4 --
 include/linux/dmaengine.h             |   5 --
 18 files changed, 8 insertions(+), 462 deletions(-)
 delete mode 100644 crypto/async_tx/async_memset.c

(limited to 'include/linux')

diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt
index ba046b8fa92f..7bf1be20d93a 100644
--- a/Documentation/crypto/async-tx-api.txt
+++ b/Documentation/crypto/async-tx-api.txt
@@ -222,5 +222,4 @@ drivers/dma/: location for offload engine drivers
 include/linux/async_tx.h: core header file for the async_tx api
 crypto/async_tx/async_tx.c: async_tx interface to dmaengine and common code
 crypto/async_tx/async_memcpy.c: copy offload
-crypto/async_tx/async_memset.c: memory fill offload
 crypto/async_tx/async_xor.c: xor and xor zero sum offload
diff --git a/arch/arm/mach-iop13xx/setup.c b/arch/arm/mach-iop13xx/setup.c
index 3181f61ea63e..1c5bd7637b05 100644
--- a/arch/arm/mach-iop13xx/setup.c
+++ b/arch/arm/mach-iop13xx/setup.c
@@ -469,7 +469,6 @@ void __init iop13xx_platform_init(void)
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			break;
 		case IOP13XX_INIT_ADMA_1:
@@ -479,7 +478,6 @@ void __init iop13xx_platform_init(void)
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			break;
 		case IOP13XX_INIT_ADMA_2:
@@ -489,7 +487,6 @@ void __init iop13xx_platform_init(void)
 			dma_cap_set(DMA_MEMCPY, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR, plat_data->cap_mask);
 			dma_cap_set(DMA_XOR_VAL, plat_data->cap_mask);
-			dma_cap_set(DMA_MEMSET, plat_data->cap_mask);
 			dma_cap_set(DMA_INTERRUPT, plat_data->cap_mask);
 			dma_cap_set(DMA_PQ, plat_data->cap_mask);
 			dma_cap_set(DMA_PQ_VAL, plat_data->cap_mask);
diff --git a/arch/arm/plat-iop/adma.c b/arch/arm/plat-iop/adma.c
index 1ff6a37e893c..a4d1f8de3b5b 100644
--- a/arch/arm/plat-iop/adma.c
+++ b/arch/arm/plat-iop/adma.c
@@ -192,12 +192,10 @@ static int __init iop3xx_adma_cap_init(void)
 
 	#ifdef CONFIG_ARCH_IOP32X /* the 32x AAU does not perform zero sum */
 	dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask);
-	dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
 	#else
 	dma_cap_set(DMA_XOR, iop3xx_aau_data.cap_mask);
 	dma_cap_set(DMA_XOR_VAL, iop3xx_aau_data.cap_mask);
-	dma_cap_set(DMA_MEMSET, iop3xx_aau_data.cap_mask);
 	dma_cap_set(DMA_INTERRUPT, iop3xx_aau_data.cap_mask);
 	#endif
 
diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c
index c019b7aaf776..c66d163d7a2a 100644
--- a/arch/arm/plat-orion/common.c
+++ b/arch/arm/plat-orion/common.c
@@ -666,14 +666,9 @@ void __init orion_xor0_init(unsigned long mapbase_low,
 	orion_xor0_shared_resources[3].start = irq_1;
 	orion_xor0_shared_resources[3].end = irq_1;
 
-	/*
-	 * two engines can't do memset simultaneously, this limitation
-	 * satisfied by removing memset support from one of the engines.
-	 */
 	dma_cap_set(DMA_MEMCPY, orion_xor0_channels_data[0].cap_mask);
 	dma_cap_set(DMA_XOR, orion_xor0_channels_data[0].cap_mask);
 
-	dma_cap_set(DMA_MEMSET, orion_xor0_channels_data[1].cap_mask);
 	dma_cap_set(DMA_MEMCPY, orion_xor0_channels_data[1].cap_mask);
 	dma_cap_set(DMA_XOR, orion_xor0_channels_data[1].cap_mask);
 
@@ -732,14 +727,9 @@ void __init orion_xor1_init(unsigned long mapbase_low,
 	orion_xor1_shared_resources[3].start = irq_1;
 	orion_xor1_shared_resources[3].end = irq_1;
 
-	/*
-	 * two engines can't do memset simultaneously, this limitation
-	 * satisfied by removing memset support from one of the engines.
-	 */
 	dma_cap_set(DMA_MEMCPY, orion_xor1_channels_data[0].cap_mask);
 	dma_cap_set(DMA_XOR, orion_xor1_channels_data[0].cap_mask);
 
-	dma_cap_set(DMA_MEMSET, orion_xor1_channels_data[1].cap_mask);
 	dma_cap_set(DMA_MEMCPY, orion_xor1_channels_data[1].cap_mask);
 	dma_cap_set(DMA_XOR, orion_xor1_channels_data[1].cap_mask);
 
diff --git a/crypto/async_tx/Kconfig b/crypto/async_tx/Kconfig
index 1b11abbb5c91..f38a58aef3ec 100644
--- a/crypto/async_tx/Kconfig
+++ b/crypto/async_tx/Kconfig
@@ -10,10 +10,6 @@ config ASYNC_XOR
 	select ASYNC_CORE
 	select XOR_BLOCKS
 
-config ASYNC_MEMSET
-	tristate
-	select ASYNC_CORE
-
 config ASYNC_PQ
 	tristate
 	select ASYNC_CORE
diff --git a/crypto/async_tx/Makefile b/crypto/async_tx/Makefile
index d1e0e6f72bc1..462e4abbfe69 100644
--- a/crypto/async_tx/Makefile
+++ b/crypto/async_tx/Makefile
@@ -1,6 +1,5 @@
 obj-$(CONFIG_ASYNC_CORE) += async_tx.o
 obj-$(CONFIG_ASYNC_MEMCPY) += async_memcpy.o
-obj-$(CONFIG_ASYNC_MEMSET) += async_memset.o
 obj-$(CONFIG_ASYNC_XOR) += async_xor.o
 obj-$(CONFIG_ASYNC_PQ) += async_pq.o
 obj-$(CONFIG_ASYNC_RAID6_RECOV) += async_raid6_recov.o
diff --git a/crypto/async_tx/async_memset.c b/crypto/async_tx/async_memset.c
deleted file mode 100644
index 05a4d1e00148..000000000000
--- a/crypto/async_tx/async_memset.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * memory fill offload engine support
- *
- * Copyright © 2006, Intel Corporation.
- *
- *      Dan Williams <dan.j.williams@intel.com>
- *
- *      with architecture considerations by:
- *      Neil Brown <neilb@suse.de>
- *      Jeff Garzik <jeff@garzik.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
- *
- */
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/dma-mapping.h>
-#include <linux/async_tx.h>
-
-/**
- * async_memset - attempt to fill memory with a dma engine.
- * @dest: destination page
- * @val: fill value
- * @offset: offset in pages to start transaction
- * @len: length in bytes
- *
- * honored flags: ASYNC_TX_ACK
- */
-struct dma_async_tx_descriptor *
-async_memset(struct page *dest, int val, unsigned int offset, size_t len,
-	     struct async_submit_ctl *submit)
-{
-	struct dma_chan *chan = async_tx_find_channel(submit, DMA_MEMSET,
-						      &dest, 1, NULL, 0, len);
-	struct dma_device *device = chan ? chan->device : NULL;
-	struct dma_async_tx_descriptor *tx = NULL;
-
-	if (device && is_dma_fill_aligned(device, offset, 0, len)) {
-		dma_addr_t dma_dest;
-		unsigned long dma_prep_flags = 0;
-
-		if (submit->cb_fn)
-			dma_prep_flags |= DMA_PREP_INTERRUPT;
-		if (submit->flags & ASYNC_TX_FENCE)
-			dma_prep_flags |= DMA_PREP_FENCE;
-		dma_dest = dma_map_page(device->dev, dest, offset, len,
-					DMA_FROM_DEVICE);
-
-		tx = device->device_prep_dma_memset(chan, dma_dest, val, len,
-						    dma_prep_flags);
-	}
-
-	if (tx) {
-		pr_debug("%s: (async) len: %zu\n", __func__, len);
-		async_tx_submit(chan, tx, submit);
-	} else { /* run the memset synchronously */
-		void *dest_buf;
-		pr_debug("%s: (sync) len: %zu\n", __func__, len);
-
-		dest_buf = page_address(dest) + offset;
-
-		/* wait for any prerequisite operations */
-		async_tx_quiesce(&submit->depend_tx);
-
-		memset(dest_buf, val, len);
-
-		async_tx_sync_epilog(submit);
-	}
-
-	return tx;
-}
-EXPORT_SYMBOL_GPL(async_memset);
-
-MODULE_AUTHOR("Intel Corporation");
-MODULE_DESCRIPTION("asynchronous memset api");
-MODULE_LICENSE("GPL");
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 93f7992bee5c..9e56745f87bf 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -663,11 +663,6 @@ static bool device_has_all_tx_types(struct dma_device *device)
 		return false;
 	#endif
 
-	#if defined(CONFIG_ASYNC_MEMSET) || defined(CONFIG_ASYNC_MEMSET_MODULE)
-	if (!dma_has_cap(DMA_MEMSET, device->cap_mask))
-		return false;
-	#endif
-
 	#if defined(CONFIG_ASYNC_XOR) || defined(CONFIG_ASYNC_XOR_MODULE)
 	if (!dma_has_cap(DMA_XOR, device->cap_mask))
 		return false;
@@ -729,8 +724,6 @@ int dma_async_device_register(struct dma_device *device)
 		!device->device_prep_dma_pq);
 	BUG_ON(dma_has_cap(DMA_PQ_VAL, device->cap_mask) &&
 		!device->device_prep_dma_pq_val);
-	BUG_ON(dma_has_cap(DMA_MEMSET, device->cap_mask) &&
-		!device->device_prep_dma_memset);
 	BUG_ON(dma_has_cap(DMA_INTERRUPT, device->cap_mask) &&
 		!device->device_prep_dma_interrupt);
 	BUG_ON(dma_has_cap(DMA_SG, device->cap_mask) &&
diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c
index 17a2393b3e25..5ff6fc1819dc 100644
--- a/drivers/dma/ioat/dma.c
+++ b/drivers/dma/ioat/dma.c
@@ -1105,12 +1105,11 @@ static ssize_t cap_show(struct dma_chan *c, char *page)
 {
 	struct dma_device *dma = c->device;
 
-	return sprintf(page, "copy%s%s%s%s%s%s\n",
+	return sprintf(page, "copy%s%s%s%s%s\n",
 		       dma_has_cap(DMA_PQ, dma->cap_mask) ? " pq" : "",
 		       dma_has_cap(DMA_PQ_VAL, dma->cap_mask) ? " pq_val" : "",
 		       dma_has_cap(DMA_XOR, dma->cap_mask) ? " xor" : "",
 		       dma_has_cap(DMA_XOR_VAL, dma->cap_mask) ? " xor_val" : "",
-		       dma_has_cap(DMA_MEMSET, dma->cap_mask)  ? " fill" : "",
 		       dma_has_cap(DMA_INTERRUPT, dma->cap_mask) ? " intr" : "");
 
 }
diff --git a/drivers/dma/ioat/dma_v2.h b/drivers/dma/ioat/dma_v2.h
index 29bf9448035d..212d584fe427 100644
--- a/drivers/dma/ioat/dma_v2.h
+++ b/drivers/dma/ioat/dma_v2.h
@@ -123,7 +123,6 @@ static inline u16 ioat2_xferlen_to_descs(struct ioat2_dma_chan *ioat, size_t len
 struct ioat_ring_ent {
 	union {
 		struct ioat_dma_descriptor *hw;
-		struct ioat_fill_descriptor *fill;
 		struct ioat_xor_descriptor *xor;
 		struct ioat_xor_ext_descriptor *xor_ex;
 		struct ioat_pq_descriptor *pq;
diff --git a/drivers/dma/ioat/dma_v3.c b/drivers/dma/ioat/dma_v3.c
index ca6ea9b3551b..b642e035579b 100644
--- a/drivers/dma/ioat/dma_v3.c
+++ b/drivers/dma/ioat/dma_v3.c
@@ -311,14 +311,6 @@ static void ioat3_dma_unmap(struct ioat2_dma_chan *ioat,
 		if (!desc->hw->ctl_f.null) /* skip 'interrupt' ops */
 			ioat_dma_unmap(chan, flags, len, desc->hw);
 		break;
-	case IOAT_OP_FILL: {
-		struct ioat_fill_descriptor *hw = desc->fill;
-
-		if (!(flags & DMA_COMPL_SKIP_DEST_UNMAP))
-			ioat_unmap(pdev, hw->dst_addr - offset, len,
-				   PCI_DMA_FROMDEVICE, flags, 1);
-		break;
-	}
 	case IOAT_OP_XOR_VAL:
 	case IOAT_OP_XOR: {
 		struct ioat_xor_descriptor *xor = desc->xor;
@@ -823,51 +815,6 @@ ioat3_tx_status(struct dma_chan *c, dma_cookie_t cookie,
 	return dma_cookie_status(c, cookie, txstate);
 }
 
-static struct dma_async_tx_descriptor *
-ioat3_prep_memset_lock(struct dma_chan *c, dma_addr_t dest, int value,
-		       size_t len, unsigned long flags)
-{
-	struct ioat2_dma_chan *ioat = to_ioat2_chan(c);
-	struct ioat_ring_ent *desc;
-	size_t total_len = len;
-	struct ioat_fill_descriptor *fill;
-	u64 src_data = (0x0101010101010101ULL) * (value & 0xff);
-	int num_descs, idx, i;
-
-	num_descs = ioat2_xferlen_to_descs(ioat, len);
-	if (likely(num_descs) && ioat2_check_space_lock(ioat, num_descs) == 0)
-		idx = ioat->head;
-	else
-		return NULL;
-	i = 0;
-	do {
-		size_t xfer_size = min_t(size_t, len, 1 << ioat->xfercap_log);
-
-		desc = ioat2_get_ring_ent(ioat, idx + i);
-		fill = desc->fill;
-
-		fill->size = xfer_size;
-		fill->src_data = src_data;
-		fill->dst_addr = dest;
-		fill->ctl = 0;
-		fill->ctl_f.op = IOAT_OP_FILL;
-
-		len -= xfer_size;
-		dest += xfer_size;
-		dump_desc_dbg(ioat, desc);
-	} while (++i < num_descs);
-
-	desc->txd.flags = flags;
-	desc->len = total_len;
-	fill->ctl_f.int_en = !!(flags & DMA_PREP_INTERRUPT);
-	fill->ctl_f.fence = !!(flags & DMA_PREP_FENCE);
-	fill->ctl_f.compl_write = 1;
-	dump_desc_dbg(ioat, desc);
-
-	/* we leave the channel locked to ensure in order submission */
-	return &desc->txd;
-}
-
 static struct dma_async_tx_descriptor *
 __ioat3_prep_xor_lock(struct dma_chan *c, enum sum_check_flags *result,
 		      dma_addr_t dest, dma_addr_t *src, unsigned int src_cnt,
@@ -1431,7 +1378,7 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
 	struct page *xor_srcs[IOAT_NUM_SRC_TEST];
 	struct page *xor_val_srcs[IOAT_NUM_SRC_TEST + 1];
 	dma_addr_t dma_srcs[IOAT_NUM_SRC_TEST + 1];
-	dma_addr_t dma_addr, dest_dma;
+	dma_addr_t dest_dma;
 	struct dma_async_tx_descriptor *tx;
 	struct dma_chan *dma_chan;
 	dma_cookie_t cookie;
@@ -1598,56 +1545,6 @@ static int ioat_xor_val_self_test(struct ioatdma_device *device)
 		goto free_resources;
 	}
 
-	/* skip memset if the capability is not present */
-	if (!dma_has_cap(DMA_MEMSET, dma_chan->device->cap_mask))
-		goto free_resources;
-
-	/* test memset */
-	op = IOAT_OP_FILL;
-
-	dma_addr = dma_map_page(dev, dest, 0,
-			PAGE_SIZE, DMA_FROM_DEVICE);
-	tx = dma->device_prep_dma_memset(dma_chan, dma_addr, 0, PAGE_SIZE,
-					 DMA_PREP_INTERRUPT |
-					 DMA_COMPL_SKIP_SRC_UNMAP |
-					 DMA_COMPL_SKIP_DEST_UNMAP);
-	if (!tx) {
-		dev_err(dev, "Self-test memset prep failed\n");
-		err = -ENODEV;
-		goto dma_unmap;
-	}
-
-	async_tx_ack(tx);
-	init_completion(&cmp);
-	tx->callback = ioat3_dma_test_callback;
-	tx->callback_param = &cmp;
-	cookie = tx->tx_submit(tx);
-	if (cookie < 0) {
-		dev_err(dev, "Self-test memset setup failed\n");
-		err = -ENODEV;
-		goto dma_unmap;
-	}
-	dma->device_issue_pending(dma_chan);
-
-	tmo = wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000));
-
-	if (dma->device_tx_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
-		dev_err(dev, "Self-test memset timed out\n");
-		err = -ENODEV;
-		goto dma_unmap;
-	}
-
-	dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_FROM_DEVICE);
-
-	for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) {
-		u32 *ptr = page_address(dest);
-		if (ptr[i]) {
-			dev_err(dev, "Self-test memset failed compare\n");
-			err = -ENODEV;
-			goto free_resources;
-		}
-	}
-
 	/* test for non-zero parity sum */
 	op = IOAT_OP_XOR_VAL;
 
@@ -1706,8 +1603,7 @@ dma_unmap:
 		for (i = 0; i < IOAT_NUM_SRC_TEST + 1; i++)
 			dma_unmap_page(dev, dma_srcs[i], PAGE_SIZE,
 				       DMA_TO_DEVICE);
-	} else if (op == IOAT_OP_FILL)
-		dma_unmap_page(dev, dma_addr, PAGE_SIZE, DMA_FROM_DEVICE);
+	}
 free_resources:
 	dma->device_free_chan_resources(dma_chan);
 out:
@@ -1944,12 +1840,6 @@ int ioat3_dma_probe(struct ioatdma_device *device, int dca)
 		}
 	}
 
-	if (is_raid_device && (device->cap & IOAT_CAP_FILL_BLOCK)) {
-		dma_cap_set(DMA_MEMSET, dma->cap_mask);
-		dma->device_prep_dma_memset = ioat3_prep_memset_lock;
-	}
-
-
 	dma->device_tx_status = ioat3_tx_status;
 	device->cleanup_fn = ioat3_cleanup_event;
 	device->timer_fn = ioat3_timer_event;
diff --git a/drivers/dma/ioat/hw.h b/drivers/dma/ioat/hw.h
index 5ee57d402a6e..62f83e983d8d 100644
--- a/drivers/dma/ioat/hw.h
+++ b/drivers/dma/ioat/hw.h
@@ -100,33 +100,6 @@ struct ioat_dma_descriptor {
 	uint64_t	user2;
 };
 
-struct ioat_fill_descriptor {
-	uint32_t	size;
-	union {
-		uint32_t ctl;
-		struct {
-			unsigned int int_en:1;
-			unsigned int rsvd:1;
-			unsigned int dest_snoop_dis:1;
-			unsigned int compl_write:1;
-			unsigned int fence:1;
-			unsigned int rsvd2:2;
-			unsigned int dest_brk:1;
-			unsigned int bundle:1;
-			unsigned int rsvd4:15;
-			#define IOAT_OP_FILL 0x01
-			unsigned int op:8;
-		} ctl_f;
-	};
-	uint64_t	src_data;
-	uint64_t	dst_addr;
-	uint64_t	next;
-	uint64_t	rsv1;
-	uint64_t	next_dst_addr;
-	uint64_t	user1;
-	uint64_t	user2;
-};
-
 struct ioat_xor_descriptor {
 	uint32_t	size;
 	union {
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
index 7dafb9f3785f..c9cc08c2dbba 100644
--- a/drivers/dma/iop-adma.c
+++ b/drivers/dma/iop-adma.c
@@ -632,39 +632,6 @@ iop_adma_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dma_dest,
 	return sw_desc ? &sw_desc->async_tx : NULL;
 }
 
-static struct dma_async_tx_descriptor *
-iop_adma_prep_dma_memset(struct dma_chan *chan, dma_addr_t dma_dest,
-			 int value, size_t len, unsigned long flags)
-{
-	struct iop_adma_chan *iop_chan = to_iop_adma_chan(chan);
-	struct iop_adma_desc_slot *sw_desc, *grp_start;
-	int slot_cnt, slots_per_op;
-
-	if (unlikely(!len))
-		return NULL;
-	BUG_ON(len > IOP_ADMA_MAX_BYTE_COUNT);
-
-	dev_dbg(iop_chan->device->common.dev, "%s len: %u\n",
-		__func__, len);
-
-	spin_lock_bh(&iop_chan->lock);
-	slot_cnt = iop_chan_memset_slot_count(len, &slots_per_op);
-	sw_desc = iop_adma_alloc_slots(iop_chan, slot_cnt, slots_per_op);
-	if (sw_desc) {
-		grp_start = sw_desc->group_head;
-		iop_desc_init_memset(grp_start, flags);
-		iop_desc_set_byte_count(grp_start, iop_chan, len);
-		iop_desc_set_block_fill_val(grp_start, value);
-		iop_desc_set_dest_addr(grp_start, iop_chan, dma_dest);
-		sw_desc->unmap_src_cnt = 1;
-		sw_desc->unmap_len = len;
-		sw_desc->async_tx.flags = flags;
-	}
-	spin_unlock_bh(&iop_chan->lock);
-
-	return sw_desc ? &sw_desc->async_tx : NULL;
-}
-
 static struct dma_async_tx_descriptor *
 iop_adma_prep_dma_xor(struct dma_chan *chan, dma_addr_t dma_dest,
 		      dma_addr_t *dma_src, unsigned int src_cnt, size_t len,
@@ -1176,33 +1143,6 @@ iop_adma_xor_val_self_test(struct iop_adma_device *device)
 		goto free_resources;
 	}
 
-	/* test memset */
-	dma_addr = dma_map_page(dma_chan->device->dev, dest, 0,
-			PAGE_SIZE, DMA_FROM_DEVICE);
-	tx = iop_adma_prep_dma_memset(dma_chan, dma_addr, 0, PAGE_SIZE,
-				      DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
-
-	cookie = iop_adma_tx_submit(tx);
-	iop_adma_issue_pending(dma_chan);
-	msleep(8);
-
-	if (iop_adma_status(dma_chan, cookie, NULL) != DMA_SUCCESS) {
-		dev_err(dma_chan->device->dev,
-			"Self-test memset timed out, disabling\n");
-		err = -ENODEV;
-		goto free_resources;
-	}
-
-	for (i = 0; i < PAGE_SIZE/sizeof(u32); i++) {
-		u32 *ptr = page_address(dest);
-		if (ptr[i]) {
-			dev_err(dma_chan->device->dev,
-				"Self-test memset failed compare, disabling\n");
-			err = -ENODEV;
-			goto free_resources;
-		}
-	}
-
 	/* test for non-zero parity sum */
 	zero_sum_result = 0;
 	for (i = 0; i < IOP_ADMA_NUM_SRC_TEST + 1; i++)
@@ -1487,8 +1427,6 @@ static int iop_adma_probe(struct platform_device *pdev)
 	/* set prep routines based on capability */
 	if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask))
 		dma_dev->device_prep_dma_memcpy = iop_adma_prep_dma_memcpy;
-	if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask))
-		dma_dev->device_prep_dma_memset = iop_adma_prep_dma_memset;
 	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
 		dma_dev->max_xor = iop_adma_get_max_xor();
 		dma_dev->device_prep_dma_xor = iop_adma_prep_dma_xor;
@@ -1556,8 +1494,7 @@ static int iop_adma_probe(struct platform_device *pdev)
 			goto err_free_iop_chan;
 	}
 
-	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask) ||
-	    dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) {
+	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
 		ret = iop_adma_xor_val_self_test(adev);
 		dev_dbg(&pdev->dev, "xor self test returned %d\n", ret);
 		if (ret)
@@ -1584,7 +1521,6 @@ static int iop_adma_probe(struct platform_device *pdev)
 		 dma_has_cap(DMA_PQ_VAL, dma_dev->cap_mask) ? "pq_val " : "",
 		 dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
 		 dma_has_cap(DMA_XOR_VAL, dma_dev->cap_mask) ? "xor_val " : "",
-		 dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)  ? "fill " : "",
 		 dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "",
 		 dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : "");
 
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index d64ae14f2706..200f1a3c9a44 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -89,11 +89,6 @@ static void mv_desc_clear_next_desc(struct mv_xor_desc_slot *desc)
 	hw_desc->phy_next_desc = 0;
 }
 
-static void mv_desc_set_block_fill_val(struct mv_xor_desc_slot *desc, u32 val)
-{
-	desc->value = val;
-}
-
 static void mv_desc_set_dest_addr(struct mv_xor_desc_slot *desc,
 				  dma_addr_t addr)
 {
@@ -128,22 +123,6 @@ static void mv_chan_set_next_descriptor(struct mv_xor_chan *chan,
 	__raw_writel(next_desc_addr, XOR_NEXT_DESC(chan));
 }
 
-static void mv_chan_set_dest_pointer(struct mv_xor_chan *chan, u32 desc_addr)
-{
-	__raw_writel(desc_addr, XOR_DEST_POINTER(chan));
-}
-
-static void mv_chan_set_block_size(struct mv_xor_chan *chan, u32 block_size)
-{
-	__raw_writel(block_size, XOR_BLOCK_SIZE(chan));
-}
-
-static void mv_chan_set_value(struct mv_xor_chan *chan, u32 value)
-{
-	__raw_writel(value, XOR_INIT_VALUE_LOW(chan));
-	__raw_writel(value, XOR_INIT_VALUE_HIGH(chan));
-}
-
 static void mv_chan_unmask_interrupts(struct mv_xor_chan *chan)
 {
 	u32 val = __raw_readl(XOR_INTR_MASK(chan));
@@ -186,8 +165,6 @@ static int mv_can_chain(struct mv_xor_desc_slot *desc)
 
 	if (chain_old_tail->type != desc->type)
 		return 0;
-	if (desc->type == DMA_MEMSET)
-		return 0;
 
 	return 1;
 }
@@ -205,9 +182,6 @@ static void mv_set_mode(struct mv_xor_chan *chan,
 	case DMA_MEMCPY:
 		op_mode = XOR_OPERATION_MODE_MEMCPY;
 		break;
-	case DMA_MEMSET:
-		op_mode = XOR_OPERATION_MODE_MEMSET;
-		break;
 	default:
 		dev_err(mv_chan_to_devp(chan),
 			"error: unsupported operation %d\n",
@@ -274,18 +248,9 @@ static void mv_xor_start_new_chain(struct mv_xor_chan *mv_chan,
 	if (sw_desc->type != mv_chan->current_type)
 		mv_set_mode(mv_chan, sw_desc->type);
 
-	if (sw_desc->type == DMA_MEMSET) {
-		/* for memset requests we need to program the engine, no
-		 * descriptors used.
-		 */
-		struct mv_xor_desc *hw_desc = sw_desc->hw_desc;
-		mv_chan_set_dest_pointer(mv_chan, hw_desc->phy_dest_addr);
-		mv_chan_set_block_size(mv_chan, sw_desc->unmap_len);
-		mv_chan_set_value(mv_chan, sw_desc->value);
-	} else {
-		/* set the hardware chain */
-		mv_chan_set_next_descriptor(mv_chan, sw_desc->async_tx.phys);
-	}
+	/* set the hardware chain */
+	mv_chan_set_next_descriptor(mv_chan, sw_desc->async_tx.phys);
+
 	mv_chan->pending += sw_desc->slot_cnt;
 	mv_xor_issue_pending(&mv_chan->dmachan);
 }
@@ -687,43 +652,6 @@ mv_xor_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
 	return sw_desc ? &sw_desc->async_tx : NULL;
 }
 
-static struct dma_async_tx_descriptor *
-mv_xor_prep_dma_memset(struct dma_chan *chan, dma_addr_t dest, int value,
-		       size_t len, unsigned long flags)
-{
-	struct mv_xor_chan *mv_chan = to_mv_xor_chan(chan);
-	struct mv_xor_desc_slot *sw_desc, *grp_start;
-	int slot_cnt;
-
-	dev_dbg(mv_chan_to_devp(mv_chan),
-		"%s dest: %x len: %u flags: %ld\n",
-		__func__, dest, len, flags);
-	if (unlikely(len < MV_XOR_MIN_BYTE_COUNT))
-		return NULL;
-
-	BUG_ON(len > MV_XOR_MAX_BYTE_COUNT);
-
-	spin_lock_bh(&mv_chan->lock);
-	slot_cnt = mv_chan_memset_slot_count(len);
-	sw_desc = mv_xor_alloc_slots(mv_chan, slot_cnt, 1);
-	if (sw_desc) {
-		sw_desc->type = DMA_MEMSET;
-		sw_desc->async_tx.flags = flags;
-		grp_start = sw_desc->group_head;
-		mv_desc_init(grp_start, flags);
-		mv_desc_set_byte_count(grp_start, len);
-		mv_desc_set_dest_addr(sw_desc->group_head, dest);
-		mv_desc_set_block_fill_val(grp_start, value);
-		sw_desc->unmap_src_cnt = 1;
-		sw_desc->unmap_len = len;
-	}
-	spin_unlock_bh(&mv_chan->lock);
-	dev_dbg(mv_chan_to_devp(mv_chan),
-		"%s sw_desc %p async_tx %p \n",
-		__func__, sw_desc, &sw_desc->async_tx);
-	return sw_desc ? &sw_desc->async_tx : NULL;
-}
-
 static struct dma_async_tx_descriptor *
 mv_xor_prep_dma_xor(struct dma_chan *chan, dma_addr_t dest, dma_addr_t *src,
 		    unsigned int src_cnt, size_t len, unsigned long flags)
@@ -1137,8 +1065,6 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
 	/* set prep routines based on capability */
 	if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask))
 		dma_dev->device_prep_dma_memcpy = mv_xor_prep_dma_memcpy;
-	if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask))
-		dma_dev->device_prep_dma_memset = mv_xor_prep_dma_memset;
 	if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
 		dma_dev->max_xor = 8;
 		dma_dev->device_prep_dma_xor = mv_xor_prep_dma_xor;
@@ -1187,9 +1113,8 @@ mv_xor_channel_add(struct mv_xor_device *xordev,
 			goto err_free_irq;
 	}
 
-	dev_info(&pdev->dev, "Marvell XOR: ( %s%s%s%s)\n",
+	dev_info(&pdev->dev, "Marvell XOR: ( %s%s%s)\n",
 		 dma_has_cap(DMA_XOR, dma_dev->cap_mask) ? "xor " : "",
-		 dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)  ? "fill " : "",
 		 dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask) ? "cpy " : "",
 		 dma_has_cap(DMA_INTERRUPT, dma_dev->cap_mask) ? "intr " : "");
 
@@ -1298,8 +1223,6 @@ static int mv_xor_probe(struct platform_device *pdev)
 				dma_cap_set(DMA_MEMCPY, cap_mask);
 			if (of_property_read_bool(np, "dmacap,xor"))
 				dma_cap_set(DMA_XOR, cap_mask);
-			if (of_property_read_bool(np, "dmacap,memset"))
-				dma_cap_set(DMA_MEMSET, cap_mask);
 			if (of_property_read_bool(np, "dmacap,interrupt"))
 				dma_cap_set(DMA_INTERRUPT, cap_mask);
 
diff --git a/drivers/dma/mv_xor.h b/drivers/dma/mv_xor.h
index c632a4761fcf..c619359cb7fe 100644
--- a/drivers/dma/mv_xor.h
+++ b/drivers/dma/mv_xor.h
@@ -31,7 +31,6 @@
 
 #define XOR_OPERATION_MODE_XOR		0
 #define XOR_OPERATION_MODE_MEMCPY	2
-#define XOR_OPERATION_MODE_MEMSET	4
 
 #define XOR_CURR_DESC(chan)	(chan->mmr_base + 0x210 + (chan->idx * 4))
 #define XOR_NEXT_DESC(chan)	(chan->mmr_base + 0x200 + (chan->idx * 4))
diff --git a/drivers/dma/ppc4xx/adma.c b/drivers/dma/ppc4xx/adma.c
index 5d3d95569a1e..1e220f8dfd8c 100644
--- a/drivers/dma/ppc4xx/adma.c
+++ b/drivers/dma/ppc4xx/adma.c
@@ -2322,47 +2322,6 @@ static struct dma_async_tx_descriptor *ppc440spe_adma_prep_dma_memcpy(
 	return sw_desc ? &sw_desc->async_tx : NULL;
 }
 
-/**
- * ppc440spe_adma_prep_dma_memset - prepare CDB for a MEMSET operation
- */
-static struct dma_async_tx_descriptor *ppc440spe_adma_prep_dma_memset(
-		struct dma_chan *chan, dma_addr_t dma_dest, int value,
-		size_t len, unsigned long flags)
-{
-	struct ppc440spe_adma_chan *ppc440spe_chan;
-	struct ppc440spe_adma_desc_slot *sw_desc, *group_start;
-	int slot_cnt, slots_per_op;
-
-	ppc440spe_chan = to_ppc440spe_adma_chan(chan);
-
-	if (unlikely(!len))
-		return NULL;
-
-	BUG_ON(len > PPC440SPE_ADMA_DMA_MAX_BYTE_COUNT);
-
-	spin_lock_bh(&ppc440spe_chan->lock);
-
-	dev_dbg(ppc440spe_chan->device->common.dev,
-		"ppc440spe adma%d: %s cal: %u len: %u int_en %d\n",
-		ppc440spe_chan->device->id, __func__, value, len,
-		flags & DMA_PREP_INTERRUPT ? 1 : 0);
-
-	slot_cnt = slots_per_op = 1;
-	sw_desc = ppc440spe_adma_alloc_slots(ppc440spe_chan, slot_cnt,
-		slots_per_op);
-	if (sw_desc) {
-		group_start = sw_desc->group_head;
-		ppc440spe_desc_init_memset(group_start, value, flags);
-		ppc440spe_adma_set_dest(group_start, dma_dest, 0);
-		ppc440spe_desc_set_byte_count(group_start, ppc440spe_chan, len);
-		sw_desc->unmap_len = len;
-		sw_desc->async_tx.flags = flags;
-	}
-	spin_unlock_bh(&ppc440spe_chan->lock);
-
-	return sw_desc ? &sw_desc->async_tx : NULL;
-}
-
 /**
  * ppc440spe_adma_prep_dma_xor - prepare CDB for a XOR operation
  */
@@ -4125,7 +4084,6 @@ static void ppc440spe_adma_init_capabilities(struct ppc440spe_adma_device *adev)
 	case PPC440SPE_DMA1_ID:
 		dma_cap_set(DMA_MEMCPY, adev->common.cap_mask);
 		dma_cap_set(DMA_INTERRUPT, adev->common.cap_mask);
-		dma_cap_set(DMA_MEMSET, adev->common.cap_mask);
 		dma_cap_set(DMA_PQ, adev->common.cap_mask);
 		dma_cap_set(DMA_PQ_VAL, adev->common.cap_mask);
 		dma_cap_set(DMA_XOR_VAL, adev->common.cap_mask);
@@ -4151,10 +4109,6 @@ static void ppc440spe_adma_init_capabilities(struct ppc440spe_adma_device *adev)
 		adev->common.device_prep_dma_memcpy =
 			ppc440spe_adma_prep_dma_memcpy;
 	}
-	if (dma_has_cap(DMA_MEMSET, adev->common.cap_mask)) {
-		adev->common.device_prep_dma_memset =
-			ppc440spe_adma_prep_dma_memset;
-	}
 	if (dma_has_cap(DMA_XOR, adev->common.cap_mask)) {
 		adev->common.max_xor = XOR_MAX_OPS;
 		adev->common.device_prep_dma_xor =
@@ -4217,7 +4171,6 @@ static void ppc440spe_adma_init_capabilities(struct ppc440spe_adma_device *adev)
 	  dma_has_cap(DMA_XOR, adev->common.cap_mask) ? "xor " : "",
 	  dma_has_cap(DMA_XOR_VAL, adev->common.cap_mask) ? "xor_val " : "",
 	  dma_has_cap(DMA_MEMCPY, adev->common.cap_mask) ? "memcpy " : "",
-	  dma_has_cap(DMA_MEMSET, adev->common.cap_mask)  ? "memset " : "",
 	  dma_has_cap(DMA_INTERRUPT, adev->common.cap_mask) ? "intr " : "");
 }
 
diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h
index a1c486a88e88..179b38ffd351 100644
--- a/include/linux/async_tx.h
+++ b/include/linux/async_tx.h
@@ -182,10 +182,6 @@ async_memcpy(struct page *dest, struct page *src, unsigned int dest_offset,
 	     unsigned int src_offset, size_t len,
 	     struct async_submit_ctl *submit);
 
-struct dma_async_tx_descriptor *
-async_memset(struct page *dest, int val, unsigned int offset,
-	     size_t len, struct async_submit_ctl *submit);
-
 struct dma_async_tx_descriptor *async_trigger_callback(struct async_submit_ctl *submit);
 
 struct dma_async_tx_descriptor *
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 96d3e4ab11a9..cb286b1acdb6 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -66,7 +66,6 @@ enum dma_transaction_type {
 	DMA_PQ,
 	DMA_XOR_VAL,
 	DMA_PQ_VAL,
-	DMA_MEMSET,
 	DMA_INTERRUPT,
 	DMA_SG,
 	DMA_PRIVATE,
@@ -520,7 +519,6 @@ struct dma_tx_state {
  * @device_prep_dma_xor_val: prepares a xor validation operation
  * @device_prep_dma_pq: prepares a pq operation
  * @device_prep_dma_pq_val: prepares a pqzero_sum operation
- * @device_prep_dma_memset: prepares a memset operation
  * @device_prep_dma_interrupt: prepares an end of chain interrupt operation
  * @device_prep_slave_sg: prepares a slave dma operation
  * @device_prep_dma_cyclic: prepare a cyclic dma operation suitable for audio.
@@ -573,9 +571,6 @@ struct dma_device {
 		struct dma_chan *chan, dma_addr_t *pq, dma_addr_t *src,
 		unsigned int src_cnt, const unsigned char *scf, size_t len,
 		enum sum_check_flags *pqres, unsigned long flags);
-	struct dma_async_tx_descriptor *(*device_prep_dma_memset)(
-		struct dma_chan *chan, dma_addr_t dest, int value, size_t len,
-		unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_interrupt)(
 		struct dma_chan *chan, unsigned long flags);
 	struct dma_async_tx_descriptor *(*device_prep_dma_sg)(
-- 
cgit v1.3-7-g2ca7


From 8318fde4ac78f6793b1cbaf57659902253a61617 Mon Sep 17 00:00:00 2001
From: Jingoo Han <jg1.han@samsung.com>
Date: Wed, 3 Jul 2013 15:05:13 -0700
Subject: backlight: add devm_backlight_device_{register,unregister}()

These functions allow the driver core to automatically clean up any
allocation made by backlight drivers.  Thus it simplifies the error
paths.

Signed-off-by: Jingoo Han <jg1.han@samsung.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/backlight/backlight.c | 75 +++++++++++++++++++++++++++++++++++++
 include/linux/backlight.h           |  6 +++
 2 files changed, 81 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index e3c279083253..53c52fbc9395 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -370,6 +370,81 @@ void backlight_device_unregister(struct backlight_device *bd)
 }
 EXPORT_SYMBOL(backlight_device_unregister);
 
+static void devm_backlight_device_release(struct device *dev, void *res)
+{
+	struct backlight_device *backlight = *(struct backlight_device **)res;
+
+	backlight_device_unregister(backlight);
+}
+
+static int devm_backlight_device_match(struct device *dev, void *res,
+					void *data)
+{
+	struct backlight_device **r = res;
+
+	return *r == data;
+}
+
+/**
+ * devm_backlight_device_register - resource managed backlight_device_register()
+ * @dev: the device to register
+ * @name: the name of the device
+ * @parent: a pointer to the parent device
+ * @devdata: an optional pointer to be stored for private driver use
+ * @ops: the backlight operations structure
+ * @props: the backlight properties
+ *
+ * @return a struct backlight on success, or an ERR_PTR on error
+ *
+ * Managed backlight_device_register(). The backlight_device returned
+ * from this function are automatically freed on driver detach.
+ * See backlight_device_register() for more information.
+ */
+struct backlight_device *devm_backlight_device_register(struct device *dev,
+	const char *name, struct device *parent, void *devdata,
+	const struct backlight_ops *ops,
+	const struct backlight_properties *props)
+{
+	struct backlight_device **ptr, *backlight;
+
+	ptr = devres_alloc(devm_backlight_device_release, sizeof(*ptr),
+			GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	backlight = backlight_device_register(name, parent, devdata, ops,
+						props);
+	if (!IS_ERR(backlight)) {
+		*ptr = backlight;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return backlight;
+}
+EXPORT_SYMBOL(devm_backlight_device_register);
+
+/**
+ * devm_backlight_device_unregister - resource managed backlight_device_unregister()
+ * @dev: the device to unregister
+ * @bd: the backlight device to unregister
+ *
+ * Deallocated a backlight allocated with devm_backlight_device_register().
+ * Normally this function will not need to be called and the resource management
+ * code will ensure that the resource is freed.
+ */
+void devm_backlight_device_unregister(struct device *dev,
+				struct backlight_device *bd)
+{
+	int rc;
+
+	rc = devres_release(dev, devm_backlight_device_release,
+				devm_backlight_device_match, bd);
+	WARN_ON(rc);
+}
+EXPORT_SYMBOL(devm_backlight_device_unregister);
+
 #ifdef CONFIG_OF
 static int of_parent_match(struct device *dev, const void *data)
 {
diff --git a/include/linux/backlight.h b/include/linux/backlight.h
index da9a0825e007..53b77949c79d 100644
--- a/include/linux/backlight.h
+++ b/include/linux/backlight.h
@@ -114,7 +114,13 @@ static inline void backlight_update_status(struct backlight_device *bd)
 extern struct backlight_device *backlight_device_register(const char *name,
 	struct device *dev, void *devdata, const struct backlight_ops *ops,
 	const struct backlight_properties *props);
+extern struct backlight_device *devm_backlight_device_register(
+	struct device *dev, const char *name, struct device *parent,
+	void *devdata, const struct backlight_ops *ops,
+	const struct backlight_properties *props);
 extern void backlight_device_unregister(struct backlight_device *bd);
+extern void devm_backlight_device_unregister(struct device *dev,
+					struct backlight_device *bd);
 extern void backlight_force_update(struct backlight_device *bd,
 				   enum backlight_update_reason reason);
 
-- 
cgit v1.3-7-g2ca7


From 1d0c48e66b3f1cf40660f69a87f55af3df0b2ae3 Mon Sep 17 00:00:00 2001
From: Jingoo Han <jg1.han@samsung.com>
Date: Wed, 3 Jul 2013 15:05:14 -0700
Subject: lcd: add devm_lcd_device_{register,unregister}()

These functions allow the driver core to automatically clean up any
allocation made by lcd drivers.  Thus it simplifies the error paths.

Signed-off-by: Jingoo Han <jg1.han@samsung.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/video/backlight/lcd.c | 70 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/lcd.h           |  5 ++++
 2 files changed, 75 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 3649fd9ddb3a..41964a71a036 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -260,6 +260,76 @@ void lcd_device_unregister(struct lcd_device *ld)
 }
 EXPORT_SYMBOL(lcd_device_unregister);
 
+static void devm_lcd_device_release(struct device *dev, void *res)
+{
+	struct lcd_device *lcd = *(struct lcd_device **)res;
+
+	lcd_device_unregister(lcd);
+}
+
+static int devm_lcd_device_match(struct device *dev, void *res, void *data)
+{
+	struct lcd_device **r = res;
+
+	return *r == data;
+}
+
+/**
+ * devm_lcd_device_register - resource managed lcd_device_register()
+ * @dev: the device to register
+ * @name: the name of the device
+ * @parent: a pointer to the parent device
+ * @devdata: an optional pointer to be stored for private driver use
+ * @ops: the lcd operations structure
+ *
+ * @return a struct lcd on success, or an ERR_PTR on error
+ *
+ * Managed lcd_device_register(). The lcd_device returned from this function
+ * are automatically freed on driver detach. See lcd_device_register()
+ * for more information.
+ */
+struct lcd_device *devm_lcd_device_register(struct device *dev,
+		const char *name, struct device *parent,
+		void *devdata, struct lcd_ops *ops)
+{
+	struct lcd_device **ptr, *lcd;
+
+	ptr = devres_alloc(devm_lcd_device_release, sizeof(*ptr), GFP_KERNEL);
+	if (!ptr)
+		return ERR_PTR(-ENOMEM);
+
+	lcd = lcd_device_register(name, parent, devdata, ops);
+	if (!IS_ERR(lcd)) {
+		*ptr = lcd;
+		devres_add(dev, ptr);
+	} else {
+		devres_free(ptr);
+	}
+
+	return lcd;
+}
+EXPORT_SYMBOL(devm_lcd_device_register);
+
+/**
+ * devm_lcd_device_unregister - resource managed lcd_device_unregister()
+ * @dev: the device to unregister
+ * @ld: the lcd device to unregister
+ *
+ * Deallocated a lcd allocated with devm_lcd_device_register(). Normally
+ * this function will not need to be called and the resource management
+ * code will ensure that the resource is freed.
+ */
+void devm_lcd_device_unregister(struct device *dev, struct lcd_device *ld)
+{
+	int rc;
+
+	rc = devres_release(dev, devm_lcd_device_release,
+				devm_lcd_device_match, ld);
+	WARN_ON(rc);
+}
+EXPORT_SYMBOL(devm_lcd_device_unregister);
+
+
 static void __exit lcd_class_exit(void)
 {
 	class_destroy(lcd_class);
diff --git a/include/linux/lcd.h b/include/linux/lcd.h
index e00c3b0ebc6b..504f6246f38f 100644
--- a/include/linux/lcd.h
+++ b/include/linux/lcd.h
@@ -112,7 +112,12 @@ static inline void lcd_set_power(struct lcd_device *ld, int power)
 
 extern struct lcd_device *lcd_device_register(const char *name,
 	struct device *parent, void *devdata, struct lcd_ops *ops);
+extern struct lcd_device *devm_lcd_device_register(struct device *dev,
+	const char *name, struct device *parent,
+	void *devdata, struct lcd_ops *ops);
 extern void lcd_device_unregister(struct lcd_device *ld);
+extern void devm_lcd_device_unregister(struct device *dev,
+	struct lcd_device *ld);
 
 #define to_lcd_device(obj) container_of(obj, struct lcd_device, dev)
 
-- 
cgit v1.3-7-g2ca7


From 81dabb464139324c005159f5afba377104d8828d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:26 -0700
Subject: exit.c: unexport __set_special_pids()

Move __set_special_pids() from exit.c to sys.c close to its single caller
and make it static.

And rename it to set_special_pids(), another helper with this name has
gone away.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  2 --
 kernel/exit.c         | 11 -----------
 kernel/sys.c          | 13 ++++++++++++-
 3 files changed, 12 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ec80684a0127..cdd5407b37e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1950,8 +1950,6 @@ extern struct task_struct *find_task_by_vpid(pid_t nr);
 extern struct task_struct *find_task_by_pid_ns(pid_t nr,
 		struct pid_namespace *ns);
 
-extern void __set_special_pids(struct pid *pid);
-
 /* per-UID process charging. */
 extern struct user_struct * alloc_uid(kuid_t);
 static inline struct user_struct *get_uid(struct user_struct *u)
diff --git a/kernel/exit.c b/kernel/exit.c
index 7bb73f9d09db..3a77cd9390a1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -312,17 +312,6 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
 	}
 }
 
-void __set_special_pids(struct pid *pid)
-{
-	struct task_struct *curr = current->group_leader;
-
-	if (task_session(curr) != pid)
-		change_pid(curr, PIDTYPE_SID, pid);
-
-	if (task_pgrp(curr) != pid)
-		change_pid(curr, PIDTYPE_PGID, pid);
-}
-
 /*
  * Let kernel threads use this to say that they allow a certain signal.
  * Must not be used if kthread was cloned with CLONE_SIGHAND.
diff --git a/kernel/sys.c b/kernel/sys.c
index 7bf50dcc6d53..071de900c824 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1309,6 +1309,17 @@ out:
 	return retval;
 }
 
+static void set_special_pids(struct pid *pid)
+{
+	struct task_struct *curr = current->group_leader;
+
+	if (task_session(curr) != pid)
+		change_pid(curr, PIDTYPE_SID, pid);
+
+	if (task_pgrp(curr) != pid)
+		change_pid(curr, PIDTYPE_PGID, pid);
+}
+
 SYSCALL_DEFINE0(setsid)
 {
 	struct task_struct *group_leader = current->group_leader;
@@ -1328,7 +1339,7 @@ SYSCALL_DEFINE0(setsid)
 		goto out;
 
 	group_leader->signal->leader = 1;
-	__set_special_pids(sid);
+	set_special_pids(sid);
 
 	proc_clear_tty(group_leader);
 
-- 
cgit v1.3-7-g2ca7


From 8190773985141f063e1d6dc10200527c655abfb5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 3 Jul 2013 15:08:31 -0700
Subject: kernel/fork.c:copy_process(): don't add the uninitialized child to
 thread/task/pid lists

copy_process() adds the new child to thread_group/init_task.tasks list and
then does attach_pid(child, PIDTYPE_PID).  This means that the lockless
next_thread() or next_task() can see this thread with the wrong pid.  Say,
"ls /proc/pid/task" can list the same inode twice.

We could move attach_pid(child, PIDTYPE_PID) up, but in this case
find_task_by_vpid() can find the new thread before it was fully
initialized.

And this is already true for PIDTYPE_PGID/PIDTYPE_SID, With this patch
copy_process() initializes child->pids[*].pid first, then calls
attach_pid() to insert the task into the pid->tasks list.

attach_pid() no longer need the "struct pid*" argument, it is always
called after pid_link->pid was already set.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Sergey Dyasly <dserrg@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pid.h |  6 ++----
 kernel/fork.c       | 16 +++++++++++++---
 kernel/pid.c        | 12 ++++--------
 3 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pid.h b/include/linux/pid.h
index a089a3c447fc..23705a53abba 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -86,11 +86,9 @@ extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
 extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
 
 /*
- * attach_pid() and detach_pid() must be called with the tasklist_lock
- * write-held.
+ * these helpers must be called with the tasklist_lock write-held.
  */
-extern void attach_pid(struct task_struct *task, enum pid_type type,
-			struct pid *pid);
+extern void attach_pid(struct task_struct *task, enum pid_type);
 extern void detach_pid(struct task_struct *task, enum pid_type);
 extern void change_pid(struct task_struct *task, enum pid_type,
 			struct pid *pid);
diff --git a/kernel/fork.c b/kernel/fork.c
index 417cb864e20c..7d6962fb6156 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1121,6 +1121,12 @@ static void posix_cpu_timers_init(struct task_struct *tsk)
 	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
 }
 
+static inline void
+init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
+{
+	 task->pids[type].pid = pid;
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1449,7 +1455,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (likely(p->pid)) {
 		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
 
+		init_task_pid(p, PIDTYPE_PID, pid);
 		if (thread_group_leader(p)) {
+			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
+			init_task_pid(p, PIDTYPE_SID, task_session(current));
+
 			if (is_child_reaper(pid)) {
 				ns_of_pid(pid)->child_reaper = p;
 				p->signal->flags |= SIGNAL_UNKILLABLE;
@@ -1457,10 +1467,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 			p->signal->leader_pid = pid;
 			p->signal->tty = tty_kref_get(current->signal->tty);
-			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
-			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail(&p->sibling, &p->real_parent->children);
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
+			attach_pid(p, PIDTYPE_PGID);
+			attach_pid(p, PIDTYPE_SID);
 			__this_cpu_inc(process_counts);
 		} else {
 			current->signal->nr_threads++;
@@ -1470,7 +1480,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			list_add_tail_rcu(&p->thread_group,
 					  &p->group_leader->thread_group);
 		}
-		attach_pid(p, PIDTYPE_PID, pid);
+		attach_pid(p, PIDTYPE_PID);
 		nr_threads++;
 	}
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 0db3e791a06d..61980cefb1f5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -373,14 +373,10 @@ EXPORT_SYMBOL_GPL(find_vpid);
 /*
  * attach_pid() must be called with the tasklist_lock write-held.
  */
-void attach_pid(struct task_struct *task, enum pid_type type,
-		struct pid *pid)
+void attach_pid(struct task_struct *task, enum pid_type type)
 {
-	struct pid_link *link;
-
-	link = &task->pids[type];
-	link->pid = pid;
-	hlist_add_head_rcu(&link->node, &pid->tasks[type]);
+	struct pid_link *link = &task->pids[type];
+	hlist_add_head_rcu(&link->node, &link->pid->tasks[type]);
 }
 
 static void __change_pid(struct task_struct *task, enum pid_type type,
@@ -412,7 +408,7 @@ void change_pid(struct task_struct *task, enum pid_type type,
 		struct pid *pid)
 {
 	__change_pid(task, type, pid);
-	attach_pid(task, type, pid);
+	attach_pid(task, type);
 }
 
 /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
-- 
cgit v1.3-7-g2ca7


From 2ec3ba69faf301fb599e3651515e808e8efa533e Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Wed, 3 Jul 2013 15:08:50 -0700
Subject: rapidio: convert switch drivers to modules

Rework RapidIO switch drivers to add an option to build them as loadable
kernel modules.

This patch removes RapidIO-specific vmlinux section and converts switch
drivers to be compatible with LDM driver registration method.  To simplify
registration of device-specific callback routines this patch introduces
rio_switch_ops data structure.  The sw_sysfs() callback is removed from
the list of device-specific operations because under the new structure its
functions can be handled by switch driver's probe() and remove() routines.

If a specific switch device driver is not loaded the RapidIO subsystem
core will use default standard-based operations to configure a switch.
Because the current implementation of RapidIO enumeration/discovery method
relies on availability of device-specific operations for error management,
switch device drivers must be loaded before the RapidIO
enumeration/discovery starts.

This patch also moves several common routines from enumeration/discovery
module into the RapidIO core code to make switch-specific operations
accessible to all components of RapidIO subsystem.

Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Li Yang <leoli@freescale.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Andre van Herk <andre.van.herk@Prodrive.nl>
Cc: Micha Nelissen <micha.nelissen@Prodrive.nl>
Cc: Stef van Os <stef.van.os@Prodrive.nl>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rapidio/Kconfig             |   5 +
 drivers/rapidio/rio-scan.c          | 171 ++-------------------
 drivers/rapidio/rio-sysfs.c         |   4 -
 drivers/rapidio/rio.c               | 286 ++++++++++++++++++++++++++++++------
 drivers/rapidio/rio.h               |  41 +-----
 drivers/rapidio/switches/Kconfig    |  12 +-
 drivers/rapidio/switches/idt_gen2.c |  98 +++++++++---
 drivers/rapidio/switches/idtcps.c   |  86 +++++++++--
 drivers/rapidio/switches/tsi568.c   |  71 +++++++--
 drivers/rapidio/switches/tsi57x.c   |  81 ++++++++--
 include/asm-generic/vmlinux.lds.h   |   7 -
 include/linux/rio.h                 |  51 +++----
 12 files changed, 568 insertions(+), 345 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig
index 5ab056494bbe..3e3be57e9a1a 100644
--- a/drivers/rapidio/Kconfig
+++ b/drivers/rapidio/Kconfig
@@ -67,4 +67,9 @@ config RAPIDIO_ENUM_BASIC
 
 endchoice
 
+menu "RapidIO Switch drivers"
+	depends on RAPIDIO
+
 source "drivers/rapidio/switches/Kconfig"
+
+endmenu
diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index 4b9b15ee8596..913950212605 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -406,6 +406,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
 		rio_mport_write_config_32(port, destid, hopcount,
 					  RIO_COMPONENT_TAG_CSR, next_comptag);
 		rdev->comp_tag = next_comptag++;
+		rdev->do_enum = true;
 	}  else {
 		rio_mport_read_config_32(port, destid, hopcount,
 					 RIO_COMPONENT_TAG_CSR,
@@ -434,6 +435,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
 		rswitch = rdev->rswitch;
 		rswitch->switchid = rdev->comp_tag & RIO_CTAG_UDEVID;
 		rswitch->port_ok = 0;
+		spin_lock_init(&rswitch->lock);
 		rswitch->route_table = kzalloc(sizeof(u8)*
 					RIO_MAX_ROUTE_ENTRIES(port->sys_size),
 					GFP_KERNEL);
@@ -445,11 +447,9 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
 			rswitch->route_table[rdid] = RIO_INVALID_ROUTE;
 		dev_set_name(&rdev->dev, "%02x:s:%04x", rdev->net->id,
 			     rswitch->switchid);
-		rio_switch_init(rdev, do_enum);
 
-		if (do_enum && rswitch->clr_table)
-			rswitch->clr_table(port, destid, hopcount,
-					   RIO_GLOBAL_TABLE);
+		if (do_enum)
+			rio_route_clr_table(rdev, RIO_GLOBAL_TABLE, 0);
 
 		list_add_tail(&rswitch->node, &net->switches);
 
@@ -532,156 +532,6 @@ rio_sport_is_active(struct rio_mport *port, u16 destid, u8 hopcount, int sport)
 	return result & RIO_PORT_N_ERR_STS_PORT_OK;
 }
 
-/**
- * rio_lock_device - Acquires host device lock for specified device
- * @port: Master port to send transaction
- * @destid: Destination ID for device/switch
- * @hopcount: Hopcount to reach switch
- * @wait_ms: Max wait time in msec (0 = no timeout)
- *
- * Attepts to acquire host device lock for specified device
- * Returns 0 if device lock acquired or EINVAL if timeout expires.
- */
-static int
-rio_lock_device(struct rio_mport *port, u16 destid, u8 hopcount, int wait_ms)
-{
-	u32 result;
-	int tcnt = 0;
-
-	/* Attempt to acquire device lock */
-	rio_mport_write_config_32(port, destid, hopcount,
-				  RIO_HOST_DID_LOCK_CSR, port->host_deviceid);
-	rio_mport_read_config_32(port, destid, hopcount,
-				 RIO_HOST_DID_LOCK_CSR, &result);
-
-	while (result != port->host_deviceid) {
-		if (wait_ms != 0 && tcnt == wait_ms) {
-			pr_debug("RIO: timeout when locking device %x:%x\n",
-				destid, hopcount);
-			return -EINVAL;
-		}
-
-		/* Delay a bit */
-		mdelay(1);
-		tcnt++;
-		/* Try to acquire device lock again */
-		rio_mport_write_config_32(port, destid,
-			hopcount,
-			RIO_HOST_DID_LOCK_CSR,
-			port->host_deviceid);
-		rio_mport_read_config_32(port, destid,
-			hopcount,
-			RIO_HOST_DID_LOCK_CSR, &result);
-	}
-
-	return 0;
-}
-
-/**
- * rio_unlock_device - Releases host device lock for specified device
- * @port: Master port to send transaction
- * @destid: Destination ID for device/switch
- * @hopcount: Hopcount to reach switch
- *
- * Returns 0 if device lock released or EINVAL if fails.
- */
-static int
-rio_unlock_device(struct rio_mport *port, u16 destid, u8 hopcount)
-{
-	u32 result;
-
-	/* Release device lock */
-	rio_mport_write_config_32(port, destid,
-				  hopcount,
-				  RIO_HOST_DID_LOCK_CSR,
-				  port->host_deviceid);
-	rio_mport_read_config_32(port, destid, hopcount,
-		RIO_HOST_DID_LOCK_CSR, &result);
-	if ((result & 0xffff) != 0xffff) {
-		pr_debug("RIO: badness when releasing device lock %x:%x\n",
-			 destid, hopcount);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-/**
- * rio_route_add_entry- Add a route entry to a switch routing table
- * @rdev: RIO device
- * @table: Routing table ID
- * @route_destid: Destination ID to be routed
- * @route_port: Port number to be routed
- * @lock: lock switch device flag
- *
- * Calls the switch specific add_entry() method to add a route entry
- * on a switch. The route table can be specified using the @table
- * argument if a switch has per port routing tables or the normal
- * use is to specific all tables (or the global table) by passing
- * %RIO_GLOBAL_TABLE in @table. Returns %0 on success or %-EINVAL
- * on failure.
- */
-static int
-rio_route_add_entry(struct rio_dev *rdev,
-		    u16 table, u16 route_destid, u8 route_port, int lock)
-{
-	int rc;
-
-	if (lock) {
-		rc = rio_lock_device(rdev->net->hport, rdev->destid,
-				     rdev->hopcount, 1000);
-		if (rc)
-			return rc;
-	}
-
-	rc = rdev->rswitch->add_entry(rdev->net->hport, rdev->destid,
-				      rdev->hopcount, table,
-				      route_destid, route_port);
-	if (lock)
-		rio_unlock_device(rdev->net->hport, rdev->destid,
-				  rdev->hopcount);
-
-	return rc;
-}
-
-/**
- * rio_route_get_entry- Read a route entry in a switch routing table
- * @rdev: RIO device
- * @table: Routing table ID
- * @route_destid: Destination ID to be routed
- * @route_port: Pointer to read port number into
- * @lock: lock switch device flag
- *
- * Calls the switch specific get_entry() method to read a route entry
- * in a switch. The route table can be specified using the @table
- * argument if a switch has per port routing tables or the normal
- * use is to specific all tables (or the global table) by passing
- * %RIO_GLOBAL_TABLE in @table. Returns %0 on success or %-EINVAL
- * on failure.
- */
-static int
-rio_route_get_entry(struct rio_dev *rdev, u16 table,
-		    u16 route_destid, u8 *route_port, int lock)
-{
-	int rc;
-
-	if (lock) {
-		rc = rio_lock_device(rdev->net->hport, rdev->destid,
-				     rdev->hopcount, 1000);
-		if (rc)
-			return rc;
-	}
-
-	rc = rdev->rswitch->get_entry(rdev->net->hport, rdev->destid,
-				      rdev->hopcount, table,
-				      route_destid, route_port);
-	if (lock)
-		rio_unlock_device(rdev->net->hport, rdev->destid,
-				  rdev->hopcount);
-
-	return rc;
-}
-
 /**
  * rio_get_host_deviceid_lock- Reads the Host Device ID Lock CSR on a device
  * @port: Master port to send transaction
@@ -1094,12 +944,9 @@ static void rio_update_route_tables(struct rio_net *net)
 
 				sport = RIO_GET_PORT_NUM(swrdev->swpinfo);
 
-				if (rswitch->add_entry)	{
-					rio_route_add_entry(swrdev,
-						RIO_GLOBAL_TABLE, destid,
-						sport, 0);
-					rswitch->route_table[destid] = sport;
-				}
+				rio_route_add_entry(swrdev, RIO_GLOBAL_TABLE,
+						    destid, sport, 0);
+				rswitch->route_table[destid] = sport;
 			}
 		}
 	}
@@ -1115,8 +962,8 @@ static void rio_update_route_tables(struct rio_net *net)
 static void rio_init_em(struct rio_dev *rdev)
 {
 	if (rio_is_switch(rdev) && (rdev->em_efptr) &&
-	    (rdev->rswitch->em_init)) {
-		rdev->rswitch->em_init(rdev);
+	    rdev->rswitch->ops && rdev->rswitch->ops->em_init) {
+		rdev->rswitch->ops->em_init(rdev);
 	}
 }
 
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index 66d4acd5e18f..864e52f193e1 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -257,8 +257,6 @@ int rio_create_sysfs_dev_files(struct rio_dev *rdev)
 		err |= device_create_file(&rdev->dev, &dev_attr_routes);
 		err |= device_create_file(&rdev->dev, &dev_attr_lnext);
 		err |= device_create_file(&rdev->dev, &dev_attr_hopcount);
-		if (!err && rdev->rswitch->sw_sysfs)
-			err = rdev->rswitch->sw_sysfs(rdev, RIO_SW_SYSFS_CREATE);
 	}
 
 	if (err)
@@ -281,8 +279,6 @@ void rio_remove_sysfs_dev_files(struct rio_dev *rdev)
 		device_remove_file(&rdev->dev, &dev_attr_routes);
 		device_remove_file(&rdev->dev, &dev_attr_lnext);
 		device_remove_file(&rdev->dev, &dev_attr_hopcount);
-		if (rdev->rswitch->sw_sysfs)
-			rdev->rswitch->sw_sysfs(rdev, RIO_SW_SYSFS_REMOVE);
 	}
 }
 
diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c
index cb1c08996fbb..b17d5218005e 100644
--- a/drivers/rapidio/rio.c
+++ b/drivers/rapidio/rio.c
@@ -7,7 +7,6 @@
  *
  * Copyright 2009 Integrated Device Technology, Inc.
  * Alex Bounine <alexandre.bounine@idt.com>
- * - Added Port-Write/Error Management initialization and handling
  *
  * This program is free software; you can redistribute  it and/or modify it
  * under  the terms of  the GNU General  Public License as published by the
@@ -579,44 +578,6 @@ int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock)
 }
 EXPORT_SYMBOL_GPL(rio_set_port_lockout);
 
-/**
- * rio_switch_init - Sets switch operations for a particular vendor switch
- * @rdev: RIO device
- * @do_enum: Enumeration/Discovery mode flag
- *
- * Searches the RIO switch ops table for known switch types. If the vid
- * and did match a switch table entry, then call switch initialization
- * routine to setup switch-specific routines.
- */
-void rio_switch_init(struct rio_dev *rdev, int do_enum)
-{
-	struct rio_switch_ops *cur = __start_rio_switch_ops;
-	struct rio_switch_ops *end = __end_rio_switch_ops;
-
-	while (cur < end) {
-		if ((cur->vid == rdev->vid) && (cur->did == rdev->did)) {
-			pr_debug("RIO: calling init routine for %s\n",
-				 rio_name(rdev));
-			cur->init_hook(rdev, do_enum);
-			break;
-		}
-		cur++;
-	}
-
-	if ((cur >= end) && (rdev->pef & RIO_PEF_STD_RT)) {
-		pr_debug("RIO: adding STD routing ops for %s\n",
-			rio_name(rdev));
-		rdev->rswitch->add_entry = rio_std_route_add_entry;
-		rdev->rswitch->get_entry = rio_std_route_get_entry;
-		rdev->rswitch->clr_table = rio_std_route_clr_table;
-	}
-
-	if (!rdev->rswitch->add_entry || !rdev->rswitch->get_entry)
-		printk(KERN_ERR "RIO: missing routing ops for %s\n",
-		       rio_name(rdev));
-}
-EXPORT_SYMBOL_GPL(rio_switch_init);
-
 /**
  * rio_enable_rx_tx_port - enable input receiver and output transmitter of
  * given port
@@ -970,8 +931,8 @@ int rio_inb_pwrite_handler(union rio_pw_msg *pw_msg)
 	/*
 	 * Process the port-write notification from switch
 	 */
-	if (rdev->rswitch->em_handle)
-		rdev->rswitch->em_handle(rdev, portnum);
+	if (rdev->rswitch->ops && rdev->rswitch->ops->em_handle)
+		rdev->rswitch->ops->em_handle(rdev, portnum);
 
 	rio_read_config_32(rdev,
 			rdev->phys_efptr + RIO_PORT_N_ERR_STS_CSR(portnum),
@@ -1207,8 +1168,9 @@ struct rio_dev *rio_get_device(u16 vid, u16 did, struct rio_dev *from)
  * @route_destid: destID entry in the RT
  * @route_port: destination port for specified destID
  */
-int rio_std_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount,
-		       u16 table, u16 route_destid, u8 route_port)
+static int
+rio_std_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount,
+			u16 table, u16 route_destid, u8 route_port)
 {
 	if (table == RIO_GLOBAL_TABLE) {
 		rio_mport_write_config_32(mport, destid, hopcount,
@@ -1234,8 +1196,9 @@ int rio_std_route_add_entry(struct rio_mport *mport, u16 destid, u8 hopcount,
  * @route_destid: destID entry in the RT
  * @route_port: returned destination port for specified destID
  */
-int rio_std_route_get_entry(struct rio_mport *mport, u16 destid, u8 hopcount,
-		       u16 table, u16 route_destid, u8 *route_port)
+static int
+rio_std_route_get_entry(struct rio_mport *mport, u16 destid, u8 hopcount,
+			u16 table, u16 route_destid, u8 *route_port)
 {
 	u32 result;
 
@@ -1259,8 +1222,9 @@ int rio_std_route_get_entry(struct rio_mport *mport, u16 destid, u8 hopcount,
  * @hopcount: Number of switch hops to the device
  * @table: routing table ID (global or port-specific)
  */
-int rio_std_route_clr_table(struct rio_mport *mport, u16 destid, u8 hopcount,
-		       u16 table)
+static int
+rio_std_route_clr_table(struct rio_mport *mport, u16 destid, u8 hopcount,
+			u16 table)
 {
 	u32 max_destid = 0xff;
 	u32 i, pef, id_inc = 1, ext_cfg = 0;
@@ -1301,6 +1265,234 @@ int rio_std_route_clr_table(struct rio_mport *mport, u16 destid, u8 hopcount,
 	return 0;
 }
 
+/**
+ * rio_lock_device - Acquires host device lock for specified device
+ * @port: Master port to send transaction
+ * @destid: Destination ID for device/switch
+ * @hopcount: Hopcount to reach switch
+ * @wait_ms: Max wait time in msec (0 = no timeout)
+ *
+ * Attepts to acquire host device lock for specified device
+ * Returns 0 if device lock acquired or EINVAL if timeout expires.
+ */
+int rio_lock_device(struct rio_mport *port, u16 destid,
+		    u8 hopcount, int wait_ms)
+{
+	u32 result;
+	int tcnt = 0;
+
+	/* Attempt to acquire device lock */
+	rio_mport_write_config_32(port, destid, hopcount,
+				  RIO_HOST_DID_LOCK_CSR, port->host_deviceid);
+	rio_mport_read_config_32(port, destid, hopcount,
+				 RIO_HOST_DID_LOCK_CSR, &result);
+
+	while (result != port->host_deviceid) {
+		if (wait_ms != 0 && tcnt == wait_ms) {
+			pr_debug("RIO: timeout when locking device %x:%x\n",
+				destid, hopcount);
+			return -EINVAL;
+		}
+
+		/* Delay a bit */
+		mdelay(1);
+		tcnt++;
+		/* Try to acquire device lock again */
+		rio_mport_write_config_32(port, destid,
+			hopcount,
+			RIO_HOST_DID_LOCK_CSR,
+			port->host_deviceid);
+		rio_mport_read_config_32(port, destid,
+			hopcount,
+			RIO_HOST_DID_LOCK_CSR, &result);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rio_lock_device);
+
+/**
+ * rio_unlock_device - Releases host device lock for specified device
+ * @port: Master port to send transaction
+ * @destid: Destination ID for device/switch
+ * @hopcount: Hopcount to reach switch
+ *
+ * Returns 0 if device lock released or EINVAL if fails.
+ */
+int rio_unlock_device(struct rio_mport *port, u16 destid, u8 hopcount)
+{
+	u32 result;
+
+	/* Release device lock */
+	rio_mport_write_config_32(port, destid,
+				  hopcount,
+				  RIO_HOST_DID_LOCK_CSR,
+				  port->host_deviceid);
+	rio_mport_read_config_32(port, destid, hopcount,
+		RIO_HOST_DID_LOCK_CSR, &result);
+	if ((result & 0xffff) != 0xffff) {
+		pr_debug("RIO: badness when releasing device lock %x:%x\n",
+			 destid, hopcount);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rio_unlock_device);
+
+/**
+ * rio_route_add_entry- Add a route entry to a switch routing table
+ * @rdev: RIO device
+ * @table: Routing table ID
+ * @route_destid: Destination ID to be routed
+ * @route_port: Port number to be routed
+ * @lock: apply a hardware lock on switch device flag (1=lock, 0=no_lock)
+ *
+ * If available calls the switch specific add_entry() method to add a route
+ * entry into a switch routing table. Otherwise uses standard RT update method
+ * as defined by RapidIO specification. A specific routing table can be selected
+ * using the @table argument if a switch has per port routing tables or
+ * the standard (or global) table may be used by passing
+ * %RIO_GLOBAL_TABLE in @table.
+ *
+ * Returns %0 on success or %-EINVAL on failure.
+ */
+int rio_route_add_entry(struct rio_dev *rdev,
+			u16 table, u16 route_destid, u8 route_port, int lock)
+{
+	int rc = -EINVAL;
+	struct rio_switch_ops *ops = rdev->rswitch->ops;
+
+	if (lock) {
+		rc = rio_lock_device(rdev->net->hport, rdev->destid,
+				     rdev->hopcount, 1000);
+		if (rc)
+			return rc;
+	}
+
+	spin_lock(&rdev->rswitch->lock);
+
+	if (ops == NULL || ops->add_entry == NULL) {
+		rc = rio_std_route_add_entry(rdev->net->hport, rdev->destid,
+					     rdev->hopcount, table,
+					     route_destid, route_port);
+	} else if (try_module_get(ops->owner)) {
+		rc = ops->add_entry(rdev->net->hport, rdev->destid,
+				    rdev->hopcount, table, route_destid,
+				    route_port);
+		module_put(ops->owner);
+	}
+
+	spin_unlock(&rdev->rswitch->lock);
+
+	if (lock)
+		rio_unlock_device(rdev->net->hport, rdev->destid,
+				  rdev->hopcount);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(rio_route_add_entry);
+
+/**
+ * rio_route_get_entry- Read an entry from a switch routing table
+ * @rdev: RIO device
+ * @table: Routing table ID
+ * @route_destid: Destination ID to be routed
+ * @route_port: Pointer to read port number into
+ * @lock: apply a hardware lock on switch device flag (1=lock, 0=no_lock)
+ *
+ * If available calls the switch specific get_entry() method to fetch a route
+ * entry from a switch routing table. Otherwise uses standard RT read method
+ * as defined by RapidIO specification. A specific routing table can be selected
+ * using the @table argument if a switch has per port routing tables or
+ * the standard (or global) table may be used by passing
+ * %RIO_GLOBAL_TABLE in @table.
+ *
+ * Returns %0 on success or %-EINVAL on failure.
+ */
+int rio_route_get_entry(struct rio_dev *rdev, u16 table,
+			u16 route_destid, u8 *route_port, int lock)
+{
+	int rc = -EINVAL;
+	struct rio_switch_ops *ops = rdev->rswitch->ops;
+
+	if (lock) {
+		rc = rio_lock_device(rdev->net->hport, rdev->destid,
+				     rdev->hopcount, 1000);
+		if (rc)
+			return rc;
+	}
+
+	spin_lock(&rdev->rswitch->lock);
+
+	if (ops == NULL || ops->get_entry == NULL) {
+		rc = rio_std_route_get_entry(rdev->net->hport, rdev->destid,
+					     rdev->hopcount, table,
+					     route_destid, route_port);
+	} else if (try_module_get(ops->owner)) {
+		rc = ops->get_entry(rdev->net->hport, rdev->destid,
+				    rdev->hopcount, table, route_destid,
+				    route_port);
+		module_put(ops->owner);
+	}
+
+	spin_unlock(&rdev->rswitch->lock);
+
+	if (lock)
+		rio_unlock_device(rdev->net->hport, rdev->destid,
+				  rdev->hopcount);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(rio_route_get_entry);
+
+/**
+ * rio_route_clr_table - Clear a switch routing table
+ * @rdev: RIO device
+ * @table: Routing table ID
+ * @lock: apply a hardware lock on switch device flag (1=lock, 0=no_lock)
+ *
+ * If available calls the switch specific clr_table() method to clear a switch
+ * routing table. Otherwise uses standard RT write method as defined by RapidIO
+ * specification. A specific routing table can be selected using the @table
+ * argument if a switch has per port routing tables or the standard (or global)
+ * table may be used by passing %RIO_GLOBAL_TABLE in @table.
+ *
+ * Returns %0 on success or %-EINVAL on failure.
+ */
+int rio_route_clr_table(struct rio_dev *rdev, u16 table, int lock)
+{
+	int rc = -EINVAL;
+	struct rio_switch_ops *ops = rdev->rswitch->ops;
+
+	if (lock) {
+		rc = rio_lock_device(rdev->net->hport, rdev->destid,
+				     rdev->hopcount, 1000);
+		if (rc)
+			return rc;
+	}
+
+	spin_lock(&rdev->rswitch->lock);
+
+	if (ops == NULL || ops->clr_table == NULL) {
+		rc = rio_std_route_clr_table(rdev->net->hport, rdev->destid,
+					     rdev->hopcount, table);
+	} else if (try_module_get(ops->owner)) {
+		rc = ops->clr_table(rdev->net->hport, rdev->destid,
+				    rdev->hopcount, table);
+
+		module_put(ops->owner);
+	}
+
+	spin_unlock(&rdev->rswitch->lock);
+
+	if (lock)
+		rio_unlock_device(rdev->net->hport, rdev->destid,
+				  rdev->hopcount);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(rio_route_clr_table);
+
 #ifdef CONFIG_RAPIDIO_DMA_ENGINE
 
 static bool rio_chan_filter(struct dma_chan *chan, void *arg)
diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h
index c14f864dea5c..d59587762c76 100644
--- a/drivers/rapidio/rio.h
+++ b/drivers/rapidio/rio.h
@@ -28,18 +28,17 @@ extern u32 rio_mport_get_efb(struct rio_mport *port, int local, u16 destid,
 extern int rio_mport_chk_dev_access(struct rio_mport *mport, u16 destid,
 				    u8 hopcount);
 extern int rio_create_sysfs_dev_files(struct rio_dev *rdev);
-extern int rio_std_route_add_entry(struct rio_mport *mport, u16 destid,
-				   u8 hopcount, u16 table, u16 route_destid,
-				   u8 route_port);
-extern int rio_std_route_get_entry(struct rio_mport *mport, u16 destid,
-				   u8 hopcount, u16 table, u16 route_destid,
-				   u8 *route_port);
-extern int rio_std_route_clr_table(struct rio_mport *mport, u16 destid,
-				   u8 hopcount, u16 table);
+extern int rio_lock_device(struct rio_mport *port, u16 destid,
+			u8 hopcount, int wait_ms);
+extern int rio_unlock_device(struct rio_mport *port, u16 destid, u8 hopcount);
+extern int rio_route_add_entry(struct rio_dev *rdev,
+			u16 table, u16 route_destid, u8 route_port, int lock);
+extern int rio_route_get_entry(struct rio_dev *rdev, u16 table,
+			u16 route_destid, u8 *route_port, int lock);
+extern int rio_route_clr_table(struct rio_dev *rdev, u16 table, int lock);
 extern int rio_set_port_lockout(struct rio_dev *rdev, u32 pnum, int lock);
 extern struct rio_dev *rio_get_comptag(u32 comp_tag, struct rio_dev *from);
 extern int rio_add_device(struct rio_dev *rdev);
-extern void rio_switch_init(struct rio_dev *rdev, int do_enum);
 extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid,
 				 u8 hopcount, u8 port_num);
 extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops);
@@ -51,29 +50,5 @@ extern struct rio_mport *rio_find_mport(int mport_id);
 extern struct device_attribute rio_dev_attrs[];
 extern struct bus_attribute rio_bus_attrs[];
 
-extern struct rio_switch_ops __start_rio_switch_ops[];
-extern struct rio_switch_ops __end_rio_switch_ops[];
-
-/* Helpers internal to the RIO core code */
-#define DECLARE_RIO_SWITCH_SECTION(section, name, vid, did, init_hook) \
-	static const struct rio_switch_ops __rio_switch_##name __used \
-	__section(section) = { vid, did, init_hook };
-
-/**
- * DECLARE_RIO_SWITCH_INIT - Registers switch initialization routine
- * @vid: RIO vendor ID
- * @did: RIO device ID
- * @init_hook: Callback that performs switch-specific initialization
- *
- * Manipulating switch route tables and error management in RIO
- * is switch specific. This registers a switch by vendor and device ID with
- * initialization callback for setting up switch operations and (if required)
- * hardware initialization. A &struct rio_switch_ops is initialized with
- * pointer to the init routine and placed into a RIO-specific kernel section.
- */
-#define DECLARE_RIO_SWITCH_INIT(vid, did, init_hook)		\
-	DECLARE_RIO_SWITCH_SECTION(.rio_switch_ops, vid##did, \
-			vid, did, init_hook)
-
 #define RIO_GET_DID(size, x)	(size ? (x & 0xffff) : ((x & 0x00ff0000) >> 16))
 #define RIO_SET_DID(size, x)	(size ? (x & 0xffff) : ((x & 0x000000ff) << 16))
diff --git a/drivers/rapidio/switches/Kconfig b/drivers/rapidio/switches/Kconfig
index 62d4a065230f..345841562f95 100644
--- a/drivers/rapidio/switches/Kconfig
+++ b/drivers/rapidio/switches/Kconfig
@@ -2,27 +2,23 @@
 # RapidIO switches configuration
 #
 config RAPIDIO_TSI57X
-	bool "IDT Tsi57x SRIO switches support"
-	depends on RAPIDIO
+	tristate "IDT Tsi57x SRIO switches support"
 	---help---
 	  Includes support for IDT Tsi57x family of serial RapidIO switches.
 
 config RAPIDIO_CPS_XX
-	bool "IDT CPS-xx SRIO switches support"
-	depends on RAPIDIO
+	tristate "IDT CPS-xx SRIO switches support"
 	---help---
 	  Includes support for IDT CPS-16/12/10/8 serial RapidIO switches.
 
 config RAPIDIO_TSI568
-	bool "Tsi568 SRIO switch support"
-	depends on RAPIDIO
+	tristate "Tsi568 SRIO switch support"
 	default n
 	---help---
 	  Includes support for IDT Tsi568 serial RapidIO switch.
 
 config RAPIDIO_CPS_GEN2
-	bool "IDT CPS Gen.2 SRIO switch support"
-	depends on RAPIDIO
+	tristate "IDT CPS Gen.2 SRIO switch support"
 	default n
 	---help---
 	  Includes support for ITD CPS Gen.2 serial RapidIO switches.
diff --git a/drivers/rapidio/switches/idt_gen2.c b/drivers/rapidio/switches/idt_gen2.c
index 809b7a3336ba..00a71ebb5cac 100644
--- a/drivers/rapidio/switches/idt_gen2.c
+++ b/drivers/rapidio/switches/idt_gen2.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/stat.h>
+#include <linux/module.h>
 #include <linux/rio.h>
 #include <linux/rio_drv.h>
 #include <linux/rio_ids.h>
@@ -387,12 +388,12 @@ idtg2_show_errlog(struct device *dev, struct device_attribute *attr, char *buf)
 
 static DEVICE_ATTR(errlog, S_IRUGO, idtg2_show_errlog, NULL);
 
-static int idtg2_sysfs(struct rio_dev *rdev, int create)
+static int idtg2_sysfs(struct rio_dev *rdev, bool create)
 {
 	struct device *dev = &rdev->dev;
 	int err = 0;
 
-	if (create == RIO_SW_SYSFS_CREATE) {
+	if (create) {
 		/* Initialize sysfs entries */
 		err = device_create_file(dev, &dev_attr_errlog);
 		if (err)
@@ -403,29 +404,90 @@ static int idtg2_sysfs(struct rio_dev *rdev, int create)
 	return err;
 }
 
-static int idtg2_switch_init(struct rio_dev *rdev, int do_enum)
+static struct rio_switch_ops idtg2_switch_ops = {
+	.owner = THIS_MODULE,
+	.add_entry = idtg2_route_add_entry,
+	.get_entry = idtg2_route_get_entry,
+	.clr_table = idtg2_route_clr_table,
+	.set_domain = idtg2_set_domain,
+	.get_domain = idtg2_get_domain,
+	.em_init = idtg2_em_init,
+	.em_handle = idtg2_em_handler,
+};
+
+static int idtg2_probe(struct rio_dev *rdev, const struct rio_device_id *id)
 {
 	pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev));
-	rdev->rswitch->add_entry = idtg2_route_add_entry;
-	rdev->rswitch->get_entry = idtg2_route_get_entry;
-	rdev->rswitch->clr_table = idtg2_route_clr_table;
-	rdev->rswitch->set_domain = idtg2_set_domain;
-	rdev->rswitch->get_domain = idtg2_get_domain;
-	rdev->rswitch->em_init = idtg2_em_init;
-	rdev->rswitch->em_handle = idtg2_em_handler;
-	rdev->rswitch->sw_sysfs = idtg2_sysfs;
-
-	if (do_enum) {
+
+	spin_lock(&rdev->rswitch->lock);
+
+	if (rdev->rswitch->ops) {
+		spin_unlock(&rdev->rswitch->lock);
+		return -EINVAL;
+	}
+
+	rdev->rswitch->ops = &idtg2_switch_ops;
+
+	if (rdev->do_enum) {
 		/* Ensure that default routing is disabled on startup */
 		rio_write_config_32(rdev,
 				    RIO_STD_RTE_DEFAULT_PORT, IDT_NO_ROUTE);
 	}
 
+	/* Create device-specific sysfs attributes */
+	idtg2_sysfs(rdev, true);
+
+	spin_unlock(&rdev->rswitch->lock);
 	return 0;
 }
 
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS1848, idtg2_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS1616, idtg2_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTVPS1616, idtg2_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTSPS1616, idtg2_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS1432, idtg2_switch_init);
+static void idtg2_remove(struct rio_dev *rdev)
+{
+	pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev));
+	spin_lock(&rdev->rswitch->lock);
+	if (rdev->rswitch->ops != &idtg2_switch_ops) {
+		spin_unlock(&rdev->rswitch->lock);
+		return;
+	}
+	rdev->rswitch->ops = NULL;
+
+	/* Remove device-specific sysfs attributes */
+	idtg2_sysfs(rdev, false);
+
+	spin_unlock(&rdev->rswitch->lock);
+}
+
+static struct rio_device_id idtg2_id_table[] = {
+	{RIO_DEVICE(RIO_DID_IDTCPS1848, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDTCPS1616, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDTVPS1616, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDTSPS1616, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDTCPS1432, RIO_VID_IDT)},
+	{ 0, }	/* terminate list */
+};
+
+static struct rio_driver idtg2_driver = {
+	.name = "idt_gen2",
+	.id_table = idtg2_id_table,
+	.probe = idtg2_probe,
+	.remove = idtg2_remove,
+};
+
+static int __init idtg2_init(void)
+{
+	return rio_register_driver(&idtg2_driver);
+}
+
+static void __exit idtg2_exit(void)
+{
+	pr_debug("RIO: %s\n", __func__);
+	rio_unregister_driver(&idtg2_driver);
+	pr_debug("RIO: %s done\n", __func__);
+}
+
+device_initcall(idtg2_init);
+module_exit(idtg2_exit);
+
+MODULE_DESCRIPTION("IDT CPS Gen.2 Serial RapidIO switch family driver");
+MODULE_AUTHOR("Integrated Device Technology, Inc.");
+MODULE_LICENSE("GPL");
diff --git a/drivers/rapidio/switches/idtcps.c b/drivers/rapidio/switches/idtcps.c
index d06ee2d44b44..7fbb60d31796 100644
--- a/drivers/rapidio/switches/idtcps.c
+++ b/drivers/rapidio/switches/idtcps.c
@@ -13,6 +13,7 @@
 #include <linux/rio.h>
 #include <linux/rio_drv.h>
 #include <linux/rio_ids.h>
+#include <linux/module.h>
 #include "../rio.h"
 
 #define CPS_DEFAULT_ROUTE	0xde
@@ -118,18 +119,31 @@ idtcps_get_domain(struct rio_mport *mport, u16 destid, u8 hopcount,
 	return 0;
 }
 
-static int idtcps_switch_init(struct rio_dev *rdev, int do_enum)
+static struct rio_switch_ops idtcps_switch_ops = {
+	.owner = THIS_MODULE,
+	.add_entry = idtcps_route_add_entry,
+	.get_entry = idtcps_route_get_entry,
+	.clr_table = idtcps_route_clr_table,
+	.set_domain = idtcps_set_domain,
+	.get_domain = idtcps_get_domain,
+	.em_init = NULL,
+	.em_handle = NULL,
+};
+
+static int idtcps_probe(struct rio_dev *rdev, const struct rio_device_id *id)
 {
 	pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev));
-	rdev->rswitch->add_entry = idtcps_route_add_entry;
-	rdev->rswitch->get_entry = idtcps_route_get_entry;
-	rdev->rswitch->clr_table = idtcps_route_clr_table;
-	rdev->rswitch->set_domain = idtcps_set_domain;
-	rdev->rswitch->get_domain = idtcps_get_domain;
-	rdev->rswitch->em_init = NULL;
-	rdev->rswitch->em_handle = NULL;
-
-	if (do_enum) {
+
+	spin_lock(&rdev->rswitch->lock);
+
+	if (rdev->rswitch->ops) {
+		spin_unlock(&rdev->rswitch->lock);
+		return -EINVAL;
+	}
+
+	rdev->rswitch->ops = &idtcps_switch_ops;
+
+	if (rdev->do_enum) {
 		/* set TVAL = ~50us */
 		rio_write_config_32(rdev,
 			rdev->phys_efptr + RIO_PORT_LINKTO_CTL_CSR, 0x8e << 8);
@@ -138,12 +152,52 @@ static int idtcps_switch_init(struct rio_dev *rdev, int do_enum)
 				    RIO_STD_RTE_DEFAULT_PORT, CPS_NO_ROUTE);
 	}
 
+	spin_unlock(&rdev->rswitch->lock);
 	return 0;
 }
 
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS6Q, idtcps_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS8, idtcps_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS10Q, idtcps_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS12, idtcps_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDTCPS16, idtcps_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_IDT, RIO_DID_IDT70K200, idtcps_switch_init);
+static void idtcps_remove(struct rio_dev *rdev)
+{
+	pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev));
+	spin_lock(&rdev->rswitch->lock);
+	if (rdev->rswitch->ops != &idtcps_switch_ops) {
+		spin_unlock(&rdev->rswitch->lock);
+		return;
+	}
+	rdev->rswitch->ops = NULL;
+	spin_unlock(&rdev->rswitch->lock);
+}
+
+static struct rio_device_id idtcps_id_table[] = {
+	{RIO_DEVICE(RIO_DID_IDTCPS6Q, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDTCPS8, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDTCPS10Q, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDTCPS12, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDTCPS16, RIO_VID_IDT)},
+	{RIO_DEVICE(RIO_DID_IDT70K200, RIO_VID_IDT)},
+	{ 0, }	/* terminate list */
+};
+
+static struct rio_driver idtcps_driver = {
+	.name = "idtcps",
+	.id_table = idtcps_id_table,
+	.probe = idtcps_probe,
+	.remove = idtcps_remove,
+};
+
+static int __init idtcps_init(void)
+{
+	return rio_register_driver(&idtcps_driver);
+}
+
+static void __exit idtcps_exit(void)
+{
+	rio_unregister_driver(&idtcps_driver);
+}
+
+device_initcall(idtcps_init);
+module_exit(idtcps_exit);
+
+MODULE_DESCRIPTION("IDT CPS Gen.1 Serial RapidIO switch family driver");
+MODULE_AUTHOR("Integrated Device Technology, Inc.");
+MODULE_LICENSE("GPL");
diff --git a/drivers/rapidio/switches/tsi568.c b/drivers/rapidio/switches/tsi568.c
index 3994c00aa01f..8a43561b9d17 100644
--- a/drivers/rapidio/switches/tsi568.c
+++ b/drivers/rapidio/switches/tsi568.c
@@ -19,6 +19,7 @@
 #include <linux/rio_drv.h>
 #include <linux/rio_ids.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 #include "../rio.h"
 
 /* Global (broadcast) route registers */
@@ -129,18 +130,70 @@ tsi568_em_init(struct rio_dev *rdev)
 	return 0;
 }
 
-static int tsi568_switch_init(struct rio_dev *rdev, int do_enum)
+static struct rio_switch_ops tsi568_switch_ops = {
+	.owner = THIS_MODULE,
+	.add_entry = tsi568_route_add_entry,
+	.get_entry = tsi568_route_get_entry,
+	.clr_table = tsi568_route_clr_table,
+	.set_domain = NULL,
+	.get_domain = NULL,
+	.em_init = tsi568_em_init,
+	.em_handle = NULL,
+};
+
+static int tsi568_probe(struct rio_dev *rdev, const struct rio_device_id *id)
 {
 	pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev));
-	rdev->rswitch->add_entry = tsi568_route_add_entry;
-	rdev->rswitch->get_entry = tsi568_route_get_entry;
-	rdev->rswitch->clr_table = tsi568_route_clr_table;
-	rdev->rswitch->set_domain = NULL;
-	rdev->rswitch->get_domain = NULL;
-	rdev->rswitch->em_init = tsi568_em_init;
-	rdev->rswitch->em_handle = NULL;
 
+	spin_lock(&rdev->rswitch->lock);
+
+	if (rdev->rswitch->ops) {
+		spin_unlock(&rdev->rswitch->lock);
+		return -EINVAL;
+	}
+
+	rdev->rswitch->ops = &tsi568_switch_ops;
+	spin_unlock(&rdev->rswitch->lock);
 	return 0;
 }
 
-DECLARE_RIO_SWITCH_INIT(RIO_VID_TUNDRA, RIO_DID_TSI568, tsi568_switch_init);
+static void tsi568_remove(struct rio_dev *rdev)
+{
+	pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev));
+	spin_lock(&rdev->rswitch->lock);
+	if (rdev->rswitch->ops != &tsi568_switch_ops) {
+		spin_unlock(&rdev->rswitch->lock);
+		return;
+	}
+	rdev->rswitch->ops = NULL;
+	spin_unlock(&rdev->rswitch->lock);
+}
+
+static struct rio_device_id tsi568_id_table[] = {
+	{RIO_DEVICE(RIO_DID_TSI568, RIO_VID_TUNDRA)},
+	{ 0, }	/* terminate list */
+};
+
+static struct rio_driver tsi568_driver = {
+	.name = "tsi568",
+	.id_table = tsi568_id_table,
+	.probe = tsi568_probe,
+	.remove = tsi568_remove,
+};
+
+static int __init tsi568_init(void)
+{
+	return rio_register_driver(&tsi568_driver);
+}
+
+static void __exit tsi568_exit(void)
+{
+	rio_unregister_driver(&tsi568_driver);
+}
+
+device_initcall(tsi568_init);
+module_exit(tsi568_exit);
+
+MODULE_DESCRIPTION("IDT Tsi568 Serial RapidIO switch driver");
+MODULE_AUTHOR("Integrated Device Technology, Inc.");
+MODULE_LICENSE("GPL");
diff --git a/drivers/rapidio/switches/tsi57x.c b/drivers/rapidio/switches/tsi57x.c
index db8b8028988d..42c8b014fe15 100644
--- a/drivers/rapidio/switches/tsi57x.c
+++ b/drivers/rapidio/switches/tsi57x.c
@@ -19,6 +19,7 @@
 #include <linux/rio_drv.h>
 #include <linux/rio_ids.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 #include "../rio.h"
 
 /* Global (broadcast) route registers */
@@ -292,27 +293,79 @@ exit_es:
 	return 0;
 }
 
-static int tsi57x_switch_init(struct rio_dev *rdev, int do_enum)
+static struct rio_switch_ops tsi57x_switch_ops = {
+	.owner = THIS_MODULE,
+	.add_entry = tsi57x_route_add_entry,
+	.get_entry = tsi57x_route_get_entry,
+	.clr_table = tsi57x_route_clr_table,
+	.set_domain = tsi57x_set_domain,
+	.get_domain = tsi57x_get_domain,
+	.em_init = tsi57x_em_init,
+	.em_handle = tsi57x_em_handler,
+};
+
+static int tsi57x_probe(struct rio_dev *rdev, const struct rio_device_id *id)
 {
 	pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev));
-	rdev->rswitch->add_entry = tsi57x_route_add_entry;
-	rdev->rswitch->get_entry = tsi57x_route_get_entry;
-	rdev->rswitch->clr_table = tsi57x_route_clr_table;
-	rdev->rswitch->set_domain = tsi57x_set_domain;
-	rdev->rswitch->get_domain = tsi57x_get_domain;
-	rdev->rswitch->em_init = tsi57x_em_init;
-	rdev->rswitch->em_handle = tsi57x_em_handler;
-
-	if (do_enum) {
+
+	spin_lock(&rdev->rswitch->lock);
+
+	if (rdev->rswitch->ops) {
+		spin_unlock(&rdev->rswitch->lock);
+		return -EINVAL;
+	}
+	rdev->rswitch->ops = &tsi57x_switch_ops;
+
+	if (rdev->do_enum) {
 		/* Ensure that default routing is disabled on startup */
 		rio_write_config_32(rdev, RIO_STD_RTE_DEFAULT_PORT,
 				    RIO_INVALID_ROUTE);
 	}
 
+	spin_unlock(&rdev->rswitch->lock);
 	return 0;
 }
 
-DECLARE_RIO_SWITCH_INIT(RIO_VID_TUNDRA, RIO_DID_TSI572, tsi57x_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_TUNDRA, RIO_DID_TSI574, tsi57x_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_TUNDRA, RIO_DID_TSI577, tsi57x_switch_init);
-DECLARE_RIO_SWITCH_INIT(RIO_VID_TUNDRA, RIO_DID_TSI578, tsi57x_switch_init);
+static void tsi57x_remove(struct rio_dev *rdev)
+{
+	pr_debug("RIO: %s for %s\n", __func__, rio_name(rdev));
+	spin_lock(&rdev->rswitch->lock);
+	if (rdev->rswitch->ops != &tsi57x_switch_ops) {
+		spin_unlock(&rdev->rswitch->lock);
+		return;
+	}
+	rdev->rswitch->ops = NULL;
+	spin_unlock(&rdev->rswitch->lock);
+}
+
+static struct rio_device_id tsi57x_id_table[] = {
+	{RIO_DEVICE(RIO_DID_TSI572, RIO_VID_TUNDRA)},
+	{RIO_DEVICE(RIO_DID_TSI574, RIO_VID_TUNDRA)},
+	{RIO_DEVICE(RIO_DID_TSI577, RIO_VID_TUNDRA)},
+	{RIO_DEVICE(RIO_DID_TSI578, RIO_VID_TUNDRA)},
+	{ 0, }	/* terminate list */
+};
+
+static struct rio_driver tsi57x_driver = {
+	.name = "tsi57x",
+	.id_table = tsi57x_id_table,
+	.probe = tsi57x_probe,
+	.remove = tsi57x_remove,
+};
+
+static int __init tsi57x_init(void)
+{
+	return rio_register_driver(&tsi57x_driver);
+}
+
+static void __exit tsi57x_exit(void)
+{
+	rio_unregister_driver(&tsi57x_driver);
+}
+
+device_initcall(tsi57x_init);
+module_exit(tsi57x_exit);
+
+MODULE_DESCRIPTION("IDT Tsi57x Serial RapidIO switch family driver");
+MODULE_AUTHOR("Integrated Device Technology, Inc.");
+MODULE_LICENSE("GPL");
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 4f2737208c42..c74d88baea60 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -275,13 +275,6 @@
 		VMLINUX_SYMBOL(__end_builtin_fw) = .;			\
 	}								\
 									\
-	/* RapidIO route ops */						\
-	.rio_ops        : AT(ADDR(.rio_ops) - LOAD_OFFSET) {		\
-		VMLINUX_SYMBOL(__start_rio_switch_ops) = .;		\
-		*(.rio_switch_ops)					\
-		VMLINUX_SYMBOL(__end_rio_switch_ops) = .;		\
-	}								\
-									\
 	TRACEDATA							\
 									\
 	/* Kernel symbol table: Normal symbols */			\
diff --git a/include/linux/rio.h b/include/linux/rio.h
index 18e099342e6f..fcd492e7aff4 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -94,6 +94,23 @@ union rio_pw_msg;
  * @switchid: Switch ID that is unique across a network
  * @route_table: Copy of switch routing table
  * @port_ok: Status of each port (one bit per port) - OK=1 or UNINIT=0
+ * @ops: pointer to switch-specific operations
+ * @lock: lock to serialize operations updates
+ * @nextdev: Array of per-port pointers to the next attached device
+ */
+struct rio_switch {
+	struct list_head node;
+	u16 switchid;
+	u8 *route_table;
+	u32 port_ok;
+	struct rio_switch_ops *ops;
+	spinlock_t lock;
+	struct rio_dev *nextdev[0];
+};
+
+/**
+ * struct rio_switch_ops - Per-switch operations
+ * @owner: The module owner of this structure
  * @add_entry: Callback for switch-specific route add function
  * @get_entry: Callback for switch-specific route get function
  * @clr_table: Callback for switch-specific clear route table function
@@ -101,14 +118,12 @@ union rio_pw_msg;
  * @get_domain: Callback for switch-specific domain get function
  * @em_init: Callback for switch-specific error management init function
  * @em_handle: Callback for switch-specific error management handler function
- * @sw_sysfs: Callback that initializes switch-specific sysfs attributes
- * @nextdev: Array of per-port pointers to the next attached device
+ *
+ * Defines the operations that are necessary to initialize/control
+ * a particular RIO switch device.
  */
-struct rio_switch {
-	struct list_head node;
-	u16 switchid;
-	u8 *route_table;
-	u32 port_ok;
+struct rio_switch_ops {
+	struct module *owner;
 	int (*add_entry) (struct rio_mport *mport, u16 destid, u8 hopcount,
 			  u16 table, u16 route_destid, u8 route_port);
 	int (*get_entry) (struct rio_mport *mport, u16 destid, u8 hopcount,
@@ -121,8 +136,6 @@ struct rio_switch {
 			   u8 *sw_domain);
 	int (*em_init) (struct rio_dev *dev);
 	int (*em_handle) (struct rio_dev *dev, u8 swport);
-	int (*sw_sysfs) (struct rio_dev *dev, int create);
-	struct rio_dev *nextdev[0];
 };
 
 /**
@@ -130,6 +143,7 @@ struct rio_switch {
  * @global_list: Node in list of all RIO devices
  * @net_list: Node in list of RIO devices in a network
  * @net: Network this device is a part of
+ * @do_enum: Enumeration flag
  * @did: Device ID
  * @vid: Vendor ID
  * @device_rev: Device revision
@@ -158,6 +172,7 @@ struct rio_dev {
 	struct list_head global_list;	/* node in list of all RIO devices */
 	struct list_head net_list;	/* node in per net list */
 	struct rio_net *net;	/* RIO net this device resides in */
+	bool do_enum;
 	u16 did;
 	u16 vid;
 	u32 device_rev;
@@ -297,10 +312,6 @@ struct rio_net {
 	struct rio_id_table destid_table;  /* destID allocation table */
 };
 
-/* Definitions used by switch sysfs initialization callback */
-#define RIO_SW_SYSFS_CREATE	1	/* Create switch attributes */
-#define RIO_SW_SYSFS_REMOVE	0	/* Remove switch attributes */
-
 /* Low-level architecture-dependent routines */
 
 /**
@@ -400,20 +411,6 @@ struct rio_device_id {
 	u16 asm_did, asm_vid;
 };
 
-/**
- * struct rio_switch_ops - Per-switch operations
- * @vid: RIO vendor ID
- * @did: RIO device ID
- * @init_hook: Callback that performs switch device initialization
- *
- * Defines the operations that are necessary to initialize/control
- * a particular RIO switch device.
- */
-struct rio_switch_ops {
-	u16 vid, did;
-	int (*init_hook) (struct rio_dev *rdev, int do_enum);
-};
-
 union rio_pw_msg {
 	struct {
 		u32 comptag;	/* Component Tag CSR */
-- 
cgit v1.3-7-g2ca7


From 9edbc30b434f56258d03faac5daf37a555384db3 Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Wed, 3 Jul 2013 15:08:53 -0700
Subject: rapidio: update enumerator registration mechanism

Update enumeration/discovery method registration mechanism to allow
loading enumeration/discovery methods before all mports are registered.

Existing statically linked RapidIO subsystem expects that all available
RapidIO mport devices are initialized and registered before the
enumeration/discovery method is registered.  Switching to loadable mport
device drivers creates situation when mport device driver can be loaded
after enumeration/discovery method is attached (e.g., loadable mport
driver in a system with statically linked RapidIO core and enumerator).
This also will happen in a system with hot-pluggable RapidIO controllers.

To remove the dependency on the initialization/registration order this
patch introduces enumeration/discovery registration mechanism that
supports arbitrary registration order of mports and enumerator/discovery
methods.

The following registration rules are implemented:
- only one enumeration/discovery method can be registered for given mport ID
  (including RIO_MPORT_ANY);
- when new enumeration/discovery methods tries to attach to the registered mport
  device, method with matching mport ID will replace a default method previously
  registered for given mport (if any);
- enumeration/discovery method with target ID=RIO_MPORT_ANY will be attached
  only to mports that do not have another enumerator attached to them;
- when new mport device is registered with RapidIO subsystem, registration
  routine searches for the enumeration/discovery method with the best matching
  mport ID;

Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Li Yang <leoli@freescale.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Andre van Herk <andre.van.herk@Prodrive.nl>
Cc: Micha Nelissen <micha.nelissen@Prodrive.nl>
Cc: Stef van Os <stef.van.os@Prodrive.nl>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rapidio/rio-scan.c  |   1 +
 drivers/rapidio/rio-sysfs.c |  17 +----
 drivers/rapidio/rio.c       | 176 ++++++++++++++++++++++++++++++++++++--------
 drivers/rapidio/rio.h       |   3 +-
 include/linux/rio.h         |  15 ++++
 5 files changed, 164 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index 913950212605..ab837fdb2383 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -1162,6 +1162,7 @@ bail:
 }
 
 static struct rio_scan rio_scan_ops = {
+	.owner = THIS_MODULE,
 	.enumerate = rio_enum_mport,
 	.discover = rio_disc_mport,
 };
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index 864e52f193e1..0c4473e54f86 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -286,7 +286,6 @@ static ssize_t bus_scan_store(struct bus_type *bus, const char *buf,
 				size_t count)
 {
 	long val;
-	struct rio_mport *port = NULL;
 	int rc;
 
 	if (kstrtol(buf, 0, &val) < 0)
@@ -300,21 +299,7 @@ static ssize_t bus_scan_store(struct bus_type *bus, const char *buf,
 	if (val < 0 || val >= RIO_MAX_MPORTS)
 		return -EINVAL;
 
-	port = rio_find_mport((int)val);
-
-	if (!port) {
-		pr_debug("RIO: %s: mport_%d not available\n",
-			 __func__, (int)val);
-		return -EINVAL;
-	}
-
-	if (!port->nscan)
-		return -EINVAL;
-
-	if (port->host_deviceid >= 0)
-		rc = port->nscan->enumerate(port, 0);
-	else
-		rc = port->nscan->discover(port, RIO_SCAN_ENUM_NO_WAIT);
+	rc = rio_mport_scan((int)val);
 exit:
 	if (!rc)
 		rc = count;
diff --git a/drivers/rapidio/rio.c b/drivers/rapidio/rio.c
index b17d5218005e..5eb727cd0652 100644
--- a/drivers/rapidio/rio.c
+++ b/drivers/rapidio/rio.c
@@ -34,6 +34,7 @@ static LIST_HEAD(rio_devices);
 static DEFINE_SPINLOCK(rio_global_list_lock);
 
 static LIST_HEAD(rio_mports);
+static LIST_HEAD(rio_scans);
 static DEFINE_MUTEX(rio_mport_list_lock);
 static unsigned char next_portid;
 static DEFINE_SPINLOCK(rio_mmap_lock);
@@ -1602,34 +1603,73 @@ found:
  * rio_register_scan - enumeration/discovery method registration interface
  * @mport_id: mport device ID for which fabric scan routine has to be set
  *            (RIO_MPORT_ANY = set for all available mports)
- * @scan_ops: enumeration/discovery control structure
+ * @scan_ops: enumeration/discovery operations structure
+ *
+ * Registers enumeration/discovery operations with RapidIO subsystem and
+ * attaches it to the specified mport device (or all available mports
+ * if RIO_MPORT_ANY is specified).
  *
- * Assigns enumeration or discovery method to the specified mport device (or all
- * available mports if RIO_MPORT_ANY is specified).
  * Returns error if the mport already has an enumerator attached to it.
- * In case of RIO_MPORT_ANY ignores ports with valid scan routines and returns
- * an error if was unable to find at least one available mport.
+ * In case of RIO_MPORT_ANY skips mports with valid scan routines (no error).
  */
 int rio_register_scan(int mport_id, struct rio_scan *scan_ops)
 {
 	struct rio_mport *port;
-	int rc = -EBUSY;
+	struct rio_scan_node *scan;
+	int rc = 0;
 
-	mutex_lock(&rio_mport_list_lock);
-	list_for_each_entry(port, &rio_mports, node) {
-		if (port->id == mport_id || mport_id == RIO_MPORT_ANY) {
-			if (port->nscan && mport_id == RIO_MPORT_ANY)
-				continue;
-			else if (port->nscan)
-				break;
+	pr_debug("RIO: %s for mport_id=%d\n", __func__, mport_id);
 
-			port->nscan = scan_ops;
-			rc = 0;
+	if ((mport_id != RIO_MPORT_ANY && mport_id >= RIO_MAX_MPORTS) ||
+	    !scan_ops)
+		return -EINVAL;
 
-			if (mport_id != RIO_MPORT_ANY)
-				break;
+	mutex_lock(&rio_mport_list_lock);
+
+	/*
+	 * Check if there is another enumerator already registered for
+	 * the same mport ID (including RIO_MPORT_ANY). Multiple enumerators
+	 * for the same mport ID are not supported.
+	 */
+	list_for_each_entry(scan, &rio_scans, node) {
+		if (scan->mport_id == mport_id) {
+			rc = -EBUSY;
+			goto err_out;
 		}
 	}
+
+	/*
+	 * Allocate and initialize new scan registration node.
+	 */
+	scan = kzalloc(sizeof(*scan), GFP_KERNEL);
+	if (!scan) {
+		rc = -ENOMEM;
+		goto err_out;
+	}
+
+	scan->mport_id = mport_id;
+	scan->ops = scan_ops;
+
+	/*
+	 * Traverse the list of registered mports to attach this new scan.
+	 *
+	 * The new scan with matching mport ID overrides any previously attached
+	 * scan assuming that old scan (if any) is the default one (based on the
+	 * enumerator registration check above).
+	 * If the new scan is the global one, it will be attached only to mports
+	 * that do not have their own individual operations already attached.
+	 */
+	list_for_each_entry(port, &rio_mports, node) {
+		if (port->id == mport_id) {
+			port->nscan = scan_ops;
+			break;
+		} else if (mport_id == RIO_MPORT_ANY && !port->nscan)
+			port->nscan = scan_ops;
+	}
+
+	list_add_tail(&scan->node, &rio_scans);
+
+err_out:
 	mutex_unlock(&rio_mport_list_lock);
 
 	return rc;
@@ -1639,30 +1679,81 @@ EXPORT_SYMBOL_GPL(rio_register_scan);
 /**
  * rio_unregister_scan - removes enumeration/discovery method from mport
  * @mport_id: mport device ID for which fabric scan routine has to be
- *            unregistered (RIO_MPORT_ANY = set for all available mports)
+ *            unregistered (RIO_MPORT_ANY = apply to all mports that use
+ *            the specified scan_ops)
+ * @scan_ops: enumeration/discovery operations structure
  *
  * Removes enumeration or discovery method assigned to the specified mport
- * device (or all available mports if RIO_MPORT_ANY is specified).
+ * device. If RIO_MPORT_ANY is specified, removes the specified operations from
+ * all mports that have them attached.
  */
-int rio_unregister_scan(int mport_id)
+int rio_unregister_scan(int mport_id, struct rio_scan *scan_ops)
 {
 	struct rio_mport *port;
+	struct rio_scan_node *scan;
+
+	pr_debug("RIO: %s for mport_id=%d\n", __func__, mport_id);
+
+	if (mport_id != RIO_MPORT_ANY && mport_id >= RIO_MAX_MPORTS)
+		return -EINVAL;
 
 	mutex_lock(&rio_mport_list_lock);
-	list_for_each_entry(port, &rio_mports, node) {
-		if (port->id == mport_id || mport_id == RIO_MPORT_ANY) {
-			if (port->nscan)
-				port->nscan = NULL;
-			if (mport_id != RIO_MPORT_ANY)
-				break;
+
+	list_for_each_entry(port, &rio_mports, node)
+		if (port->id == mport_id ||
+		    (mport_id == RIO_MPORT_ANY && port->nscan == scan_ops))
+			port->nscan = NULL;
+
+	list_for_each_entry(scan, &rio_scans, node)
+		if (scan->mport_id == mport_id) {
+			list_del(&scan->node);
+			kfree(scan);
 		}
-	}
+
 	mutex_unlock(&rio_mport_list_lock);
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(rio_unregister_scan);
 
+/**
+ * rio_mport_scan - execute enumeration/discovery on the specified mport
+ * @mport_id: number (ID) of mport device
+ */
+int rio_mport_scan(int mport_id)
+{
+	struct rio_mport *port = NULL;
+	int rc;
+
+	mutex_lock(&rio_mport_list_lock);
+	list_for_each_entry(port, &rio_mports, node) {
+		if (port->id == mport_id)
+			goto found;
+	}
+	mutex_unlock(&rio_mport_list_lock);
+	return -ENODEV;
+found:
+	if (!port->nscan) {
+		mutex_unlock(&rio_mport_list_lock);
+		return -EINVAL;
+	}
+
+	if (!try_module_get(port->nscan->owner)) {
+		mutex_unlock(&rio_mport_list_lock);
+		return -ENODEV;
+	}
+
+	mutex_unlock(&rio_mport_list_lock);
+
+	if (port->host_deviceid >= 0)
+		rc = port->nscan->enumerate(port, 0);
+	else
+		rc = port->nscan->discover(port, RIO_SCAN_ENUM_NO_WAIT);
+
+	module_put(port->nscan->owner);
+	return rc;
+}
+
 static void rio_fixup_device(struct rio_dev *dev)
 {
 }
@@ -1691,7 +1782,10 @@ static void disc_work_handler(struct work_struct *_work)
 	work = container_of(_work, struct rio_disc_work, work);
 	pr_debug("RIO: discovery work for mport %d %s\n",
 		 work->mport->id, work->mport->name);
-	work->mport->nscan->discover(work->mport, 0);
+	if (try_module_get(work->mport->nscan->owner)) {
+		work->mport->nscan->discover(work->mport, 0);
+		module_put(work->mport->nscan->owner);
+	}
 }
 
 int rio_init_mports(void)
@@ -1710,8 +1804,10 @@ int rio_init_mports(void)
 	mutex_lock(&rio_mport_list_lock);
 	list_for_each_entry(port, &rio_mports, node) {
 		if (port->host_deviceid >= 0) {
-			if (port->nscan)
+			if (port->nscan && try_module_get(port->nscan->owner)) {
 				port->nscan->enumerate(port, 0);
+				module_put(port->nscan->owner);
+			}
 		} else
 			n++;
 	}
@@ -1725,7 +1821,7 @@ int rio_init_mports(void)
 	 * for each of them. If the code below fails to allocate needed
 	 * resources, exit without error to keep results of enumeration
 	 * process (if any).
-	 * TODO: Implement restart of dicovery process for all or
+	 * TODO: Implement restart of discovery process for all or
 	 * individual discovering mports.
 	 */
 	rio_wq = alloc_workqueue("riodisc", 0, 0);
@@ -1751,9 +1847,9 @@ int rio_init_mports(void)
 			n++;
 		}
 	}
-	mutex_unlock(&rio_mport_list_lock);
 
 	flush_workqueue(rio_wq);
+	mutex_unlock(&rio_mport_list_lock);
 	pr_debug("RIO: destroy discovery workqueue\n");
 	destroy_workqueue(rio_wq);
 	kfree(work);
@@ -1784,6 +1880,8 @@ __setup("riohdid=", rio_hdid_setup);
 
 int rio_register_mport(struct rio_mport *port)
 {
+	struct rio_scan_node *scan = NULL;
+
 	if (next_portid >= RIO_MAX_MPORTS) {
 		pr_err("RIO: reached specified max number of mports\n");
 		return 1;
@@ -1792,9 +1890,25 @@ int rio_register_mport(struct rio_mport *port)
 	port->id = next_portid++;
 	port->host_deviceid = rio_get_hdid(port->id);
 	port->nscan = NULL;
+
 	mutex_lock(&rio_mport_list_lock);
 	list_add_tail(&port->node, &rio_mports);
+
+	/*
+	 * Check if there are any registered enumeration/discovery operations
+	 * that have to be attached to the added mport.
+	 */
+	list_for_each_entry(scan, &rio_scans, node) {
+		if (port->id == scan->mport_id ||
+		    scan->mport_id == RIO_MPORT_ANY) {
+			port->nscan = scan->ops;
+			if (port->id == scan->mport_id)
+				break;
+		}
+	}
 	mutex_unlock(&rio_mport_list_lock);
+
+	pr_debug("RIO: %s %s id=%d\n", __func__, port->name, port->id);
 	return 0;
 }
 
diff --git a/drivers/rapidio/rio.h b/drivers/rapidio/rio.h
index d59587762c76..085215cd8502 100644
--- a/drivers/rapidio/rio.h
+++ b/drivers/rapidio/rio.h
@@ -42,9 +42,10 @@ extern int rio_add_device(struct rio_dev *rdev);
 extern int rio_enable_rx_tx_port(struct rio_mport *port, int local, u16 destid,
 				 u8 hopcount, u8 port_num);
 extern int rio_register_scan(int mport_id, struct rio_scan *scan_ops);
-extern int rio_unregister_scan(int mport_id);
+extern int rio_unregister_scan(int mport_id, struct rio_scan *scan_ops);
 extern void rio_attach_device(struct rio_dev *rdev);
 extern struct rio_mport *rio_find_mport(int mport_id);
+extern int rio_mport_scan(int mport_id);
 
 /* Structures internal to the RIO core code */
 extern struct device_attribute rio_dev_attrs[];
diff --git a/include/linux/rio.h b/include/linux/rio.h
index fcd492e7aff4..8d3db1b7b998 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -465,14 +465,29 @@ static inline struct rio_mport *dma_to_mport(struct dma_device *ddev)
 
 /**
  * struct rio_scan - RIO enumeration and discovery operations
+ * @owner: The module owner of this structure
  * @enumerate: Callback to perform RapidIO fabric enumeration.
  * @discover: Callback to perform RapidIO fabric discovery.
  */
 struct rio_scan {
+	struct module *owner;
 	int (*enumerate)(struct rio_mport *mport, u32 flags);
 	int (*discover)(struct rio_mport *mport, u32 flags);
 };
 
+/**
+ * struct rio_scan_node - list node to register RapidIO enumeration and
+ * discovery methods with RapidIO core.
+ * @mport_id: ID of an mport (net) serviced by this enumerator
+ * @node: node in global list of registered enumerators
+ * @ops: RIO enumeration and discovery operations
+ */
+struct rio_scan_node {
+	int mport_id;
+	struct list_head node;
+	struct rio_scan *ops;
+};
+
 /* Architecture and hardware-specific functions */
 extern int rio_register_mport(struct rio_mport *);
 extern int rio_open_inb_mbox(struct rio_mport *, void *, int, int);
-- 
cgit v1.3-7-g2ca7


From 3bdbb62fe97537252b22f700009863eeb51aa750 Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Wed, 3 Jul 2013 15:08:58 -0700
Subject: rapidio: add udev notification

Add RapidIO-specific modalias generation to enable udev notifications
about RapidIO-specific events.

The RapidIO modalias string format is shown below:

"rapidio:vNNNNdNNNNavNNNNadNNNN"

Where:
v  - Device Vendor ID (16 bit),
d  - Device ID (16 bit),
av - Assembly Vendor ID (16 bit),
ad - Assembly ID (16 bit),

as they are reported in corresponding Capability Registers (CARs)
of each RapidIO device.

Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Li Yang <leoli@freescale.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Andre van Herk <andre.van.herk@Prodrive.nl>
Cc: Micha Nelissen <micha.nelissen@Prodrive.nl>
Cc: Stef van Os <stef.van.os@Prodrive.nl>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rapidio/rio-driver.c      | 18 ++++++++++++++++++
 drivers/rapidio/rio-sysfs.c       | 10 ++++++++++
 include/linux/mod_devicetable.h   | 19 +++++++++++++++++++
 include/linux/rio.h               | 16 +---------------
 include/linux/rio_ids.h           |  2 --
 scripts/mod/devicetable-offsets.c |  6 ++++++
 scripts/mod/file2alias.c          | 20 ++++++++++++++++++++
 7 files changed, 74 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rapidio/rio-driver.c b/drivers/rapidio/rio-driver.c
index a0c875563d76..3e9b6a78ad18 100644
--- a/drivers/rapidio/rio-driver.c
+++ b/drivers/rapidio/rio-driver.c
@@ -199,6 +199,23 @@ static int rio_match_bus(struct device *dev, struct device_driver *drv)
       out:return 0;
 }
 
+static int rio_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	struct rio_dev *rdev;
+
+	if (!dev)
+		return -ENODEV;
+
+	rdev = to_rio_dev(dev);
+	if (!rdev)
+		return -ENODEV;
+
+	if (add_uevent_var(env, "MODALIAS=rapidio:v%04Xd%04Xav%04Xad%04X",
+			   rdev->vid, rdev->did, rdev->asm_vid, rdev->asm_did))
+		return -ENOMEM;
+	return 0;
+}
+
 struct device rio_bus = {
 	.init_name = "rapidio",
 };
@@ -210,6 +227,7 @@ struct bus_type rio_bus_type = {
 	.bus_attrs = rio_bus_attrs,
 	.probe = rio_device_probe,
 	.remove = rio_device_remove,
+	.uevent	= rio_uevent,
 };
 
 /**
diff --git a/drivers/rapidio/rio-sysfs.c b/drivers/rapidio/rio-sysfs.c
index 0c4473e54f86..9331be646dc3 100644
--- a/drivers/rapidio/rio-sysfs.c
+++ b/drivers/rapidio/rio-sysfs.c
@@ -84,6 +84,15 @@ static ssize_t lnext_show(struct device *dev,
 	return str - buf;
 }
 
+static ssize_t modalias_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rio_dev *rdev = to_rio_dev(dev);
+
+	return sprintf(buf, "rapidio:v%04Xd%04Xav%04Xad%04X\n",
+		       rdev->vid, rdev->did, rdev->asm_vid, rdev->asm_did);
+}
+
 struct device_attribute rio_dev_attrs[] = {
 	__ATTR_RO(did),
 	__ATTR_RO(vid),
@@ -93,6 +102,7 @@ struct device_attribute rio_dev_attrs[] = {
 	__ATTR_RO(asm_rev),
 	__ATTR_RO(lprev),
 	__ATTR_RO(destid),
+	__ATTR_RO(modalias),
 	__ATTR_NULL,
 };
 
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index b3bd7e737e8b..b62d4af6c667 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -579,4 +579,23 @@ struct mei_cl_device_id {
 	kernel_ulong_t driver_info;
 };
 
+/* RapidIO */
+
+#define RIO_ANY_ID	0xffff
+
+/**
+ * struct rio_device_id - RIO device identifier
+ * @did: RapidIO device ID
+ * @vid: RapidIO vendor ID
+ * @asm_did: RapidIO assembly device ID
+ * @asm_vid: RapidIO assembly vendor ID
+ *
+ * Identifies a RapidIO device based on both the device/vendor IDs and
+ * the assembly device/vendor IDs.
+ */
+struct rio_device_id {
+	__u16 did, vid;
+	__u16 asm_did, asm_vid;
+};
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/linux/rio.h b/include/linux/rio.h
index 8d3db1b7b998..e2faf7b8fe80 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -20,6 +20,7 @@
 #include <linux/errno.h>
 #include <linux/device.h>
 #include <linux/rio_regs.h>
+#include <linux/mod_devicetable.h>
 #ifdef CONFIG_RAPIDIO_DMA_ENGINE
 #include <linux/dmaengine.h>
 #endif
@@ -396,21 +397,6 @@ struct rio_driver {
 
 #define	to_rio_driver(drv) container_of(drv,struct rio_driver, driver)
 
-/**
- * struct rio_device_id - RIO device identifier
- * @did: RIO device ID
- * @vid: RIO vendor ID
- * @asm_did: RIO assembly device ID
- * @asm_vid: RIO assembly vendor ID
- *
- * Identifies a RIO device based on both the device/vendor IDs and
- * the assembly device/vendor IDs.
- */
-struct rio_device_id {
-	u16 did, vid;
-	u16 asm_did, asm_vid;
-};
-
 union rio_pw_msg {
 	struct {
 		u32 comptag;	/* Component Tag CSR */
diff --git a/include/linux/rio_ids.h b/include/linux/rio_ids.h
index b66d13d1bdc0..2543bc163d54 100644
--- a/include/linux/rio_ids.h
+++ b/include/linux/rio_ids.h
@@ -13,8 +13,6 @@
 #ifndef LINUX_RIO_IDS_H
 #define LINUX_RIO_IDS_H
 
-#define RIO_ANY_ID			0xffff
-
 #define RIO_VID_FREESCALE		0x0002
 #define RIO_DID_MPC8560			0x0003
 
diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c
index e66d4d258e1a..bb5d115ca671 100644
--- a/scripts/mod/devicetable-offsets.c
+++ b/scripts/mod/devicetable-offsets.c
@@ -177,5 +177,11 @@ int main(void)
 	DEVID(mei_cl_device_id);
 	DEVID_FIELD(mei_cl_device_id, name);
 
+	DEVID(rio_device_id);
+	DEVID_FIELD(rio_device_id, did);
+	DEVID_FIELD(rio_device_id, vid);
+	DEVID_FIELD(rio_device_id, asm_did);
+	DEVID_FIELD(rio_device_id, asm_vid);
+
 	return 0;
 }
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 45f9a3377dcd..d9e67b719f08 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -1145,6 +1145,26 @@ static int do_mei_entry(const char *filename, void *symval,
 }
 ADD_TO_DEVTABLE("mei", mei_cl_device_id, do_mei_entry);
 
+/* Looks like: rapidio:vNdNavNadN */
+static int do_rio_entry(const char *filename,
+			void *symval, char *alias)
+{
+	DEF_FIELD(symval, rio_device_id, did);
+	DEF_FIELD(symval, rio_device_id, vid);
+	DEF_FIELD(symval, rio_device_id, asm_did);
+	DEF_FIELD(symval, rio_device_id, asm_vid);
+
+	strcpy(alias, "rapidio:");
+	ADD(alias, "v", vid != RIO_ANY_ID, vid);
+	ADD(alias, "d", did != RIO_ANY_ID, did);
+	ADD(alias, "av", asm_vid != RIO_ANY_ID, asm_vid);
+	ADD(alias, "ad", asm_did != RIO_ANY_ID, asm_did);
+
+	add_wildcard(alias);
+	return 1;
+}
+ADD_TO_DEVTABLE("rapidio", rio_device_id, do_rio_entry);
+
 /* Does namelen bytes of name exactly match the symbol? */
 static bool sym_is(const char *name, unsigned namelen, const char *symbol)
 {
-- 
cgit v1.3-7-g2ca7


From 6ca40c2565fc617534d20d10a5848b626213608c Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Wed, 3 Jul 2013 15:09:01 -0700
Subject: rapidio: change endpoint device name format

Change endpoint device name format to use a component tag value instead of
device destination ID.

RapidIO specification defines a component tag to be a unique identifier
for devices in a network.  RapidIO switches already use component tag as
part of their device name and also use it for device identification when
processing error management event notifications.

Forming an endpoint's device name using its component tag instead of
destination ID allows to keep sysfs device directories unchanged in case
if a routing process dynamically changes endpoint's destination ID as a
result of route optimization.

This change should not affect any existing users because a valid device
destination ID always should be obtained by reading "destid" attribute and
not by parsing device name.

This patch also removes switchid member from struct rio_switch because it
simply duplicates the component tag and does not have other use than in
device name generation.

Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Li Yang <leoli@freescale.com>
Cc: Kumar Gala <galak@kernel.crashing.org>
Cc: Andre van Herk <andre.van.herk@Prodrive.nl>
Cc: Micha Nelissen <micha.nelissen@Prodrive.nl>
Cc: Stef van Os <stef.van.os@Prodrive.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/rapidio/rio-scan.c | 5 ++---
 include/linux/rio.h        | 2 --
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c
index ab837fdb2383..d3a6539a77cc 100644
--- a/drivers/rapidio/rio-scan.c
+++ b/drivers/rapidio/rio-scan.c
@@ -433,7 +433,6 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
 	/* If a PE has both switch and other functions, show it as a switch */
 	if (rio_is_switch(rdev)) {
 		rswitch = rdev->rswitch;
-		rswitch->switchid = rdev->comp_tag & RIO_CTAG_UDEVID;
 		rswitch->port_ok = 0;
 		spin_lock_init(&rswitch->lock);
 		rswitch->route_table = kzalloc(sizeof(u8)*
@@ -446,7 +445,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
 				rdid++)
 			rswitch->route_table[rdid] = RIO_INVALID_ROUTE;
 		dev_set_name(&rdev->dev, "%02x:s:%04x", rdev->net->id,
-			     rswitch->switchid);
+			     rdev->comp_tag & RIO_CTAG_UDEVID);
 
 		if (do_enum)
 			rio_route_clr_table(rdev, RIO_GLOBAL_TABLE, 0);
@@ -459,7 +458,7 @@ static struct rio_dev *rio_setup_device(struct rio_net *net,
 			rio_enable_rx_tx_port(port, 0, destid, hopcount, 0);
 
 		dev_set_name(&rdev->dev, "%02x:e:%04x", rdev->net->id,
-			     rdev->destid);
+			     rdev->comp_tag & RIO_CTAG_UDEVID);
 	}
 
 	rio_attach_device(rdev);
diff --git a/include/linux/rio.h b/include/linux/rio.h
index e2faf7b8fe80..b71d5738e683 100644
--- a/include/linux/rio.h
+++ b/include/linux/rio.h
@@ -92,7 +92,6 @@ union rio_pw_msg;
 /**
  * struct rio_switch - RIO switch info
  * @node: Node in global list of switches
- * @switchid: Switch ID that is unique across a network
  * @route_table: Copy of switch routing table
  * @port_ok: Status of each port (one bit per port) - OK=1 or UNINIT=0
  * @ops: pointer to switch-specific operations
@@ -101,7 +100,6 @@ union rio_pw_msg;
  */
 struct rio_switch {
 	struct list_head node;
-	u16 switchid;
 	u8 *route_table;
 	u32 port_ok;
 	struct rio_switch_ops *ops;
-- 
cgit v1.3-7-g2ca7


From c378f70adbc1bbecd9e6db145019f14b2f688c7c Mon Sep 17 00:00:00 2001
From: Paul Clements <paul.clements@steeleye.com>
Date: Wed, 3 Jul 2013 15:09:04 -0700
Subject: nbd: correct disconnect behavior

Currently, when a disconnect is requested by the user (via NBD_DISCONNECT
ioctl) the return from NBD_DO_IT is undefined (it is usually one of
several error codes).  This means that nbd-client does not know if a
manual disconnect was performed or whether a network error occurred.
Because of this, nbd-client's persist mode (which tries to reconnect after
error, but not after manual disconnect) does not always work correctly.

This change fixes this by causing NBD_DO_IT to always return 0 if a user
requests a disconnect.  This means that nbd-client can correctly either
persist the connection (if an error occurred) or disconnect (if the user
requested it).

Signed-off-by: Paul Clements <paul.clements@steeleye.com>
Acked-by: Rob Landley <rob@landley.net>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/nbd.c | 7 ++++++-
 include/linux/nbd.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7ed1308c42d3..2dc3b5153f0d 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -623,8 +623,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		if (!nbd->sock)
 			return -EINVAL;
 
+		nbd->disconnect = 1;
+
 		nbd_send_req(nbd, &sreq);
-                return 0;
+		return 0;
 	}
  
 	case NBD_CLEAR_SOCK: {
@@ -654,6 +656,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 				nbd->sock = SOCKET_I(inode);
 				if (max_part > 0)
 					bdev->bd_invalidated = 1;
+				nbd->disconnect = 0; /* we're connected now */
 				return 0;
 			} else {
 				fput(file);
@@ -743,6 +746,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		set_capacity(nbd->disk, 0);
 		if (max_part > 0)
 			ioctl_by_bdev(bdev, BLKRRPART, 0);
+		if (nbd->disconnect) /* user requested, ignore socket errors */
+			return 0;
 		return nbd->harderror;
 	}
 
diff --git a/include/linux/nbd.h b/include/linux/nbd.h
index 4871170a04a0..ae4981ebd18e 100644
--- a/include/linux/nbd.h
+++ b/include/linux/nbd.h
@@ -41,6 +41,7 @@ struct nbd_device {
 	u64 bytesize;
 	pid_t pid; /* pid of nbd-client, if attached */
 	int xmit_timeout;
+	int disconnect; /* a disconnect has been requested by user */
 };
 
 #endif
-- 
cgit v1.3-7-g2ca7