From b595076a180a56d1bb170e6eceda6eb9d76f4cd3 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon, 1 Nov 2010 15:38:34 -0400
Subject: tree-wide: fix comment/printk typos
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"gadget", "through", "command", "maintain", "maintain", "controller", "address",
"between", "initiali[zs]e", "instead", "function", "select", "already",
"equal", "access", "management", "hierarchy", "registration", "interest",
"relative", "memory", "offset", "already",

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/debug/kdb/kdb_main.c  | 2 +-
 kernel/kexec.c               | 2 +-
 kernel/power/swap.c          | 2 +-
 kernel/sched.c               | 2 +-
 kernel/sysctl_binary.c       | 2 +-
 kernel/time/clocksource.c    | 2 +-
 kernel/trace/trace_entries.h | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 37755d621924..7242cc71bb7e 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2913,7 +2913,7 @@ static void __init kdb_cmd_init(void)
 	}
 }
 
-/* Intialize kdb_printf, breakpoint tables and kdb state */
+/* Initialize kdb_printf, breakpoint tables and kdb state */
 void __init kdb_init(int lvl)
 {
 	static int kdb_init_lvl = KDB_NOT_INITIALIZED;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b55045bc7563..ec19b92c7ebd 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	 * just verifies it is an address we can use.
 	 *
 	 * Since the kernel does everything in page size chunks ensure
-	 * the destination addreses are page aligned.  Too many
+	 * the destination addresses are page aligned.  Too many
 	 * special cases crop of when we don't do this.  The most
 	 * insidious is getting overlapping destination addresses
 	 * simply because addresses are changed to page size
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0e4a86ccf94..cd09c22de03d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -865,7 +865,7 @@ out_finish:
 /**
  *	swsusp_read - read the hibernation image.
  *	@flags_p: flags passed by the "frozen" kernel in the image header should
- *		  be written into this memeory location
+ *		  be written into this memory location
  */
 
 int swsusp_read(unsigned int *flags_p)
diff --git a/kernel/sched.c b/kernel/sched.c
index aa14a56f9d03..554c0d6c489e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2568,7 +2568,7 @@ out:
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
- * Put @p on the run-queue if it's not alredy there.  The caller must
+ * Put @p on the run-queue if it's not already there.  The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
  * the current task.  this_rq() stays locked over invocation.
  */
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c5786064..d9c5fe4ff1b2 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1193,7 +1193,7 @@ static ssize_t bin_dn_node_address(struct file *file,
 
 		buf[result] = '\0';
 
-		/* Convert the decnet addresss to binary */
+		/* Convert the decnet address to binary */
 		result = -EIO;
 		nodep = strchr(buf, '.') + 1;
 		if (!nodep)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..ddbbaf2950ef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -678,7 +678,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
 int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
 {
 
-	/* Intialize mult/shift and max_idle_ns */
+	/* Initialize mult/shift and max_idle_ns */
 	__clocksource_updatefreq_scale(cs, scale, freq);
 
 	/* Add clocksource to the clcoksource list */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index e3dfecaf13e6..6cf223764be8 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -53,7 +53,7 @@
  */
 
 /*
- * Function trace entry - function address and parent function addres:
+ * Function trace entry - function address and parent function address:
  */
 FTRACE_ENTRY(function, ftrace_entry,
 
-- 
cgit v1.3-8-gc7d7


From 9db3b9bcc7f53487da8766b32e2d790ad03c53b9 Mon Sep 17 00:00:00 2001
From: Ross Kirk <Ross.Kirk@nexor.com>
Date: Fri, 22 Oct 2010 16:43:17 +0100
Subject: audit: error message typo correction

Fixes a typo in the error message raised by audit when auditd has died.

Signed-off-by: Ross Kirk <ross.kirk@nexor.com>

--
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 77770a034d59..e4956244ae50 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
 	if (err < 0) {
 		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
 		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
-		audit_log_lost("auditd dissapeared\n");
+		audit_log_lost("auditd disappeared\n");
 		audit_pid = 0;
 		/* we might get lucky and get this in the next auditd */
 		audit_hold_skb(skb);
-- 
cgit v1.3-8-gc7d7


From e525fd89d380c4a94c0d63913a1dd1a593ed25e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:17 +0100
Subject: block: make blkdev_get/put() handle exclusive access

Over time, block layer has accumulated a set of APIs dealing with bdev
open, close, claim and release.

* blkdev_get/put() are the primary open and close functions.

* bd_claim/release() deal with exclusive open.

* open/close_bdev_exclusive() are combination of open and claim and
  the other way around, respectively.

* bd_link/unlink_disk_holder() to create and remove holder/slave
  symlinks.

* open_by_devnum() wraps bdget() + blkdev_get().

The interface is a bit confusing and the decoupling of open and claim
makes it impossible to properly guarantee exclusive access as
in-kernel open + claim sequence can disturb the existing exclusive
open even before the block layer knows the current open if for another
exclusive access.  Reorganize the interface such that,

* blkdev_get() is extended to include exclusive access management.
  @holder argument is added and, if is @FMODE_EXCL specified, it will
  gain exclusive access atomically w.r.t. other exclusive accesses.

* blkdev_put() is similarly extended.  It now takes @mode argument and
  if @FMODE_EXCL is set, it releases an exclusive access.  Also, when
  the last exclusive claim is released, the holder/slave symlinks are
  removed automatically.

* bd_claim/release() and close_bdev_exclusive() are no longer
  necessary and either made static or removed.

* bd_link_disk_holder() remains the same but bd_unlink_disk_holder()
  is no longer necessary and removed.

* open_bdev_exclusive() becomes a simple wrapper around lookup_bdev()
  and blkdev_get().  It also has an unexpected extra bdev_read_only()
  test which probably should be moved into blkdev_get().

* open_by_devnum() is modified to take @holder argument and pass it to
  blkdev_get().

Most of bdev open/close operations are unified into blkdev_get/put()
and most exclusive accesses are tested atomically at the open time (as
it should).  This cleans up code and removes some, both valid and
invalid, but unnecessary all the same, corner cases.

open_bdev_exclusive() and open_by_devnum() can use further cleanup -
rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop
special features.  Well, let's leave them for another day.

Most conversions are straight-forward.  drbd conversion is a bit more
involved as there was some reordering, but the logic should stay the
same.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Neil Brown <neilb@suse.de>
Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Peter Osterlund <petero2@telia.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <joel.becker@oracle.com>
Cc: Alex Elder <aelder@sgi.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: dm-devel@redhat.com
Cc: drbd-dev@lists.linbit.com
Cc: Leo Chen <leochen@broadcom.com>
Cc: Scott Branden <sbranden@broadcom.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Cc: Joern Engel <joern@logfs.org>
Cc: reiserfs-devel@vger.kernel.org
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 block/ioctl.c                   |   5 +-
 drivers/block/drbd/drbd_int.h   |   2 -
 drivers/block/drbd/drbd_main.c  |   7 +-
 drivers/block/drbd/drbd_nl.c    | 103 ++++++++++-----------------
 drivers/block/pktcdvd.c         |  22 +++---
 drivers/char/raw.c              |  14 ++--
 drivers/md/dm-table.c           |  15 +---
 drivers/md/md.c                 |  14 +---
 drivers/mtd/devices/block2mtd.c |  17 ++---
 drivers/s390/block/dasd_genhd.c |   2 +-
 fs/block_dev.c                  | 149 ++++++++++++++--------------------------
 fs/btrfs/volumes.c              |  14 ++--
 fs/ext3/super.c                 |  12 +---
 fs/ext4/super.c                 |  12 +---
 fs/gfs2/ops_fstype.c            |   4 +-
 fs/jfs/jfs_logmgr.c             |  17 ++---
 fs/logfs/dev_bdev.c             |   4 +-
 fs/nilfs2/super.c               |   4 +-
 fs/ocfs2/cluster/heartbeat.c    |   2 +-
 fs/partitions/check.c           |   2 +-
 fs/reiserfs/journal.c           |  17 ++---
 fs/super.c                      |  14 ++--
 fs/xfs/linux-2.6/xfs_super.c    |   2 +-
 include/linux/fs.h              |  14 ++--
 kernel/power/swap.c             |   5 +-
 mm/swapfile.c                   |   7 +-
 26 files changed, 162 insertions(+), 318 deletions(-)

(limited to 'kernel')

diff --git a/block/ioctl.c b/block/ioctl.c
index d724ceb1d465..cc46d499fd27 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 			return -EINVAL;
 		if (get_user(n, (int __user *) arg))
 			return -EFAULT;
-		if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0)
+		if (!(mode & FMODE_EXCL) &&
+		    blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
 			return -EBUSY;
 		ret = set_blocksize(bdev, n);
 		if (!(mode & FMODE_EXCL))
-			bd_release(bdev);
+			blkdev_put(bdev, mode | FMODE_EXCL);
 		return ret;
 	case BLKPG:
 		ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 9bdcf4393c0a..0590b9f67ec6 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -923,8 +923,6 @@ struct drbd_md {
 struct drbd_backing_dev {
 	struct block_device *backing_bdev;
 	struct block_device *md_bdev;
-	struct file *lo_file;
-	struct file *md_file;
 	struct drbd_md md;
 	struct disk_conf dc; /* The user provided config... */
 	sector_t known_size; /* last known size of that backing device */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 25c7a73c5062..7ec1a82064a9 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -3361,11 +3361,8 @@ void drbd_free_bc(struct drbd_backing_dev *ldev)
 	if (ldev == NULL)
 		return;
 
-	bd_release(ldev->backing_bdev);
-	bd_release(ldev->md_bdev);
-
-	fput(ldev->lo_file);
-	fput(ldev->md_file);
+	blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+	blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 
 	kfree(ldev);
 }
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 87925e97e613..fd0346090289 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -855,7 +855,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	sector_t max_possible_sectors;
 	sector_t min_md_device_sectors;
 	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
-	struct inode *inode, *inode2;
+	struct block_device *bdev;
 	struct lru_cache *resync_lru = NULL;
 	union drbd_state ns, os;
 	unsigned int max_seg_s;
@@ -902,46 +902,40 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		}
 	}
 
-	nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
-	if (IS_ERR(nbc->lo_file)) {
+	bdev = open_bdev_exclusive(nbc->dc.backing_dev,
+				   FMODE_READ | FMODE_WRITE, mdev);
+	if (IS_ERR(bdev)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
-		    PTR_ERR(nbc->lo_file));
-		nbc->lo_file = NULL;
+			PTR_ERR(bdev));
 		retcode = ERR_OPEN_DISK;
 		goto fail;
 	}
+	nbc->backing_bdev = bdev;
 
-	inode = nbc->lo_file->f_dentry->d_inode;
-
-	if (!S_ISBLK(inode->i_mode)) {
-		retcode = ERR_DISK_NOT_BDEV;
-		goto fail;
-	}
-
-	nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
-	if (IS_ERR(nbc->md_file)) {
+	/*
+	 * meta_dev_idx >= 0: external fixed size, possibly multiple
+	 * drbd sharing one meta device.  TODO in that case, paranoia
+	 * check that [md_bdev, meta_dev_idx] is not yet used by some
+	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
+	 * should check it for you already; but if you don't, or
+	 * someone fooled it, we need to double check here)
+	 */
+	bdev = open_bdev_exclusive(nbc->dc.meta_dev,
+				   FMODE_READ | FMODE_WRITE,
+				   (nbc->dc.meta_dev_idx < 0) ?
+				   (void *)mdev : (void *)drbd_m_holder);
+	if (IS_ERR(bdev)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
-		    PTR_ERR(nbc->md_file));
-		nbc->md_file = NULL;
+			PTR_ERR(bdev));
 		retcode = ERR_OPEN_MD_DISK;
 		goto fail;
 	}
+	nbc->md_bdev = bdev;
 
-	inode2 = nbc->md_file->f_dentry->d_inode;
-
-	if (!S_ISBLK(inode2->i_mode)) {
-		retcode = ERR_MD_NOT_BDEV;
-		goto fail;
-	}
-
-	nbc->backing_bdev = inode->i_bdev;
-	if (bd_claim(nbc->backing_bdev, mdev)) {
-		printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
-		       nbc->backing_bdev, mdev,
-		       nbc->backing_bdev->bd_holder,
-		       nbc->backing_bdev->bd_contains->bd_holder,
-		       nbc->backing_bdev->bd_holders);
-		retcode = ERR_BDCLAIM_DISK;
+	if ((nbc->backing_bdev == nbc->md_bdev) !=
+	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+		retcode = ERR_MD_IDX_INVALID;
 		goto fail;
 	}
 
@@ -950,28 +944,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 			offsetof(struct bm_extent, lce));
 	if (!resync_lru) {
 		retcode = ERR_NOMEM;
-		goto release_bdev_fail;
-	}
-
-	/* meta_dev_idx >= 0: external fixed size,
-	 * possibly multiple drbd sharing one meta device.
-	 * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
-	 * not yet used by some other drbd minor!
-	 * (if you use drbd.conf + drbdadm,
-	 * that should check it for you already; but if you don't, or someone
-	 * fooled it, we need to double check here) */
-	nbc->md_bdev = inode2->i_bdev;
-	if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
-				: (void *) drbd_m_holder)) {
-		retcode = ERR_BDCLAIM_MD_DISK;
-		goto release_bdev_fail;
-	}
-
-	if ((nbc->backing_bdev == nbc->md_bdev) !=
-	    (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
-	     nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
-		retcode = ERR_MD_IDX_INVALID;
-		goto release_bdev2_fail;
+		goto fail;
 	}
 
 	/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
@@ -982,7 +955,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 			(unsigned long long) drbd_get_max_capacity(nbc),
 			(unsigned long long) nbc->dc.disk_size);
 		retcode = ERR_DISK_TO_SMALL;
-		goto release_bdev2_fail;
+		goto fail;
 	}
 
 	if (nbc->dc.meta_dev_idx < 0) {
@@ -999,7 +972,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		dev_warn(DEV, "refusing attach: md-device too small, "
 		     "at least %llu sectors needed for this meta-disk type\n",
 		     (unsigned long long) min_md_device_sectors);
-		goto release_bdev2_fail;
+		goto fail;
 	}
 
 	/* Make sure the new disk is big enough
@@ -1007,7 +980,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	if (drbd_get_max_capacity(nbc) <
 	    drbd_get_capacity(mdev->this_bdev)) {
 		retcode = ERR_DISK_TO_SMALL;
-		goto release_bdev2_fail;
+		goto fail;
 	}
 
 	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
@@ -1030,7 +1003,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
 	drbd_resume_io(mdev);
 	if (retcode < SS_SUCCESS)
-		goto release_bdev2_fail;
+		goto fail;
 
 	if (!get_ldev_if_state(mdev, D_ATTACHING))
 		goto force_diskless;
@@ -1264,18 +1237,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
  force_diskless:
 	drbd_force_state(mdev, NS(disk, D_DISKLESS));
 	drbd_md_sync(mdev);
- release_bdev2_fail:
-	if (nbc)
-		bd_release(nbc->md_bdev);
- release_bdev_fail:
-	if (nbc)
-		bd_release(nbc->backing_bdev);
  fail:
 	if (nbc) {
-		if (nbc->lo_file)
-			fput(nbc->lo_file);
-		if (nbc->md_file)
-			fput(nbc->md_file);
+		if (nbc->backing_bdev)
+			blkdev_put(nbc->backing_bdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		if (nbc->md_bdev)
+			blkdev_put(nbc->md_bdev,
+				   FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 		kfree(nbc);
 	}
 	lc_destroy(resync_lru);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 19b3568e9326..77d70eebb6b2 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2296,15 +2296,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	 * so bdget() can't fail.
 	 */
 	bdget(pd->bdev->bd_dev);
-	if ((ret = blkdev_get(pd->bdev, FMODE_READ)))
+	if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
 		goto out;
 
-	if ((ret = bd_claim(pd->bdev, pd)))
-		goto out_putdev;
-
 	if ((ret = pkt_get_last_written(pd, &lba))) {
 		printk(DRIVER_NAME": pkt_get_last_written failed\n");
-		goto out_unclaim;
+		goto out_putdev;
 	}
 
 	set_capacity(pd->disk, lba << 2);
@@ -2314,7 +2311,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	q = bdev_get_queue(pd->bdev);
 	if (write) {
 		if ((ret = pkt_open_write(pd)))
-			goto out_unclaim;
+			goto out_putdev;
 		/*
 		 * Some CDRW drives can not handle writes larger than one packet,
 		 * even if the size is a multiple of the packet size.
@@ -2329,23 +2326,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 	}
 
 	if ((ret = pkt_set_segment_merging(pd, q)))
-		goto out_unclaim;
+		goto out_putdev;
 
 	if (write) {
 		if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
 			printk(DRIVER_NAME": not enough memory for buffers\n");
 			ret = -ENOMEM;
-			goto out_unclaim;
+			goto out_putdev;
 		}
 		printk(DRIVER_NAME": %lukB available on disc\n", lba << 1);
 	}
 
 	return 0;
 
-out_unclaim:
-	bd_release(pd->bdev);
 out_putdev:
-	blkdev_put(pd->bdev, FMODE_READ);
+	blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
 out:
 	return ret;
 }
@@ -2362,8 +2357,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
 	pkt_lock_door(pd, 0);
 
 	pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
-	bd_release(pd->bdev);
-	blkdev_put(pd->bdev, FMODE_READ);
+	blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
 
 	pkt_shrink_pktlist(pd);
 }
@@ -2733,7 +2727,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
 	bdev = bdget(dev);
 	if (!bdev)
 		return -ENOMEM;
-	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY);
+	ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
 	if (ret)
 		return ret;
 
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index bfe25ea9766b..b4b9d5a47885 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -65,15 +65,12 @@ static int raw_open(struct inode *inode, struct file *filp)
 	if (!bdev)
 		goto out;
 	igrab(bdev->bd_inode);
-	err = blkdev_get(bdev, filp->f_mode);
+	err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open);
 	if (err)
 		goto out;
-	err = bd_claim(bdev, raw_open);
-	if (err)
-		goto out1;
 	err = set_blocksize(bdev, bdev_logical_block_size(bdev));
 	if (err)
-		goto out2;
+		goto out1;
 	filp->f_flags |= O_DIRECT;
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 	if (++raw_devices[minor].inuse == 1)
@@ -83,10 +80,8 @@ static int raw_open(struct inode *inode, struct file *filp)
 	mutex_unlock(&raw_mutex);
 	return 0;
 
-out2:
-	bd_release(bdev);
 out1:
-	blkdev_put(bdev, filp->f_mode);
+	blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
 out:
 	mutex_unlock(&raw_mutex);
 	return err;
@@ -110,8 +105,7 @@ static int raw_release(struct inode *inode, struct file *filp)
 	}
 	mutex_unlock(&raw_mutex);
 
-	bd_release(bdev);
-	blkdev_put(bdev, filp->f_mode);
+	blkdev_put(bdev, filp->f_mode | FMODE_EXCL);
 	return 0;
 }
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2c876ffc63df..9e88ca0c55e9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -325,20 +325,13 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev,
 
 	BUG_ON(d->dm_dev.bdev);
 
-	bdev = open_by_devnum(dev, d->dm_dev.mode);
+	bdev = open_by_devnum(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
-	r = bd_claim(bdev, _claim_ptr);
-	if (r) {
-		blkdev_put(bdev, d->dm_dev.mode);
-		return r;
-	}
-
 	r = bd_link_disk_holder(bdev, dm_disk(md));
 	if (r) {
-		bd_release(bdev);
-		blkdev_put(bdev, d->dm_dev.mode);
+		blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL);
 		return r;
 	}
 
@@ -354,9 +347,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 	if (!d->dm_dev.bdev)
 		return;
 
-	bd_unlink_disk_holder(d->dm_dev.bdev);
-	bd_release(d->dm_dev.bdev);
-	blkdev_put(d->dm_dev.bdev, d->dm_dev.mode);
+	blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
 	d->dm_dev.bdev = NULL;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c47644fca1a1..6af951ffe0bb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1907,7 +1907,6 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 		MD_BUG();
 		return;
 	}
-	bd_unlink_disk_holder(rdev->bdev);
 	list_del_rcu(&rdev->same_set);
 	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
@@ -1935,19 +1934,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+			      shared ? (mdk_rdev_t *)lock_rdev : rdev);
 	if (IS_ERR(bdev)) {
 		printk(KERN_ERR "md: could not open %s.\n",
 			__bdevname(dev, b));
 		return PTR_ERR(bdev);
 	}
-	err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
-	if (err) {
-		printk(KERN_ERR "md: could not bd_claim %s.\n",
-			bdevname(bdev, b));
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-		return err;
-	}
 	if (!shared)
 		set_bit(AllReserved, &rdev->flags);
 	rdev->bdev = bdev;
@@ -1960,8 +1953,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
 	rdev->bdev = NULL;
 	if (!bdev)
 		MD_BUG();
-	bd_release(bdev);
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 void md_autodetect_dev(dev_t dev);
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index a9e2d3b38aeb..aa557beb8f51 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -224,7 +224,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
 	if (dev->blkdev) {
 		invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping,
 					0, -1);
-		close_bdev_exclusive(dev->blkdev, FMODE_READ|FMODE_WRITE);
+		blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 	}
 
 	kfree(dev);
@@ -234,7 +234,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev)
 /* FIXME: ensure that mtd->size % erase_size == 0 */
 static struct block2mtd_dev *add_device(char *devname, int erase_size)
 {
-	const fmode_t mode = FMODE_READ | FMODE_WRITE;
+	const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
 	struct block_device *bdev;
 	struct block2mtd_dev *dev;
 	char *name;
@@ -255,17 +255,8 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 		   to resolve the device name by other means. */
 
 		dev_t devt = name_to_dev_t(devname);
-		if (devt) {
-			bdev = open_by_devnum(devt, mode);
-			if (!IS_ERR(bdev)) {
-				int ret;
-				ret = bd_claim(bdev, dev);
-				if (ret) {
-					blkdev_put(bdev, mode);
-					bdev = ERR_PTR(ret);
-				}
-			}
-		}
+		if (devt)
+			bdev = open_by_devnum(devt, mode, dev);
 	}
 #endif
 
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 30a1ca3d08b7..5505bc07e1e7 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -103,7 +103,7 @@ int dasd_scan_partitions(struct dasd_block *block)
 	struct block_device *bdev;
 
 	bdev = bdget_disk(block->gdp, 0);
-	if (!bdev || blkdev_get(bdev, FMODE_READ) < 0)
+	if (!bdev || blkdev_get(bdev, FMODE_READ, NULL) < 0)
 		return -ENODEV;
 	/*
 	 * See fs/partition/check.c:register_disk,rescan_partitions
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9329068684d2..fc48912354d1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -660,7 +660,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
 	else if (bdev->bd_contains == bdev)
 		return true;  	 /* is a whole device which isn't held */
 
-	else if (whole->bd_holder == bd_claim)
+	else if (whole->bd_holder == bd_may_claim)
 		return true; 	 /* is a partition of a device that is being partitioned */
 	else if (whole->bd_holder != NULL)
 		return false;	 /* is a partition of a held device */
@@ -807,10 +807,10 @@ static void __bd_claim(struct block_device *bdev, struct block_device *whole,
 {
 	/* note that for a whole device bd_holders
 	 * will be incremented twice, and bd_holder will
-	 * be set to bd_claim before being set to holder
+	 * be set to bd_may_claim before being set to holder
 	 */
 	whole->bd_holders++;
-	whole->bd_holder = bd_claim;
+	whole->bd_holder = bd_may_claim;
 	bdev->bd_holders++;
 	bdev->bd_holder = holder;
 }
@@ -835,37 +835,7 @@ static void bd_finish_claiming(struct block_device *bdev,
 	__bd_abort_claiming(whole, holder); /* not actually an abort */
 }
 
-/**
- * bd_claim - claim a block device
- * @bdev: block device to claim
- * @holder: holder trying to claim @bdev
- *
- * Try to claim @bdev which must have been opened successfully.
- *
- * CONTEXT:
- * Might sleep.
- *
- * RETURNS:
- * 0 if successful, -EBUSY if @bdev is already claimed.
- */
-int bd_claim(struct block_device *bdev, void *holder)
-{
-	struct block_device *whole = bdev->bd_contains;
-	int res;
-
-	might_sleep();
-
-	spin_lock(&bdev_lock);
-	res = bd_prepare_to_claim(bdev, whole, holder);
-	if (res == 0)
-		__bd_claim(bdev, whole, holder);
-	spin_unlock(&bdev_lock);
-
-	return res;
-}
-EXPORT_SYMBOL(bd_claim);
-
-void bd_release(struct block_device *bdev)
+static void bd_release(struct block_device *bdev)
 {
 	spin_lock(&bdev_lock);
 	if (!--bdev->bd_contains->bd_holders)
@@ -875,8 +845,6 @@ void bd_release(struct block_device *bdev)
 	spin_unlock(&bdev_lock);
 }
 
-EXPORT_SYMBOL(bd_release);
-
 #ifdef CONFIG_SYSFS
 static int add_symlink(struct kobject *from, struct kobject *to)
 {
@@ -943,7 +911,7 @@ out_unlock:
 }
 EXPORT_SYMBOL_GPL(bd_link_disk_holder);
 
-void bd_unlink_disk_holder(struct block_device *bdev)
+static void bd_unlink_disk_holder(struct block_device *bdev)
 {
 	struct gendisk *disk = bdev->bd_holder_disk;
 
@@ -954,7 +922,9 @@ void bd_unlink_disk_holder(struct block_device *bdev)
 	del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
 	del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
 }
-EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
+#else
+static inline void bd_unlink_disk_holder(struct block_device *bdev)
+{ }
 #endif
 
 /*
@@ -964,12 +934,12 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
  * to be used for internal purposes.  If you ever need it - reconsider
  * your API.
  */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode)
+struct block_device *open_by_devnum(dev_t dev, fmode_t mode, void *holder)
 {
 	struct block_device *bdev = bdget(dev);
 	int err = -ENOMEM;
 	if (bdev)
-		err = blkdev_get(bdev, mode);
+		err = blkdev_get(bdev, mode, holder);
 	return err ? ERR_PTR(err) : bdev;
 }
 
@@ -1235,17 +1205,37 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	return ret;
 }
 
-int blkdev_get(struct block_device *bdev, fmode_t mode)
+int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
-	return __blkdev_get(bdev, mode, 0);
+	struct block_device *whole = NULL;
+	int res;
+
+	WARN_ON_ONCE((mode & FMODE_EXCL) && !holder);
+
+	if ((mode & FMODE_EXCL) && holder) {
+		whole = bd_start_claiming(bdev, holder);
+		if (IS_ERR(whole)) {
+			bdput(bdev);
+			return PTR_ERR(whole);
+		}
+	}
+
+	res = __blkdev_get(bdev, mode, 0);
+
+	if (whole) {
+		if (res == 0)
+			bd_finish_claiming(bdev, whole, holder);
+		else
+			bd_abort_claiming(whole, holder);
+	}
+
+	return res;
 }
 EXPORT_SYMBOL(blkdev_get);
 
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
-	struct block_device *whole = NULL;
 	struct block_device *bdev;
-	int res;
 
 	/*
 	 * Preserve backwards compatibility and allow large file access
@@ -1266,26 +1256,9 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	if (bdev == NULL)
 		return -ENOMEM;
 
-	if (filp->f_mode & FMODE_EXCL) {
-		whole = bd_start_claiming(bdev, filp);
-		if (IS_ERR(whole)) {
-			bdput(bdev);
-			return PTR_ERR(whole);
-		}
-	}
-
 	filp->f_mapping = bdev->bd_inode->i_mapping;
 
-	res = blkdev_get(bdev, filp->f_mode);
-
-	if (whole) {
-		if (res == 0)
-			bd_finish_claiming(bdev, whole, filp);
-		else
-			bd_abort_claiming(whole, filp);
-	}
-
-	return res;
+	return blkdev_get(bdev, filp->f_mode, filp);
 }
 
 static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
@@ -1329,6 +1302,13 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 
 int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
+	if (mode & FMODE_EXCL) {
+		mutex_lock(&bdev->bd_mutex);
+		bd_release(bdev);
+		if (!bdev->bd_holders)
+			bd_unlink_disk_holder(bdev);
+		mutex_unlock(&bdev->bd_mutex);
+	}
 	return __blkdev_put(bdev, mode, 0);
 }
 EXPORT_SYMBOL(blkdev_put);
@@ -1336,8 +1316,7 @@ EXPORT_SYMBOL(blkdev_put);
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
 	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
-	if (bdev->bd_holder == filp)
-		bd_release(bdev);
+
 	return blkdev_put(bdev, filp->f_mode);
 }
 
@@ -1494,55 +1473,27 @@ EXPORT_SYMBOL(lookup_bdev);
  */
 struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
 {
-	struct block_device *bdev, *whole;
+	struct block_device *bdev;
 	int error;
 
 	bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return bdev;
 
-	whole = bd_start_claiming(bdev, holder);
-	if (IS_ERR(whole)) {
-		bdput(bdev);
-		return whole;
-	}
-
-	error = blkdev_get(bdev, mode);
+	error = blkdev_get(bdev, mode | FMODE_EXCL, holder);
 	if (error)
-		goto out_abort_claiming;
+		return ERR_PTR(error);
 
-	error = -EACCES;
-	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
-		goto out_blkdev_put;
+	if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
+		blkdev_put(bdev, mode);
+		return ERR_PTR(-EACCES);
+	}
 
-	bd_finish_claiming(bdev, whole, holder);
 	return bdev;
-
-out_blkdev_put:
-	blkdev_put(bdev, mode);
-out_abort_claiming:
-	bd_abort_claiming(whole, holder);
-	return ERR_PTR(error);
 }
 
 EXPORT_SYMBOL(open_bdev_exclusive);
 
-/**
- * close_bdev_exclusive  -  close a blockdevice opened by open_bdev_exclusive()
- *
- * @bdev:	blockdevice to close
- * @mode:	mode, must match that used to open.
- *
- * This is the counterpart to open_bdev_exclusive().
- */
-void close_bdev_exclusive(struct block_device *bdev, fmode_t mode)
-{
-	bd_release(bdev);
-	blkdev_put(bdev, mode);
-}
-
-EXPORT_SYMBOL(close_bdev_exclusive);
-
 int __invalidate_device(struct block_device *bdev)
 {
 	struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d39596224d21..f1b729d3b883 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -489,7 +489,7 @@ again:
 			continue;
 
 		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
+			blkdev_put(device->bdev, device->mode | FMODE_EXCL);
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
@@ -523,7 +523,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (device->bdev) {
-			close_bdev_exclusive(device->bdev, device->mode);
+			blkdev_put(device->bdev, device->mode | FMODE_EXCL);
 			fs_devices->open_devices--;
 		}
 		if (device->writeable) {
@@ -638,7 +638,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
 		brelse(bh);
 error_close:
-		close_bdev_exclusive(bdev, flags);
+		blkdev_put(bdev, flags | FMODE_EXCL);
 error:
 		continue;
 	}
@@ -716,7 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	brelse(bh);
 error_close:
-	close_bdev_exclusive(bdev, flags);
+	blkdev_put(bdev, flags | FMODE_EXCL);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1244,7 +1244,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
 	if (device->bdev) {
-		close_bdev_exclusive(device->bdev, device->mode);
+		blkdev_put(device->bdev, device->mode | FMODE_EXCL);
 		device->bdev = NULL;
 		device->fs_devices->open_devices--;
 	}
@@ -1287,7 +1287,7 @@ error_brelse:
 	brelse(bh);
 error_close:
 	if (bdev)
-		close_bdev_exclusive(bdev, FMODE_READ);
+		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
@@ -1565,7 +1565,7 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	return ret;
 error:
-	close_bdev_exclusive(bdev, 0);
+	blkdev_put(bdev, FMODE_EXCL);
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 2fedaf8b5012..23e7513dba9c 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -347,7 +347,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 	if (IS_ERR(bdev))
 		goto fail;
 	return bdev;
@@ -364,8 +364,7 @@ fail:
  */
 static int ext3_blkdev_put(struct block_device *bdev)
 {
-	bd_release(bdev);
-	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
@@ -2136,13 +2135,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	if (bdev == NULL)
 		return NULL;
 
-	if (bd_claim(bdev, sb)) {
-		ext3_msg(sb, KERN_ERR,
-			"error: failed to claim external journal device");
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-		return NULL;
-	}
-
 	blocksize = sb->s_blocksize;
 	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61182fe6254e..5dd0b3e76fa8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -647,7 +647,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 	if (IS_ERR(bdev))
 		goto fail;
 	return bdev;
@@ -663,8 +663,7 @@ fail:
  */
 static int ext4_blkdev_put(struct block_device *bdev)
 {
-	bd_release(bdev);
-	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 static int ext4_blkdev_remove(struct ext4_sb_info *sbi)
@@ -3758,13 +3757,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	if (bdev == NULL)
 		return NULL;
 
-	if (bd_claim(bdev, sb)) {
-		ext4_msg(sb, KERN_ERR,
-			"failed to claim external journal device");
-		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
-		return NULL;
-	}
-
 	blocksize = sb->s_blocksize;
 	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3eb1393f7b81..c1f0763a022b 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 		goto error_bdev;
 
 	if (s->s_root)
-		close_bdev_exclusive(bdev, mode);
+		blkdev_put(bdev, mode | FMODE_EXCL);
 
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
 	deactivate_locked_super(s);
 	return ERR_PTR(error);
 error_bdev:
-	close_bdev_exclusive(bdev, mode);
+	blkdev_put(bdev, mode | FMODE_EXCL);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index e1b8493b9aaa..5a290f22dcc3 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb)
 	 * file systems to log may have n-to-1 relationship;
 	 */
 
-	bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE);
+	bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+			      log);
 	if (IS_ERR(bdev)) {
 		rc = -PTR_ERR(bdev);
 		goto free;
 	}
 
-	if ((rc = bd_claim(bdev, log))) {
-		goto close;
-	}
-
 	log->bdev = bdev;
 	memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid));
 
@@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb)
 	 * initialize log:
 	 */
 	if ((rc = lmLogInit(log)))
-		goto unclaim;
+		goto close;
 
 	list_add(&log->journal_list, &jfs_external_logs);
 
@@ -1163,11 +1160,8 @@ journal_found:
 	list_del(&log->journal_list);
 	lbmLogShutdown(log);
 
-      unclaim:
-	bd_release(bdev);
-
       close:		/* close external log device */
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 
       free:		/* free log descriptor */
 	mutex_unlock(&jfs_log_mutex);
@@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb)
 	bdev = log->bdev;
 	rc = lmLogShutdown(log);
 
-	bd_release(bdev);
-	blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 
 	kfree(log);
 
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 92ca6fbe09bd..734b9025858e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page)
 
 static void bdev_put_device(struct logfs_super *s)
 {
-	close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE);
+	blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
@@ -331,7 +331,7 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
 
 	if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
 		int mtdnr = MINOR(bdev->bd_dev);
-		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 		return logfs_get_sb_mtd(p, mtdnr);
 	}
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f804d41ec9d3..756a6798d7c8 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1233,7 +1233,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	if (!s_new)
-		close_bdev_exclusive(sd.bdev, mode);
+		blkdev_put(sd.bdev, mode | FMODE_EXCL);
 
 	return root_dentry;
 
@@ -1242,7 +1242,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 
  failed:
 	if (!s_new)
-		close_bdev_exclusive(sd.bdev, mode);
+		blkdev_put(sd.bdev, mode | FMODE_EXCL);
 	return ERR_PTR(err);
 }
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 52c7557f3e25..d0a2721eaceb 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1674,7 +1674,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		goto out;
 
 	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
-	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ);
+	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
 	if (ret) {
 		reg->hr_bdev = NULL;
 		goto out;
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 0a8b0ad0c7e2..2e6501d034ab 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -549,7 +549,7 @@ void register_disk(struct gendisk *disk)
 		goto exit;
 
 	bdev->bd_invalidated = 1;
-	err = blkdev_get(bdev, FMODE_READ);
+	err = blkdev_get(bdev, FMODE_READ, NULL);
 	if (err < 0)
 		goto exit;
 	blkdev_put(bdev, FMODE_READ);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 076c8b194682..b488136f5ace 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2552,8 +2552,6 @@ static int release_journal_dev(struct super_block *super,
 	result = 0;
 
 	if (journal->j_dev_bd != NULL) {
-		if (journal->j_dev_bd->bd_dev != super->s_dev)
-			bd_release(journal->j_dev_bd);
 		result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode);
 		journal->j_dev_bd = NULL;
 	}
@@ -2571,7 +2569,7 @@ static int journal_init_dev(struct super_block *super,
 {
 	int result;
 	dev_t jdev;
-	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE;
+	fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL;
 	char b[BDEVNAME_SIZE];
 
 	result = 0;
@@ -2585,7 +2583,9 @@ static int journal_init_dev(struct super_block *super,
 
 	/* there is no "jdev" option and journal is on separate device */
 	if ((!jdev_name || !jdev_name[0])) {
-		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
+		if (jdev == super->s_dev)
+			blkdev_mode &= ~FMODE_EXCL;
+		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode, journal);
 		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
@@ -2594,15 +2594,8 @@ static int journal_init_dev(struct super_block *super,
 					 "cannot init journal device '%s': %i",
 					 __bdevname(jdev, b), result);
 			return result;
-		} else if (jdev != super->s_dev) {
-			result = bd_claim(journal->j_dev_bd, journal);
-			if (result) {
-				blkdev_put(journal->j_dev_bd, blkdev_mode);
-				return result;
-			}
-
+		} else if (jdev != super->s_dev)
 			set_blocksize(journal->j_dev_bd, super->s_blocksize);
-		}
 
 		return 0;
 	}
diff --git a/fs/super.c b/fs/super.c
index ca696155cd9a..22374bf0ba87 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -801,13 +801,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 
 		/*
 		 * s_umount nests inside bd_mutex during
-		 * __invalidate_device().  close_bdev_exclusive()
-		 * acquires bd_mutex and can't be called under
-		 * s_umount.  Drop s_umount temporarily.  This is safe
-		 * as we're holding an active reference.
+		 * __invalidate_device().  blkdev_put() acquires
+		 * bd_mutex and can't be called under s_umount.  Drop
+		 * s_umount temporarily.  This is safe as we're
+		 * holding an active reference.
 		 */
 		up_write(&s->s_umount);
-		close_bdev_exclusive(bdev, mode);
+		blkdev_put(bdev, mode | FMODE_EXCL);
 		down_write(&s->s_umount);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -831,7 +831,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	close_bdev_exclusive(bdev, mode);
+	blkdev_put(bdev, mode | FMODE_EXCL);
 error:
 	return ERR_PTR(error);
 }
@@ -862,7 +862,7 @@ void kill_block_super(struct super_block *sb)
 	bdev->bd_super = NULL;
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
-	close_bdev_exclusive(bdev, mode);
+	blkdev_put(bdev, mode | FMODE_EXCL);
 }
 
 EXPORT_SYMBOL(kill_block_super);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9f3a78fe6ae4..a1a6e5ceea67 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -623,7 +623,7 @@ xfs_blkdev_put(
 	struct block_device	*bdev)
 {
 	if (bdev)
-		close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
+		blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 66b7f2c5d7e9..1a033e8ebe4c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2006,7 +2006,8 @@ extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
-extern struct block_device *open_by_devnum(dev_t, fmode_t);
+extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode,
+					   void *holder);
 extern void invalidate_bdev(struct block_device *);
 extern int sync_blockdev(struct block_device *bdev);
 extern struct super_block *freeze_bdev(struct block_device *);
@@ -2037,22 +2038,16 @@ extern const struct file_operations def_fifo_fops;
 extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
-extern int blkdev_get(struct block_device *, fmode_t);
-extern int blkdev_put(struct block_device *, fmode_t);
-extern int bd_claim(struct block_device *, void *);
-extern void bd_release(struct block_device *);
+extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
+extern int blkdev_put(struct block_device *bdev, fmode_t mode);
 #ifdef CONFIG_SYSFS
 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
-extern void bd_unlink_disk_holder(struct block_device *bdev);
 #else
 static inline int bd_link_disk_holder(struct block_device *bdev,
 				      struct gendisk *disk)
 {
 	return 0;
 }
-static inline void bd_unlink_disk_holder(struct block_device *bdev)
-{
-}
 #endif
 #endif
 
@@ -2089,7 +2084,6 @@ extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
 extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *);
-extern void close_bdev_exclusive(struct block_device *, fmode_t);
 extern void blkdev_show(struct seq_file *,off_t);
 
 #else
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0e4a86ccf94..513a77f1a0b3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -223,7 +223,7 @@ static int swsusp_swap_check(void)
 		return res;
 
 	root_swap = res;
-	res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
+	res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
 	if (res)
 		return res;
 
@@ -907,7 +907,8 @@ int swsusp_check(void)
 {
 	int error;
 
-	hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
+	hib_resume_bdev = open_by_devnum(swsusp_resume_device,
+					 FMODE_READ, NULL);
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 67ddaaf98c74..b6adcfbf6f48 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1677,7 +1677,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
 		set_blocksize(bdev, p->old_block_size);
-		bd_release(bdev);
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 	} else {
 		mutex_lock(&inode->i_mutex);
 		inode->i_flags &= ~S_SWAPFILE;
@@ -1939,7 +1939,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	error = -EINVAL;
 	if (S_ISBLK(inode->i_mode)) {
 		bdev = I_BDEV(inode);
-		error = bd_claim(bdev, sys_swapon);
+		error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+				   sys_swapon);
 		if (error < 0) {
 			bdev = NULL;
 			error = -EINVAL;
@@ -2136,7 +2137,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 bad_swap:
 	if (bdev) {
 		set_blocksize(bdev, p->old_block_size);
-		bd_release(bdev);
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
 	}
 	destroy_swap_extents(p);
 	swap_cgroup_swapoff(type);
-- 
cgit v1.3-8-gc7d7


From d4d77629953eabd3c14f6fa5746f6b28babfc55f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 13 Nov 2010 11:55:18 +0100
Subject: block: clean up blkdev_get() wrappers and their users

After recent blkdev_get() modifications, open_by_devnum() and
open_bdev_exclusive() are simple wrappers around blkdev_get().
Replace them with blkdev_get_by_dev() and blkdev_get_by_path().

blkdev_get_by_dev() is identical to open_by_devnum().
blkdev_get_by_path() is slightly different in that it doesn't
automatically add %FMODE_EXCL to @mode.

All users are converted.  Most conversions are mechanical and don't
introduce any behavior difference.  There are several exceptions.

* btrfs now sets FMODE_EXCL in btrfs_device->mode, so there's no
  reason to OR it explicitly on blkdev_put().

* gfs2, nilfs2 and the generic mount_bdev() now set FMODE_EXCL in
  sb->s_mode.

* With the above changes, sb->s_mode now always should contain
  FMODE_EXCL.  WARN_ON_ONCE() added to kill_block_super() to detect
  errors.

The new blkdev_get_*() functions are with proper docbook comments.
While at it, add function description to blkdev_get() too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Joern Engel <joern@lazybastard.org>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Jan Kara <jack@suse.cz>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: KONISHI Ryusuke <konishi.ryusuke@lab.ntt.co.jp>
Cc: reiserfs-devel@vger.kernel.org
Cc: xfs-masters@oss.sgi.com
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
---
 drivers/block/drbd/drbd_nl.c    |  12 ++--
 drivers/md/dm-table.c           |   2 +-
 drivers/md/md.c                 |   4 +-
 drivers/mtd/devices/block2mtd.c |   4 +-
 fs/block_dev.c                  | 139 +++++++++++++++++++++++++++-------------
 fs/btrfs/volumes.c              |  24 ++++---
 fs/btrfs/volumes.h              |   2 +-
 fs/ext3/super.c                 |   2 +-
 fs/ext4/super.c                 |   2 +-
 fs/gfs2/ops_fstype.c            |   8 +--
 fs/jfs/jfs_logmgr.c             |   4 +-
 fs/logfs/dev_bdev.c             |   3 +-
 fs/nilfs2/super.c               |   8 +--
 fs/reiserfs/journal.c           |   6 +-
 fs/super.c                      |   9 +--
 fs/xfs/linux-2.6/xfs_super.c    |   3 +-
 include/linux/fs.h              |   7 +-
 kernel/power/swap.c             |   4 +-
 18 files changed, 149 insertions(+), 94 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index fd0346090289..650e43ba4f7c 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -902,8 +902,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		}
 	}
 
-	bdev = open_bdev_exclusive(nbc->dc.backing_dev,
-				   FMODE_READ | FMODE_WRITE, mdev);
+	bdev = blkdev_get_by_path(nbc->dc.backing_dev,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
 	if (IS_ERR(bdev)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
 			PTR_ERR(bdev));
@@ -920,10 +920,10 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	 * should check it for you already; but if you don't, or
 	 * someone fooled it, we need to double check here)
 	 */
-	bdev = open_bdev_exclusive(nbc->dc.meta_dev,
-				   FMODE_READ | FMODE_WRITE,
-				   (nbc->dc.meta_dev_idx < 0) ?
-				   (void *)mdev : (void *)drbd_m_holder);
+	bdev = blkdev_get_by_path(nbc->dc.meta_dev,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+				  (nbc->dc.meta_dev_idx < 0) ?
+				  (void *)mdev : (void *)drbd_m_holder);
 	if (IS_ERR(bdev)) {
 		dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
 			PTR_ERR(bdev));
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9e88ca0c55e9..67150c32986c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -325,7 +325,7 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev,
 
 	BUG_ON(d->dm_dev.bdev);
 
-	bdev = open_by_devnum(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
+	bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 6af951ffe0bb..5aaa6bfbe638 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1934,8 +1934,8 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
-			      shared ? (mdk_rdev_t *)lock_rdev : rdev);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				 shared ? (mdk_rdev_t *)lock_rdev : rdev);
 	if (IS_ERR(bdev)) {
 		printk(KERN_ERR "md: could not open %s.\n",
 			__bdevname(dev, b));
diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c
index aa557beb8f51..f29a6f9df6e7 100644
--- a/drivers/mtd/devices/block2mtd.c
+++ b/drivers/mtd/devices/block2mtd.c
@@ -247,7 +247,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 		return NULL;
 
 	/* Get a handle on the device */
-	bdev = open_bdev_exclusive(devname, mode, dev);
+	bdev = blkdev_get_by_path(devname, mode, dev);
 #ifndef MODULE
 	if (IS_ERR(bdev)) {
 
@@ -256,7 +256,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size)
 
 		dev_t devt = name_to_dev_t(devname);
 		if (devt)
-			bdev = open_by_devnum(devt, mode, dev);
+			bdev = blkdev_get_by_dev(devt, mode, dev);
 	}
 #endif
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 606a5259f87f..c1c1b8c3fb99 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -854,24 +854,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev)
 { }
 #endif
 
-/*
- * Tries to open block device by device number.  Use it ONLY if you
- * really do not have anything better - i.e. when you are behind a
- * truly sucky interface and all you are given is a device number.  _Never_
- * to be used for internal purposes.  If you ever need it - reconsider
- * your API.
- */
-struct block_device *open_by_devnum(dev_t dev, fmode_t mode, void *holder)
-{
-	struct block_device *bdev = bdget(dev);
-	int err = -ENOMEM;
-	if (bdev)
-		err = blkdev_get(bdev, mode, holder);
-	return err ? ERR_PTR(err) : bdev;
-}
-
-EXPORT_SYMBOL(open_by_devnum);
-
 /**
  * flush_disk - invalidates all buffer-cache entries on a disk
  *
@@ -1132,6 +1114,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	return ret;
 }
 
+/**
+ * blkdev_get - open a block device
+ * @bdev: block_device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open @bdev with @mode.  If @mode includes %FMODE_EXCL, @bdev is
+ * open with exclusive access.  Specifying %FMODE_EXCL with %NULL
+ * @holder is invalid.  Exclusive opens may nest for the same @holder.
+ *
+ * On success, the reference count of @bdev is unchanged.  On failure,
+ * @bdev is put.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
 int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 {
 	struct block_device *whole = NULL;
@@ -1186,6 +1187,80 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
 }
 EXPORT_SYMBOL(blkdev_get);
 
+/**
+ * blkdev_get_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by the device file at @path.  @mode
+ * and @holder are identical to blkdev_get().
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+					void *holder)
+{
+	struct block_device *bdev;
+	int err;
+
+	bdev = lookup_bdev(path);
+	if (IS_ERR(bdev))
+		return bdev;
+
+	err = blkdev_get(bdev, mode, holder);
+	if (err)
+		return ERR_PTR(err);
+
+	return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_path);
+
+/**
+ * blkdev_get_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: FMODE_* mask
+ * @holder: exclusive holder identifier
+ *
+ * Open the blockdevice described by device number @dev.  @mode and
+ * @holder are identical to blkdev_get().
+ *
+ * Use it ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a
+ * device number.  _Never_ to be used for internal purposes.  If you
+ * ever need it - reconsider your API.
+ *
+ * On success, the returned block_device has reference count of one.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to block_device on success, ERR_PTR(-errno) on failure.
+ */
+struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
+{
+	struct block_device *bdev;
+	int err;
+
+	bdev = bdget(dev);
+	if (!bdev)
+		return ERR_PTR(-ENOMEM);
+
+	err = blkdev_get(bdev, mode, holder);
+	if (err)
+		return ERR_PTR(err);
+
+	return bdev;
+}
+EXPORT_SYMBOL(blkdev_get_by_dev);
+
 static int blkdev_open(struct inode * inode, struct file * filp)
 {
 	struct block_device *bdev;
@@ -1436,34 +1511,6 @@ fail:
 }
 EXPORT_SYMBOL(lookup_bdev);
 
-/**
- * open_bdev_exclusive  -  open a block device by name and set it up for use
- *
- * @path:	special file representing the block device
- * @mode:	FMODE_... combination to pass be used
- * @holder:	owner for exclusion
- *
- * Open the blockdevice described by the special file at @path, claim it
- * for the @holder.
- */
-struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
-{
-	struct block_device *bdev;
-	int error;
-
-	bdev = lookup_bdev(path);
-	if (IS_ERR(bdev))
-		return bdev;
-
-	error = blkdev_get(bdev, mode | FMODE_EXCL, holder);
-	if (error)
-		return ERR_PTR(error);
-
-	return bdev;
-}
-
-EXPORT_SYMBOL(open_bdev_exclusive);
-
 int __invalidate_device(struct block_device *bdev)
 {
 	struct super_block *sb = get_super(bdev);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f1b729d3b883..95324e9f9280 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -489,7 +489,7 @@ again:
 			continue;
 
 		if (device->bdev) {
-			blkdev_put(device->bdev, device->mode | FMODE_EXCL);
+			blkdev_put(device->bdev, device->mode);
 			device->bdev = NULL;
 			fs_devices->open_devices--;
 		}
@@ -523,7 +523,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (device->bdev) {
-			blkdev_put(device->bdev, device->mode | FMODE_EXCL);
+			blkdev_put(device->bdev, device->mode);
 			fs_devices->open_devices--;
 		}
 		if (device->writeable) {
@@ -580,13 +580,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	int seeding = 1;
 	int ret = 0;
 
+	flags |= FMODE_EXCL;
+
 	list_for_each_entry(device, head, dev_list) {
 		if (device->bdev)
 			continue;
 		if (!device->name)
 			continue;
 
-		bdev = open_bdev_exclusive(device->name, flags, holder);
+		bdev = blkdev_get_by_path(device->name, flags, holder);
 		if (IS_ERR(bdev)) {
 			printk(KERN_INFO "open %s failed\n", device->name);
 			goto error;
@@ -638,7 +640,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 error_brelse:
 		brelse(bh);
 error_close:
-		blkdev_put(bdev, flags | FMODE_EXCL);
+		blkdev_put(bdev, flags);
 error:
 		continue;
 	}
@@ -684,7 +686,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	mutex_lock(&uuid_mutex);
 
-	bdev = open_bdev_exclusive(path, flags, holder);
+	flags |= FMODE_EXCL;
+	bdev = blkdev_get_by_path(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
 		ret = PTR_ERR(bdev);
@@ -716,7 +719,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 
 	brelse(bh);
 error_close:
-	blkdev_put(bdev, flags | FMODE_EXCL);
+	blkdev_put(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
 	return ret;
@@ -1179,8 +1182,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 			goto out;
 		}
 	} else {
-		bdev = open_bdev_exclusive(device_path, FMODE_READ,
-				      root->fs_info->bdev_holder);
+		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
+					  root->fs_info->bdev_holder);
 		if (IS_ERR(bdev)) {
 			ret = PTR_ERR(bdev);
 			goto out;
@@ -1244,7 +1247,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
 
 	if (device->bdev) {
-		blkdev_put(device->bdev, device->mode | FMODE_EXCL);
+		blkdev_put(device->bdev, device->mode);
 		device->bdev = NULL;
 		device->fs_devices->open_devices--;
 	}
@@ -1439,7 +1442,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
 		return -EINVAL;
 
-	bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder);
+	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
+				  root->fs_info->bdev_holder);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2b638b6e4eea..856e75770304 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -49,7 +49,7 @@ struct btrfs_device {
 
 	struct block_device *bdev;
 
-	/* the mode sent to open_bdev_exclusive */
+	/* the mode sent to blkdev_get */
 	fmode_t mode;
 
 	char *name;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 23e7513dba9c..123720ba786d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -347,7 +347,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 	if (IS_ERR(bdev))
 		goto fail;
 	return bdev;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5dd0b3e76fa8..bd63e6927219 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -647,7 +647,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
 
-	bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
+	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb);
 	if (IS_ERR(bdev))
 		goto fail;
 	return bdev;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c1f0763a022b..bc56ccf98ffd 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 {
 	struct block_device *bdev;
 	struct super_block *s;
-	fmode_t mode = FMODE_READ;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
 	int error;
 	struct gfs2_args args;
 	struct gfs2_sbd *sdp;
@@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
@@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 		goto error_bdev;
 
 	if (s->s_root)
-		blkdev_put(bdev, mode | FMODE_EXCL);
+		blkdev_put(bdev, mode);
 
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1342,7 +1342,7 @@ error_super:
 	deactivate_locked_super(s);
 	return ERR_PTR(error);
 error_bdev:
-	blkdev_put(bdev, mode | FMODE_EXCL);
+	blkdev_put(bdev, mode);
 	return ERR_PTR(error);
 }
 
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 5a290f22dcc3..278e3fb40b71 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1120,8 +1120,8 @@ int lmLogOpen(struct super_block *sb)
 	 * file systems to log may have n-to-1 relationship;
 	 */
 
-	bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
-			      log);
+	bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				 log);
 	if (IS_ERR(bdev)) {
 		rc = -PTR_ERR(bdev);
 		goto free;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 734b9025858e..723bc5bca09a 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -325,7 +325,8 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
 {
 	struct block_device *bdev;
 
-	bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
+	bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				  type);
 	if (IS_ERR(bdev))
 		return PTR_ERR(bdev);
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 756a6798d7c8..0030640e2d72 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1147,14 +1147,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 {
 	struct nilfs_super_data sd;
 	struct super_block *s;
-	fmode_t mode = FMODE_READ;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
 	struct dentry *root_dentry;
 	int err, s_new = false;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+	sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type);
 	if (IS_ERR(sd.bdev))
 		return ERR_CAST(sd.bdev);
 
@@ -1233,7 +1233,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	if (!s_new)
-		blkdev_put(sd.bdev, mode | FMODE_EXCL);
+		blkdev_put(sd.bdev, mode);
 
 	return root_dentry;
 
@@ -1242,7 +1242,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 
  failed:
 	if (!s_new)
-		blkdev_put(sd.bdev, mode | FMODE_EXCL);
+		blkdev_put(sd.bdev, mode);
 	return ERR_PTR(err);
 }
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index b488136f5ace..e2fce519c0f2 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2585,7 +2585,8 @@ static int journal_init_dev(struct super_block *super,
 	if ((!jdev_name || !jdev_name[0])) {
 		if (jdev == super->s_dev)
 			blkdev_mode &= ~FMODE_EXCL;
-		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode, journal);
+		journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode,
+						      journal);
 		journal->j_dev_mode = blkdev_mode;
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
@@ -2601,8 +2602,7 @@ static int journal_init_dev(struct super_block *super,
 	}
 
 	journal->j_dev_mode = blkdev_mode;
-	journal->j_dev_bd = open_bdev_exclusive(jdev_name,
-						blkdev_mode, journal);
+	journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal);
 	if (IS_ERR(journal->j_dev_bd)) {
 		result = PTR_ERR(journal->j_dev_bd);
 		journal->j_dev_bd = NULL;
diff --git a/fs/super.c b/fs/super.c
index 22374bf0ba87..5d9a4497849a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -766,13 +766,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 {
 	struct block_device *bdev;
 	struct super_block *s;
-	fmode_t mode = FMODE_READ;
+	fmode_t mode = FMODE_READ | FMODE_EXCL;
 	int error = 0;
 
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	bdev = open_bdev_exclusive(dev_name, mode, fs_type);
+	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
@@ -807,7 +807,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 		 * holding an active reference.
 		 */
 		up_write(&s->s_umount);
-		blkdev_put(bdev, mode | FMODE_EXCL);
+		blkdev_put(bdev, mode);
 		down_write(&s->s_umount);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -831,7 +831,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 error_s:
 	error = PTR_ERR(s);
 error_bdev:
-	blkdev_put(bdev, mode | FMODE_EXCL);
+	blkdev_put(bdev, mode);
 error:
 	return ERR_PTR(error);
 }
@@ -862,6 +862,7 @@ void kill_block_super(struct super_block *sb)
 	bdev->bd_super = NULL;
 	generic_shutdown_super(sb);
 	sync_blockdev(bdev);
+	WARN_ON_ONCE(!(mode & FMODE_EXCL));
 	blkdev_put(bdev, mode | FMODE_EXCL);
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index a1a6e5ceea67..9209cd199c47 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -609,7 +609,8 @@ xfs_blkdev_get(
 {
 	int			error = 0;
 
-	*bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp);
+	*bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+				    mp);
 	if (IS_ERR(*bdevp)) {
 		error = PTR_ERR(*bdevp);
 		printk("XFS: Invalid device [%s], error=%d\n", name, error);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 1a033e8ebe4c..f48501563917 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2006,8 +2006,6 @@ extern struct block_device *bdgrab(struct block_device *bdev);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
-extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode,
-					   void *holder);
 extern void invalidate_bdev(struct block_device *);
 extern int sync_blockdev(struct block_device *bdev);
 extern struct super_block *freeze_bdev(struct block_device *);
@@ -2039,6 +2037,10 @@ extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
 extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long);
 extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long);
 extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder);
+extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
+					       void *holder);
+extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode,
+					      void *holder);
 extern int blkdev_put(struct block_device *bdev, fmode_t mode);
 #ifdef CONFIG_SYSFS
 extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
@@ -2083,7 +2085,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name)
 extern const char *__bdevname(dev_t, char *buffer);
 extern const char *bdevname(struct block_device *bdev, char *buffer);
 extern struct block_device *lookup_bdev(const char *);
-extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *);
 extern void blkdev_show(struct seq_file *,off_t);
 
 #else
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 513a77f1a0b3..b019609d1b45 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -907,8 +907,8 @@ int swsusp_check(void)
 {
 	int error;
 
-	hib_resume_bdev = open_by_devnum(swsusp_resume_device,
-					 FMODE_READ, NULL);
+	hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
+					    FMODE_READ, NULL);
 	if (!IS_ERR(hib_resume_bdev)) {
 		set_blocksize(hib_resume_bdev, PAGE_SIZE);
 		clear_page(swsusp_header);
-- 
cgit v1.3-8-gc7d7


From d07335e51df0c6dec202d315fc4f1f7e100eec4e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 16 Nov 2010 12:52:38 +0100
Subject: block: Rename "block_remap" tracepoint to "block_bio_remap" to
 clarify the event.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c             | 10 +++++-----
 drivers/md/dm.c              |  4 ++--
 include/trace/events/block.h |  6 +++---
 kernel/trace/blktrace.c      | 12 ++++++------
 4 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-core.c b/block/blk-core.c
index 4ce953f1b390..151070541e21 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -33,7 +33,7 @@
 
 #include "blk.h"
 
-EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
@@ -1329,9 +1329,9 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
-		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev,
-				    bio->bi_sector - p->start_sect);
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
+				      bdev->bd_dev,
+				      bio->bi_sector - p->start_sect);
 	}
 }
 
@@ -1500,7 +1500,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 
 		if (old_sector != -1)
-			trace_block_remap(q, bio, old_dev, old_sector);
+			trace_block_bio_remap(q, bio, old_dev, old_sector);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7cb1352f7e7a..0a2b5516bc21 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -990,8 +990,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
 
-		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev, sector);
+		trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
+				      tio->io->bio->bi_bdev->bd_dev, sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index d8ce278515c3..b56c65dc105d 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -486,16 +486,16 @@ TRACE_EVENT(block_split,
 );
 
 /**
- * block_remap - map request for a partition to the raw device
+ * block_bio_remap - map request for a logical device to the raw device
  * @q: queue holding the operation
  * @bio: revised operation
  * @dev: device for the operation
  * @from: original sector for the operation
  *
- * An operation for a partition on a block device has been mapped to the
+ * An operation for a logical device has been mapped to the
  * raw block device.
  */
-TRACE_EVENT(block_remap,
+TRACE_EVENT(block_bio_remap,
 
 	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
 		 sector_t from),
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 7b8ec0281548..2b8e2ee7c0ef 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -887,7 +887,7 @@ static void blk_add_trace_split(void *ignore,
 }
 
 /**
- * blk_add_trace_remap - Add a trace for a remap operation
+ * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
  * @ignore:	trace callback data parameter (not used)
  * @q:		queue the io is for
  * @bio:	the source bio
@@ -899,9 +899,9 @@ static void blk_add_trace_split(void *ignore,
  *     it spans a stripe (or similar). Add a trace for that action.
  *
  **/
-static void blk_add_trace_remap(void *ignore,
-				struct request_queue *q, struct bio *bio,
-				dev_t dev, sector_t from)
+static void blk_add_trace_bio_remap(void *ignore,
+				    struct request_queue *q, struct bio *bio,
+				    dev_t dev, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
@@ -1016,7 +1016,7 @@ static void blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_split(blk_add_trace_split, NULL);
 	WARN_ON(ret);
-	ret = register_trace_block_remap(blk_add_trace_remap, NULL);
+	ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
 	WARN_ON(ret);
 	ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
 	WARN_ON(ret);
@@ -1025,7 +1025,7 @@ static void blk_register_tracepoints(void)
 static void blk_unregister_tracepoints(void)
 {
 	unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
-	unregister_trace_block_remap(blk_add_trace_remap, NULL);
+	unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
 	unregister_trace_block_split(blk_add_trace_split, NULL);
 	unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
 	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
-- 
cgit v1.3-8-gc7d7


From 073ef1f6e508688392580e4f35dcad9aabd1e100 Mon Sep 17 00:00:00 2001
From: Lionel Debroux <lionel_debroux@yahoo.fr>
Date: Tue, 9 Nov 2010 21:48:49 +0100
Subject: hibernation: constify platform_hibernation_ops

Patch against mainline.

Changes since v1: added one hunk; no longer adding "const" qualifier to
pointers in platform_hibernation_ops after seeing
b4144e4f6e3b448d322095ca08af393682a69e33.

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 drivers/acpi/sleep.c     | 4 ++--
 include/linux/suspend.h  | 4 ++--
 kernel/power/hibernate.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 721d93b3ceee..5149c9bf7015 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -498,7 +498,7 @@ static void acpi_pm_thaw(void)
 	acpi_enable_all_runtime_gpes();
 }
 
-static struct platform_hibernation_ops acpi_hibernation_ops = {
+static const struct platform_hibernation_ops acpi_hibernation_ops = {
 	.begin = acpi_hibernation_begin,
 	.end = acpi_pm_end,
 	.pre_snapshot = acpi_pm_prepare,
@@ -541,7 +541,7 @@ static int acpi_hibernation_begin_old(void)
  * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has
  * been requested.
  */
-static struct platform_hibernation_ops acpi_hibernation_ops_old = {
+static const struct platform_hibernation_ops acpi_hibernation_ops_old = {
 	.begin = acpi_hibernation_begin_old,
 	.end = acpi_pm_end,
 	.pre_snapshot = acpi_pm_pre_suspend,
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 26697514c5ec..40ead943fd6a 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -245,7 +245,7 @@ extern void swsusp_set_page_free(struct page *);
 extern void swsusp_unset_page_free(struct page *);
 extern unsigned long get_safe_page(gfp_t gfp_mask);
 
-extern void hibernation_set_ops(struct platform_hibernation_ops *ops);
+extern void hibernation_set_ops(const struct platform_hibernation_ops *ops);
 extern int hibernate(void);
 extern bool system_entering_hibernation(void);
 #else /* CONFIG_HIBERNATION */
@@ -253,7 +253,7 @@ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; }
 static inline void swsusp_set_page_free(struct page *p) {}
 static inline void swsusp_unset_page_free(struct page *p) {}
 
-static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {}
+static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {}
 static inline int hibernate(void) { return -ENOSYS; }
 static inline bool system_entering_hibernation(void) { return false; }
 #endif /* CONFIG_HIBERNATION */
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 657272e91d0a..491b81a27111 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -51,14 +51,14 @@ enum {
 
 static int hibernation_mode = HIBERNATION_SHUTDOWN;
 
-static struct platform_hibernation_ops *hibernation_ops;
+static const struct platform_hibernation_ops *hibernation_ops;
 
 /**
  * hibernation_set_ops - set the global hibernate operations
  * @ops: the hibernation operations to use in subsequent hibernation transitions
  */
 
-void hibernation_set_ops(struct platform_hibernation_ops *ops)
+void hibernation_set_ops(const struct platform_hibernation_ops *ops)
 {
 	if (ops && !(ops->begin && ops->end &&  ops->pre_snapshot
 	    && ops->prepare && ops->finish && ops->enter && ops->pre_restore
-- 
cgit v1.3-8-gc7d7


From 2f55ac072f5344519348c0c94b3d2f4cca46847b Mon Sep 17 00:00:00 2001
From: Lionel Debroux <lionel_debroux@yahoo.fr>
Date: Tue, 16 Nov 2010 14:14:02 +0100
Subject: suspend: constify platform_suspend_ops

While at it, fix two checkpatch errors.
Several non-const struct instances constified by this patch were added after
the introduction of platform_suspend_ops in checkpatch.pl's list of "should
be const" structs (79404849e90a41ea2109bd0e2f7c7164b0c4ce73).

Patch against mainline.
Inspired by hunks of the grsecurity patch, updated for newer kernels.

Signed-off-by: Lionel Debroux <lionel_debroux@yahoo.fr>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/arm/mach-at91/pm.c                   | 2 +-
 arch/arm/mach-davinci/pm.c                | 2 +-
 arch/arm/mach-imx/pm-imx27.c              | 2 +-
 arch/arm/mach-lpc32xx/pm.c                | 2 +-
 arch/arm/mach-omap1/pm.c                  | 2 +-
 arch/arm/mach-omap2/pm24xx.c              | 2 +-
 arch/arm/mach-omap2/pm34xx.c              | 2 +-
 arch/arm/mach-omap2/pm44xx.c              | 2 +-
 arch/arm/mach-pnx4008/pm.c                | 2 +-
 arch/arm/mach-pxa/pm.c                    | 2 +-
 arch/arm/mach-pxa/sharpsl_pm.c            | 2 +-
 arch/arm/mach-sa1100/pm.c                 | 2 +-
 arch/arm/plat-samsung/pm.c                | 2 +-
 arch/avr32/mach-at32ap/pm.c               | 2 +-
 arch/blackfin/mach-common/pm.c            | 2 +-
 arch/mips/alchemy/devboards/pm.c          | 2 +-
 arch/mips/jz4740/pm.c                     | 2 +-
 arch/mips/loongson/common/pm.c            | 2 +-
 arch/powerpc/platforms/52xx/lite5200_pm.c | 2 +-
 arch/powerpc/platforms/52xx/mpc52xx_pm.c  | 2 +-
 arch/powerpc/platforms/83xx/suspend.c     | 2 +-
 arch/powerpc/platforms/pseries/suspend.c  | 2 +-
 arch/powerpc/sysdev/fsl_pmc.c             | 2 +-
 arch/sh/boards/mach-hp6xx/pm.c            | 2 +-
 arch/sh/kernel/cpu/shmobile/pm.c          | 2 +-
 drivers/acpi/sleep.c                      | 4 ++--
 drivers/macintosh/via-pmu.c               | 2 +-
 include/linux/suspend.h                   | 4 ++--
 kernel/power/suspend.c                    | 4 ++--
 29 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c
index dafbacc25eb1..ea53f4d9b283 100644
--- a/arch/arm/mach-at91/pm.c
+++ b/arch/arm/mach-at91/pm.c
@@ -301,7 +301,7 @@ static void at91_pm_end(void)
 }
 
 
-static struct platform_suspend_ops at91_pm_ops ={
+static const struct platform_suspend_ops at91_pm_ops = {
 	.valid	= at91_pm_valid_state,
 	.begin	= at91_pm_begin,
 	.enter	= at91_pm_enter,
diff --git a/arch/arm/mach-davinci/pm.c b/arch/arm/mach-davinci/pm.c
index fab953b43dea..1bd73a04be20 100644
--- a/arch/arm/mach-davinci/pm.c
+++ b/arch/arm/mach-davinci/pm.c
@@ -110,7 +110,7 @@ static int davinci_pm_enter(suspend_state_t state)
 	return ret;
 }
 
-static struct platform_suspend_ops davinci_pm_ops = {
+static const struct platform_suspend_ops davinci_pm_ops = {
 	.enter		= davinci_pm_enter,
 	.valid		= suspend_valid_only_mem,
 };
diff --git a/arch/arm/mach-imx/pm-imx27.c b/arch/arm/mach-imx/pm-imx27.c
index afc17ce0bb54..2153ca71bb82 100644
--- a/arch/arm/mach-imx/pm-imx27.c
+++ b/arch/arm/mach-imx/pm-imx27.c
@@ -32,7 +32,7 @@ static int mx27_suspend_enter(suspend_state_t state)
 	return 0;
 }
 
-static struct platform_suspend_ops mx27_suspend_ops = {
+static const struct platform_suspend_ops mx27_suspend_ops = {
 	.enter = mx27_suspend_enter,
 	.valid = suspend_valid_only_mem,
 };
diff --git a/arch/arm/mach-lpc32xx/pm.c b/arch/arm/mach-lpc32xx/pm.c
index a6e2aed9a49f..e76d41bb7056 100644
--- a/arch/arm/mach-lpc32xx/pm.c
+++ b/arch/arm/mach-lpc32xx/pm.c
@@ -123,7 +123,7 @@ static int lpc32xx_pm_enter(suspend_state_t state)
 	return 0;
 }
 
-static struct platform_suspend_ops lpc32xx_pm_ops = {
+static const struct platform_suspend_ops lpc32xx_pm_ops = {
 	.valid	= suspend_valid_only_mem,
 	.enter	= lpc32xx_pm_enter,
 };
diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c
index b1d3f9fade23..4cf3b67dd998 100644
--- a/arch/arm/mach-omap1/pm.c
+++ b/arch/arm/mach-omap1/pm.c
@@ -647,7 +647,7 @@ static struct irqaction omap_wakeup_irq = {
 
 
-static struct platform_suspend_ops omap_pm_ops ={
+static const struct platform_suspend_ops omap_pm_ops = {
 	.prepare	= omap_pm_prepare,
 	.enter		= omap_pm_enter,
 	.finish		= omap_pm_finish,
diff --git a/arch/arm/mach-omap2/pm24xx.c b/arch/arm/mach-omap2/pm24xx.c
index a40457d81927..aa9764eeb941 100644
--- a/arch/arm/mach-omap2/pm24xx.c
+++ b/arch/arm/mach-omap2/pm24xx.c
@@ -326,7 +326,7 @@ static void omap2_pm_finish(void)
 	enable_hlt();
 }
 
-static struct platform_suspend_ops omap_pm_ops = {
+static const struct platform_suspend_ops omap_pm_ops = {
 	.prepare	= omap2_pm_prepare,
 	.enter		= omap2_pm_enter,
 	.finish		= omap2_pm_finish,
diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index 75c0cd13ad8e..4000c3c8bbd5 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -594,7 +594,7 @@ static void omap3_pm_end(void)
 	return;
 }
 
-static struct platform_suspend_ops omap_pm_ops = {
+static const struct platform_suspend_ops omap_pm_ops = {
 	.begin		= omap3_pm_begin,
 	.end		= omap3_pm_end,
 	.prepare	= omap3_pm_prepare,
diff --git a/arch/arm/mach-omap2/pm44xx.c b/arch/arm/mach-omap2/pm44xx.c
index 54544b4fc76b..dc8b1ef9f84c 100644
--- a/arch/arm/mach-omap2/pm44xx.c
+++ b/arch/arm/mach-omap2/pm44xx.c
@@ -75,7 +75,7 @@ static void omap4_pm_end(void)
 	return;
 }
 
-static struct platform_suspend_ops omap_pm_ops = {
+static const struct platform_suspend_ops omap_pm_ops = {
 	.begin		= omap4_pm_begin,
 	.end		= omap4_pm_end,
 	.prepare	= omap4_pm_prepare,
diff --git a/arch/arm/mach-pnx4008/pm.c b/arch/arm/mach-pnx4008/pm.c
index ee3c29c57ae3..f3e60a049f98 100644
--- a/arch/arm/mach-pnx4008/pm.c
+++ b/arch/arm/mach-pnx4008/pm.c
@@ -119,7 +119,7 @@ static int pnx4008_pm_valid(suspend_state_t state)
 	       (state == PM_SUSPEND_MEM);
 }
 
-static struct platform_suspend_ops pnx4008_pm_ops = {
+static const struct platform_suspend_ops pnx4008_pm_ops = {
 	.enter = pnx4008_pm_enter,
 	.valid = pnx4008_pm_valid,
 };
diff --git a/arch/arm/mach-pxa/pm.c b/arch/arm/mach-pxa/pm.c
index 166c15f62916..978e1b289544 100644
--- a/arch/arm/mach-pxa/pm.c
+++ b/arch/arm/mach-pxa/pm.c
@@ -96,7 +96,7 @@ void pxa_pm_finish(void)
 		pxa_cpu_pm_fns->finish();
 }
 
-static struct platform_suspend_ops pxa_pm_ops = {
+static const struct platform_suspend_ops pxa_pm_ops = {
 	.valid		= pxa_pm_valid,
 	.enter		= pxa_pm_enter,
 	.prepare	= pxa_pm_prepare,
diff --git a/arch/arm/mach-pxa/sharpsl_pm.c b/arch/arm/mach-pxa/sharpsl_pm.c
index 8fed027b12dc..02874e96ec72 100644
--- a/arch/arm/mach-pxa/sharpsl_pm.c
+++ b/arch/arm/mach-pxa/sharpsl_pm.c
@@ -868,7 +868,7 @@ static void sharpsl_apm_get_power_status(struct apm_power_info *info)
 }
 
 #ifdef CONFIG_PM
-static struct platform_suspend_ops sharpsl_pm_ops = {
+static const struct platform_suspend_ops sharpsl_pm_ops = {
 	.prepare	= pxa_pm_prepare,
 	.finish		= pxa_pm_finish,
 	.enter		= corgi_pxa_pm_enter,
diff --git a/arch/arm/mach-sa1100/pm.c b/arch/arm/mach-sa1100/pm.c
index c83fdc80edfd..ab9fc4470d36 100644
--- a/arch/arm/mach-sa1100/pm.c
+++ b/arch/arm/mach-sa1100/pm.c
@@ -120,7 +120,7 @@ unsigned long sleep_phys_sp(void *sp)
 	return virt_to_phys(sp);
 }
 
-static struct platform_suspend_ops sa11x0_pm_ops = {
+static const struct platform_suspend_ops sa11x0_pm_ops = {
 	.enter		= sa11x0_pm_enter,
 	.valid		= suspend_valid_only_mem,
 };
diff --git a/arch/arm/plat-samsung/pm.c b/arch/arm/plat-samsung/pm.c
index 27cfca597699..5bf3f2f09e74 100644
--- a/arch/arm/plat-samsung/pm.c
+++ b/arch/arm/plat-samsung/pm.c
@@ -355,7 +355,7 @@ static void s3c_pm_finish(void)
 	s3c_pm_check_cleanup();
 }
 
-static struct platform_suspend_ops s3c_pm_ops = {
+static const struct platform_suspend_ops s3c_pm_ops = {
 	.enter		= s3c_pm_enter,
 	.prepare	= s3c_pm_prepare,
 	.finish		= s3c_pm_finish,
diff --git a/arch/avr32/mach-at32ap/pm.c b/arch/avr32/mach-at32ap/pm.c
index f021edfeaab0..32d680eb6f48 100644
--- a/arch/avr32/mach-at32ap/pm.c
+++ b/arch/avr32/mach-at32ap/pm.c
@@ -176,7 +176,7 @@ out:
 	return 0;
 }
 
-static struct platform_suspend_ops avr32_pm_ops = {
+static const struct platform_suspend_ops avr32_pm_ops = {
 	.valid	= avr32_pm_valid_state,
 	.enter	= avr32_pm_enter,
 };
diff --git a/arch/blackfin/mach-common/pm.c b/arch/blackfin/mach-common/pm.c
index 80884b136a0c..745af2d96553 100644
--- a/arch/blackfin/mach-common/pm.c
+++ b/arch/blackfin/mach-common/pm.c
@@ -233,7 +233,7 @@ static int bfin_pm_enter(suspend_state_t state)
 	return 0;
 }
 
-struct platform_suspend_ops bfin_pm_ops = {
+static const struct platform_suspend_ops bfin_pm_ops = {
 	.enter = bfin_pm_enter,
 	.valid	= bfin_pm_valid,
 };
diff --git a/arch/mips/alchemy/devboards/pm.c b/arch/mips/alchemy/devboards/pm.c
index 4bbd3133e451..acaf91b5e461 100644
--- a/arch/mips/alchemy/devboards/pm.c
+++ b/arch/mips/alchemy/devboards/pm.c
@@ -110,7 +110,7 @@ static void db1x_pm_end(void)
 
 }
 
-static struct platform_suspend_ops db1x_pm_ops = {
+static const struct platform_suspend_ops db1x_pm_ops = {
 	.valid		= suspend_valid_only_mem,
 	.begin		= db1x_pm_begin,
 	.enter		= db1x_pm_enter,
diff --git a/arch/mips/jz4740/pm.c b/arch/mips/jz4740/pm.c
index a9994585424d..902d5b50124c 100644
--- a/arch/mips/jz4740/pm.c
+++ b/arch/mips/jz4740/pm.c
@@ -42,7 +42,7 @@ static int jz4740_pm_enter(suspend_state_t state)
 	return 0;
 }
 
-static struct platform_suspend_ops jz4740_pm_ops = {
+static const struct platform_suspend_ops jz4740_pm_ops = {
 	.valid		= suspend_valid_only_mem,
 	.enter		= jz4740_pm_enter,
 };
diff --git a/arch/mips/loongson/common/pm.c b/arch/mips/loongson/common/pm.c
index 6c1fd9001712..f55e07aee071 100644
--- a/arch/mips/loongson/common/pm.c
+++ b/arch/mips/loongson/common/pm.c
@@ -147,7 +147,7 @@ static int loongson_pm_valid_state(suspend_state_t state)
 	}
 }
 
-static struct platform_suspend_ops loongson_pm_ops = {
+static const struct platform_suspend_ops loongson_pm_ops = {
 	.valid	= loongson_pm_valid_state,
 	.enter	= loongson_pm_enter,
 };
diff --git a/arch/powerpc/platforms/52xx/lite5200_pm.c b/arch/powerpc/platforms/52xx/lite5200_pm.c
index 80234e5921f5..eda0fc2a3914 100644
--- a/arch/powerpc/platforms/52xx/lite5200_pm.c
+++ b/arch/powerpc/platforms/52xx/lite5200_pm.c
@@ -232,7 +232,7 @@ static void lite5200_pm_end(void)
 	lite5200_pm_target_state = PM_SUSPEND_ON;
 }
 
-static struct platform_suspend_ops lite5200_pm_ops = {
+static const struct platform_suspend_ops lite5200_pm_ops = {
 	.valid		= lite5200_pm_valid,
 	.begin		= lite5200_pm_begin,
 	.prepare	= lite5200_pm_prepare,
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pm.c b/arch/powerpc/platforms/52xx/mpc52xx_pm.c
index 568cef636275..8310e8b5b57f 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pm.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pm.c
@@ -186,7 +186,7 @@ void mpc52xx_pm_finish(void)
 	iounmap(mbar);
 }
 
-static struct platform_suspend_ops mpc52xx_pm_ops = {
+static const struct platform_suspend_ops mpc52xx_pm_ops = {
 	.valid		= mpc52xx_pm_valid,
 	.prepare	= mpc52xx_pm_prepare,
 	.enter		= mpc52xx_pm_enter,
diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c
index 75ae77f1af6a..fd4f2f2f19e6 100644
--- a/arch/powerpc/platforms/83xx/suspend.c
+++ b/arch/powerpc/platforms/83xx/suspend.c
@@ -311,7 +311,7 @@ static int mpc83xx_is_pci_agent(void)
 	return ret;
 }
 
-static struct platform_suspend_ops mpc83xx_suspend_ops = {
+static const struct platform_suspend_ops mpc83xx_suspend_ops = {
 	.valid = mpc83xx_suspend_valid,
 	.begin = mpc83xx_suspend_begin,
 	.enter = mpc83xx_suspend_enter,
diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c
index ed72098bb4e3..a8ca289ff267 100644
--- a/arch/powerpc/platforms/pseries/suspend.c
+++ b/arch/powerpc/platforms/pseries/suspend.c
@@ -153,7 +153,7 @@ static struct sysdev_class suspend_sysdev_class = {
 	.name = "power",
 };
 
-static struct platform_suspend_ops pseries_suspend_ops = {
+static const struct platform_suspend_ops pseries_suspend_ops = {
 	.valid		= suspend_valid_only_mem,
 	.begin		= pseries_suspend_begin,
 	.prepare_late	= pseries_prepare_late,
diff --git a/arch/powerpc/sysdev/fsl_pmc.c b/arch/powerpc/sysdev/fsl_pmc.c
index 44de8559c975..e9381bfefb21 100644
--- a/arch/powerpc/sysdev/fsl_pmc.c
+++ b/arch/powerpc/sysdev/fsl_pmc.c
@@ -53,7 +53,7 @@ static int pmc_suspend_valid(suspend_state_t state)
 	return 1;
 }
 
-static struct platform_suspend_ops pmc_suspend_ops = {
+static const struct platform_suspend_ops pmc_suspend_ops = {
 	.valid = pmc_suspend_valid,
 	.enter = pmc_suspend_enter,
 };
diff --git a/arch/sh/boards/mach-hp6xx/pm.c b/arch/sh/boards/mach-hp6xx/pm.c
index 4499a3749d40..adc9b4bba828 100644
--- a/arch/sh/boards/mach-hp6xx/pm.c
+++ b/arch/sh/boards/mach-hp6xx/pm.c
@@ -143,7 +143,7 @@ static int hp6x0_pm_enter(suspend_state_t state)
 	return 0;
 }
 
-static struct platform_suspend_ops hp6x0_pm_ops = {
+static const struct platform_suspend_ops hp6x0_pm_ops = {
 	.enter		= hp6x0_pm_enter,
 	.valid		= suspend_valid_only_mem,
 };
diff --git a/arch/sh/kernel/cpu/shmobile/pm.c b/arch/sh/kernel/cpu/shmobile/pm.c
index e55968712706..a6f95ae4aae7 100644
--- a/arch/sh/kernel/cpu/shmobile/pm.c
+++ b/arch/sh/kernel/cpu/shmobile/pm.c
@@ -141,7 +141,7 @@ static int sh_pm_enter(suspend_state_t state)
 	return 0;
 }
 
-static struct platform_suspend_ops sh_pm_ops = {
+static const struct platform_suspend_ops sh_pm_ops = {
 	.enter          = sh_pm_enter,
 	.valid          = suspend_valid_only_mem,
 };
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 5149c9bf7015..ba1caf724a16 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -319,7 +319,7 @@ static int acpi_suspend_state_valid(suspend_state_t pm_state)
 	}
 }
 
-static struct platform_suspend_ops acpi_suspend_ops = {
+static const struct platform_suspend_ops acpi_suspend_ops = {
 	.valid = acpi_suspend_state_valid,
 	.begin = acpi_suspend_begin,
 	.prepare_late = acpi_pm_prepare,
@@ -347,7 +347,7 @@ static int acpi_suspend_begin_old(suspend_state_t pm_state)
  * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has
  * been requested.
  */
-static struct platform_suspend_ops acpi_suspend_ops_old = {
+static const struct platform_suspend_ops acpi_suspend_ops_old = {
 	.valid = acpi_suspend_state_valid,
 	.begin = acpi_suspend_begin_old,
 	.prepare_late = acpi_pm_pre_suspend,
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index cd29c8248386..8b021eb0d48c 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -2257,7 +2257,7 @@ static int pmu_sleep_valid(suspend_state_t state)
 		&& (pmac_call_feature(PMAC_FTR_SLEEP_STATE, NULL, 0, -1) >= 0);
 }
 
-static struct platform_suspend_ops pmu_pm_ops = {
+static const struct platform_suspend_ops pmu_pm_ops = {
 	.enter = powerbook_sleep,
 	.valid = pmu_sleep_valid,
 };
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 40ead943fd6a..f45f3ccfdfa9 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -122,7 +122,7 @@ struct platform_suspend_ops {
  * suspend_set_ops - set platform dependent suspend operations
  * @ops: The new suspend operations to set.
  */
-extern void suspend_set_ops(struct platform_suspend_ops *ops);
+extern void suspend_set_ops(const struct platform_suspend_ops *ops);
 extern int suspend_valid_only_mem(suspend_state_t state);
 
 /**
@@ -147,7 +147,7 @@ extern int pm_suspend(suspend_state_t state);
 #else /* !CONFIG_SUSPEND */
 #define suspend_valid_only_mem	NULL
 
-static inline void suspend_set_ops(struct platform_suspend_ops *ops) {}
+static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {}
 static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; }
 #endif /* !CONFIG_SUSPEND */
 
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7335952ee473..80051bdde6f1 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -30,13 +30,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
 	[PM_SUSPEND_MEM]	= "mem",
 };
 
-static struct platform_suspend_ops *suspend_ops;
+static const struct platform_suspend_ops *suspend_ops;
 
 /**
  *	suspend_set_ops - Set the global suspend method table.
  *	@ops:	Pointer to ops structure.
  */
-void suspend_set_ops(struct platform_suspend_ops *ops)
+void suspend_set_ops(const struct platform_suspend_ops *ops)
 {
 	mutex_lock(&pm_mutex);
 	suspend_ops = ops;
-- 
cgit v1.3-8-gc7d7


From fa9f90be745d3b600a9d97a063be404c5e5d9071 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Sun, 28 Nov 2010 21:39:34 +0100
Subject: Kill off a bunch of warning: ‘inline’ is not at beginning of
 declaration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These warnings are spewed during a build of a 'allnoconfig' kernel
(especially the ones from u64_stats_sync.h show up a lot) when building
with -Wextra (which I often do)..
They are
  a) annoying
  b) easy to get rid of.
This patch kills them off.

include/linux/u64_stats_sync.h:70:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:77:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:84:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:96:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:115:1: warning: ‘inline’ is not at beginning of declaration
include/linux/u64_stats_sync.h:127:1: warning: ‘inline’ is not at beginning of declaration
kernel/time.c:241:1: warning: ‘inline’ is not at beginning of declaration
kernel/time.c:257:1: warning: ‘inline’ is not at beginning of declaration
kernel/perf_event.c:4513:1: warning: ‘inline’ is not at beginning of declaration
mm/page_alloc.c:4012:1: warning: ‘inline’ is not at beginning of declaration

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 include/linux/u64_stats_sync.h | 12 ++++++------
 kernel/perf_event.c            |  2 +-
 kernel/time.c                  |  4 ++--
 mm/page_alloc.c                |  2 +-
 4 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h
index fa261a0da280..8da8c4e87da3 100644
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -67,21 +67,21 @@ struct u64_stats_sync {
 #endif
 };
 
-static void inline u64_stats_update_begin(struct u64_stats_sync *syncp)
+static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	write_seqcount_begin(&syncp->seq);
 #endif
 }
 
-static void inline u64_stats_update_end(struct u64_stats_sync *syncp)
+static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	write_seqcount_end(&syncp->seq);
 #endif
 }
 
-static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
+static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_begin(&syncp->seq);
@@ -93,7 +93,7 @@ static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *sy
 #endif
 }
 
-static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
+static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
@@ -112,7 +112,7 @@ static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
  * - UP 32bit must disable BH.
  * - 64bit have no problem atomically reading u64 values, irq safe.
  */
-static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp)
+static inline unsigned int u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
 	return read_seqcount_begin(&syncp->seq);
@@ -124,7 +124,7 @@ static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync
 #endif
 }
 
-static bool inline u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp,
+static inline bool u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp,
 					 unsigned int start)
 {
 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 517d827f4982..06682e7b12e2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4454,7 +4454,7 @@ int perf_swevent_get_recursion_context(void)
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
-void inline perf_swevent_put_recursion_context(int rctx)
+inline void perf_swevent_put_recursion_context(int rctx)
 {
 	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
 
diff --git a/kernel/time.c b/kernel/time.c
index ba9b338d1835..32174359576f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time);
  * Avoid unnecessary multiplications/divisions in the
  * two most common HZ cases:
  */
-unsigned int inline jiffies_to_msecs(const unsigned long j)
+inline unsigned int jiffies_to_msecs(const unsigned long j)
 {
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
 	return (MSEC_PER_SEC / HZ) * j;
@@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j)
 }
 EXPORT_SYMBOL(jiffies_to_msecs);
 
-unsigned int inline jiffies_to_usecs(const unsigned long j)
+inline unsigned int jiffies_to_usecs(const unsigned long j)
 {
 #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
 	return (USEC_PER_SEC / HZ) * j;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 07a654486f75..f823ee192e94 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4013,7 +4013,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
 		zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
 }
 #else
-static void inline setup_usemap(struct pglist_data *pgdat,
+static inline void setup_usemap(struct pglist_data *pgdat,
 				struct zone *zone, unsigned long zonesize) {}
 #endif /* CONFIG_SPARSEMEM */
 
-- 
cgit v1.3-8-gc7d7


From 41263fc6716dea402125c95f38ed83ebf59d5172 Mon Sep 17 00:00:00 2001
From: Ben Gardiner <bengardiner@nanometrics.ca>
Date: Tue, 14 Dec 2010 11:39:44 -0500
Subject: kbuild: fix interaction of CONFIG_IKCONFIG and KCONFIG_CONFIG

If you try to build a kernel with KCONFIG_CONFIG set (to a value
not equal to .config) and that config sets CONFIG_IKCONFIG then the
build will fail with:

make[1]: *** No rule to make target `.config', needed by \
`kernel/config_data.gz'.  Stop.

because the kernel/Makefile contains a direct reference to .config.

This issue has been present since the introduction of KCONFIG_CONFIG
in 14cdd3c402bf7c66f0bcd76e290f0770a54a4b21.

Signed-off-by: Ben Gardiner <bengardiner@nanometrics.ca>
CC: Roman Zippel <zippel@linux-m68k.org>
CC: Michal Marek <mmarek@suse.cz>
Reviewed-by: Michal Marek <mmarek@suse.cz>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 Makefile        | 1 +
 kernel/Makefile | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Makefile b/Makefile
index 6619720f50dd..ee77a3d88e0a 100644
--- a/Makefile
+++ b/Makefile
@@ -224,6 +224,7 @@ ifeq ($(ARCH),m68knommu)
 endif
 
 KCONFIG_CONFIG	?= .config
+export KCONFIG_CONFIG
 
 # SHELL used by kbuild
 CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \
diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..33e0a39cf359 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -121,7 +121,7 @@ $(obj)/configs.o: $(obj)/config_data.h
 # config_data.h contains the same information as ikconfig.h but gzipped.
 # Info from config_data can be extracted from /proc/config*
 targets += config_data.gz
-$(obj)/config_data.gz: .config FORCE
+$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
 	$(call if_changed,gzip)
 
 quiet_cmd_ikconfiggz = IKCFG   $@
-- 
cgit v1.3-8-gc7d7


From 43b210139a3cb09d49a18f0dc9bed3674c55f235 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Wed, 22 Dec 2010 19:01:47 +0100
Subject: hrtimer: fix a typo in comment

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/hrtimer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 72206cf5c6cf..66163dfe810b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1774,7 +1774,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
 	}
 
 	/*
-	 * A NULL parameter means "inifinte"
+	 * A NULL parameter means "infinite"
 	 */
 	if (!expires) {
 		schedule();
-- 
cgit v1.3-8-gc7d7


From 133f1128b2bf178a1976b17c54bd14ce6feb90bf Mon Sep 17 00:00:00 2001
From: Tracey Dent <tdent48227@gmail.com>
Date: Thu, 25 Nov 2010 23:41:29 +0100
Subject: PM: Use proper ccflag flag in kernel/power/Makefile

Use the ccflags-$ flag instead of EXTRA_CFLAGS because EXTRA_CFLAGS is
deprecated and should now be switched.  According to
(documentation/kbuild/makefiles.txt).

Signed-off-by: Tracey Dent <tdent48227@gmail.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/Makefile | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..b75597235d85 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,4 @@
-
-ifeq ($(CONFIG_PM_DEBUG),y)
-EXTRA_CFLAGS	+=	-DDEBUG
-endif
+ccflags-$(CONFIG_PM_DEBUG)	:=	-DDEBUG
 
 obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_PM_SLEEP)		+= console.o
-- 
cgit v1.3-8-gc7d7


From 8cfe400ca54fd1ed96f962bea5f7e20b09b6d69f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 26 Nov 2010 23:07:27 +0100
Subject: Freezer: Fix a race during freezing of TASK_STOPPED tasks

After calling freeze_task(), try_to_freeze_tasks() see whether the
task is stopped or traced and if so, considers it to be frozen;
however, nothing guarantees that either the task being frozen sees
TIF_FREEZE or the freezer sees TASK_STOPPED -> TASK_RUNNING
transition.  The task being frozen may wake up and not see TIF_FREEZE
while the freezer fails to notice the transition and believes the task
is still stopped.

This patch fixes the race by making freeze_task() always go through
fake_signal_wake_up() for applicable tasks.  The function goes through
the target task's scheduler lock and thus guarantees that either the
target sees TIF_FREEZE or try_to_freeze_task() sees TASK_RUNNING.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/freezer.c       | 9 +++++++--
 kernel/power/process.c | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/freezer.c b/kernel/freezer.c
index bd1d42b17cb2..66ecd2ead215 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only)
 	}
 
 	if (should_send_signal(p)) {
-		if (!signal_pending(p))
-			fake_signal_wake_up(p);
+		fake_signal_wake_up(p);
+		/*
+		 * fake_signal_wake_up() goes through p's scheduler
+		 * lock and guarantees that TASK_STOPPED/TRACED ->
+		 * TASK_RUNNING transition can't race with task state
+		 * testing in try_to_freeze_tasks().
+		 */
 	} else if (sig_only) {
 		return false;
 	} else {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index e50b4c1b2a0f..eb2c88a9e562 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only)
 			 * perturb a task in TASK_STOPPED or TASK_TRACED.
 			 * It is "frozen enough".  If the task does wake
 			 * up, it will immediately call try_to_freeze.
+			 *
+			 * Because freeze_task() goes through p's
+			 * scheduler lock after setting TIF_FREEZE, it's
+			 * guaranteed that either we see TASK_RUNNING or
+			 * try_to_stop() after schedule() in ptrace/signal
+			 * stop sees TIF_FREEZE.
 			 */
 			if (!task_is_stopped_or_traced(p) &&
 			    !freezer_should_skip(p))
-- 
cgit v1.3-8-gc7d7


From 5729c63a51f0f8a351e0f1dc7b3250ebac12c309 Mon Sep 17 00:00:00 2001
From: MyungJoo Ham <myungjoo.ham@samsung.com>
Date: Fri, 26 Nov 2010 23:07:48 +0100
Subject: PM / Hibernate: hibernation_ops->leave should be checked too

Because hibernate calls hibernation_ops->leave() without checking
whether hibernation_ops->leave is NULL or not, hiberantion_set_ops
should WARN_ON if hibernation_ops->leave is NULL.

This patch added one more condition to check hibernation_ops->leave.

Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 048d0b514831..ab2836c25038 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -62,7 +62,7 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
 {
 	if (ops && !(ops->begin && ops->end &&  ops->pre_snapshot
 	    && ops->prepare && ops->finish && ops->enter && ops->pre_restore
-	    && ops->restore_cleanup)) {
+	    && ops->restore_cleanup && ops->leave)) {
 		WARN_ON(1);
 		return;
 	}
-- 
cgit v1.3-8-gc7d7


From 5262a47502adcfc3a64403120768f528418a3b79 Mon Sep 17 00:00:00 2001
From: MyungJoo Ham <myungjoo.ham@samsung.com>
Date: Fri, 26 Nov 2010 23:07:56 +0100
Subject: PM / Hibernate: When failed, in_suspend should be reset

When hibernation failed due to an error in swsusp_write() called by
hibernate(), it skips calling "power_down()" and returns. When
hibernate() is called again (probably after fixing up so that
swsusp_write() wouldn't fail again), before "in_suspend = 1" of
create_image is called, in_suspend should be 0. However, because
hibernate() did not reset "in_suspend" after a failure, it's already 1.

This patch fixes such inconsistency of "in_suspend" value.

Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Signed-off-by: Kyungmin Park <kyungmin.park@samsung.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index ab2836c25038..c9a98beffee4 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -647,6 +647,7 @@ int hibernate(void)
 		swsusp_free();
 		if (!error)
 			power_down();
+		in_suspend = 0;
 		pm_restore_gfp_mask();
 	} else {
 		pr_debug("PM: Image restored successfully.\n");
-- 
cgit v1.3-8-gc7d7


From a2867e08c8e3bdbc00caf56bc3bdde19ccc058e3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 3 Dec 2010 22:58:31 +0100
Subject: PM / Wakeup: Replace pm_check_wakeup_events() with
 pm_wakeup_pending()

To avoid confusion with the meaning and return value of
pm_check_wakeup_events() replace it with pm_wakeup_pending() that
will work the other way around (ie. return true when system-wide
power transition should be aborted).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/base/power/main.c   |  2 +-
 drivers/base/power/wakeup.c | 20 ++++++++++----------
 include/linux/suspend.h     |  4 ++--
 kernel/power/hibernate.c    |  4 ++--
 kernel/power/process.c      |  2 +-
 kernel/power/suspend.c      |  2 +-
 6 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 4747a1e8b44a..8a5258339ca2 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1056,7 +1056,7 @@ static int dpm_prepare(pm_message_t state)
 		if (pm_runtime_barrier(dev) && device_may_wakeup(dev))
 			pm_wakeup_event(dev, 0);
 
-		if (!pm_check_wakeup_events()) {
+		if (pm_wakeup_pending()) {
 			pm_runtime_put_sync(dev);
 			error = -EBUSY;
 		} else {
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 71c5528e1c35..8ec406d8f548 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -542,26 +542,26 @@ static void pm_wakeup_update_hit_counts(void)
 }
 
 /**
- * pm_check_wakeup_events - Check for new wakeup events.
+ * pm_wakeup_pending - Check if power transition in progress should be aborted.
  *
  * Compare the current number of registered wakeup events with its preserved
- * value from the past to check if new wakeup events have been registered since
- * the old value was stored.  Check if the current number of wakeup events being
- * processed is zero.
+ * value from the past and return true if new wakeup events have been registered
+ * since the old value was stored.  Also return true if the current number of
+ * wakeup events being processed is different from zero.
  */
-bool pm_check_wakeup_events(void)
+bool pm_wakeup_pending(void)
 {
 	unsigned long flags;
-	bool ret = true;
+	bool ret = false;
 
 	spin_lock_irqsave(&events_lock, flags);
 	if (events_check_enabled) {
-		ret = ((unsigned int)atomic_read(&event_count) == saved_count)
-			&& !atomic_read(&events_in_progress);
-		events_check_enabled = ret;
+		ret = ((unsigned int)atomic_read(&event_count) != saved_count)
+			|| atomic_read(&events_in_progress);
+		events_check_enabled = !ret;
 	}
 	spin_unlock_irqrestore(&events_lock, flags);
-	if (!ret)
+	if (ret)
 		pm_wakeup_update_hit_counts();
 	return ret;
 }
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 26697514c5ec..144b34be5c32 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -292,7 +292,7 @@ extern int unregister_pm_notifier(struct notifier_block *nb);
 /* drivers/base/power/wakeup.c */
 extern bool events_check_enabled;
 
-extern bool pm_check_wakeup_events(void);
+extern bool pm_wakeup_pending(void);
 extern bool pm_get_wakeup_count(unsigned int *count);
 extern bool pm_save_wakeup_count(unsigned int count);
 #else /* !CONFIG_PM_SLEEP */
@@ -309,7 +309,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb)
 
 #define pm_notifier(fn, pri)	do { (void)(fn); } while (0)
 
-static inline bool pm_check_wakeup_events(void) { return true; }
+static inline bool pm_wakeup_pending(void) { return false; }
 #endif /* !CONFIG_PM_SLEEP */
 
 extern struct mutex pm_mutex;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c9a98beffee4..870f72bc72ae 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -278,7 +278,7 @@ static int create_image(int platform_mode)
 		goto Enable_irqs;
 	}
 
-	if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
+	if (hibernation_test(TEST_CORE) || pm_wakeup_pending())
 		goto Power_up;
 
 	in_suspend = 1;
@@ -516,7 +516,7 @@ int hibernation_platform_enter(void)
 
 	local_irq_disable();
 	sysdev_suspend(PMSG_HIBERNATE);
-	if (!pm_check_wakeup_events()) {
+	if (pm_wakeup_pending()) {
 		error = -EAGAIN;
 		goto Power_up;
 	}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index eb2c88a9e562..d6d2a10320e0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -85,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only)
 		if (!todo || time_after(jiffies, end_time))
 			break;
 
-		if (!pm_check_wakeup_events()) {
+		if (pm_wakeup_pending()) {
 			wakeup = true;
 			break;
 		}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ecf770509d0d..5e644e3a6bf3 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -163,7 +163,7 @@ static int suspend_enter(suspend_state_t state)
 
 	error = sysdev_suspend(PMSG_SUSPEND);
 	if (!error) {
-		if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
+		if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
 			error = suspend_ops->enter(state);
 			events_check_enabled = false;
 		}
-- 
cgit v1.3-8-gc7d7


From 26fcaf60fe3861409eb4c455c5c0d0f00f599b08 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jslaby@suse.cz>
Date: Fri, 7 Jan 2011 01:42:31 +0100
Subject: PM: Fix oops in suspend/hibernate code related to failing ioremap()

When ioremap() fails (which might happen for some reason), we nicely
oops in suspend_nvs_save() due to NULL dereference by memcpy() in there.
Fail gracefully instead.

Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/sleep.c    | 5 ++---
 include/linux/suspend.h | 4 ++--
 kernel/power/nvs.c      | 8 +++++++-
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index febb153b5a68..d8bca6c90719 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -124,8 +124,7 @@ static int acpi_pm_freeze(void)
 static int acpi_pm_pre_suspend(void)
 {
 	acpi_pm_freeze();
-	suspend_nvs_save();
-	return 0;
+	return suspend_nvs_save();
 }
 
 /**
@@ -151,7 +150,7 @@ static int acpi_pm_prepare(void)
 {
 	int error = __acpi_pm_prepare();
 	if (!error)
-		acpi_pm_pre_suspend();
+		error = acpi_pm_pre_suspend();
 
 	return error;
 }
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 26697514c5ec..acb7d911bb0c 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -262,7 +262,7 @@ static inline bool system_entering_hibernation(void) { return false; }
 extern int suspend_nvs_register(unsigned long start, unsigned long size);
 extern int suspend_nvs_alloc(void);
 extern void suspend_nvs_free(void);
-extern void suspend_nvs_save(void);
+extern int suspend_nvs_save(void);
 extern void suspend_nvs_restore(void);
 #else /* CONFIG_SUSPEND_NVS */
 static inline int suspend_nvs_register(unsigned long a, unsigned long b)
@@ -271,7 +271,7 @@ static inline int suspend_nvs_register(unsigned long a, unsigned long b)
 }
 static inline int suspend_nvs_alloc(void) { return 0; }
 static inline void suspend_nvs_free(void) {}
-static inline void suspend_nvs_save(void) {}
+static inline int suspend_nvs_save(void) {}
 static inline void suspend_nvs_restore(void) {}
 #endif /* CONFIG_SUSPEND_NVS */
 
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
index 1836db60bbb6..57c6fabbb6b6 100644
--- a/kernel/power/nvs.c
+++ b/kernel/power/nvs.c
@@ -105,7 +105,7 @@ int suspend_nvs_alloc(void)
 /**
  *	suspend_nvs_save - save NVS memory regions
  */
-void suspend_nvs_save(void)
+int suspend_nvs_save(void)
 {
 	struct nvs_page *entry;
 
@@ -114,8 +114,14 @@ void suspend_nvs_save(void)
 	list_for_each_entry(entry, &nvs_list, node)
 		if (entry->data) {
 			entry->kaddr = ioremap(entry->phys_start, entry->size);
+			if (!entry->kaddr) {
+				suspend_nvs_free();
+				return -ENOMEM;
+			}
 			memcpy(entry->data, entry->kaddr, entry->size);
 		}
+
+	return 0;
 }
 
 /**
-- 
cgit v1.3-8-gc7d7


From 976513dbfc1547c7b1822566923058655f0c32fd Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 7 Jan 2011 01:43:44 +0100
Subject: PM / ACPI: Move NVS saving and restoring code to drivers/acpi

The saving of the ACPI NVS area during hibernation and suspend and
restoring it during the subsequent resume is entirely specific to
ACPI, so move it to drivers/acpi and drop the CONFIG_SUSPEND_NVS
configuration option which is redundant.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/e820.c  |   1 +
 drivers/acpi/Makefile   |   2 +-
 drivers/acpi/internal.h |   8 +++
 drivers/acpi/nvs.c      | 142 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/acpi.h    |   9 +++
 include/linux/suspend.h |  17 ------
 kernel/power/Kconfig    |   5 --
 kernel/power/Makefile   |   1 -
 kernel/power/nvs.c      | 142 ------------------------------------------------
 9 files changed, 161 insertions(+), 166 deletions(-)
 create mode 100644 drivers/acpi/nvs.c
 delete mode 100644 kernel/power/nvs.c

(limited to 'kernel')

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0c2b7ef7a34d..294f26da0c0c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
 #include <linux/suspend.h>
+#include <linux/acpi.h>
 #include <linux/firmware-map.h>
 #include <linux/memblock.h>
 
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 3d031d02e54b..9cc9f2c4da79 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -24,7 +24,7 @@ acpi-y				+= atomicio.o
 # sleep related files
 acpi-y				+= wakeup.o
 acpi-y				+= sleep.o
-acpi-$(CONFIG_ACPI_SLEEP)	+= proc.o
+acpi-$(CONFIG_ACPI_SLEEP)	+= proc.o nvs.o
 
 
 #
diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index a212bfeddf8c..7c23b76e8eca 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -82,8 +82,16 @@ extern int acpi_sleep_init(void);
 
 #ifdef CONFIG_ACPI_SLEEP
 int acpi_sleep_proc_init(void);
+int suspend_nvs_alloc(void);
+void suspend_nvs_free(void);
+int suspend_nvs_save(void);
+void suspend_nvs_restore(void);
 #else
 static inline int acpi_sleep_proc_init(void) { return 0; }
+static inline int suspend_nvs_alloc(void) { return 0; }
+static inline void suspend_nvs_free(void) {}
+static inline int suspend_nvs_save(void) {}
+static inline void suspend_nvs_restore(void) {}
 #endif
 
 #endif /* _ACPI_INTERNAL_H_ */
diff --git a/drivers/acpi/nvs.c b/drivers/acpi/nvs.c
new file mode 100644
index 000000000000..57c6fabbb6b6
--- /dev/null
+++ b/drivers/acpi/nvs.c
@@ -0,0 +1,142 @@
+/*
+ * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
+ *
+ * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/suspend.h>
+
+/*
+ * Platforms, like ACPI, may want us to save some memory used by them during
+ * suspend and to restore the contents of this memory during the subsequent
+ * resume.  The code below implements a mechanism allowing us to do that.
+ */
+
+struct nvs_page {
+	unsigned long phys_start;
+	unsigned int size;
+	void *kaddr;
+	void *data;
+	struct list_head node;
+};
+
+static LIST_HEAD(nvs_list);
+
+/**
+ *	suspend_nvs_register - register platform NVS memory region to save
+ *	@start - physical address of the region
+ *	@size - size of the region
+ *
+ *	The NVS region need not be page-aligned (both ends) and we arrange
+ *	things so that the data from page-aligned addresses in this region will
+ *	be copied into separate RAM pages.
+ */
+int suspend_nvs_register(unsigned long start, unsigned long size)
+{
+	struct nvs_page *entry, *next;
+
+	while (size > 0) {
+		unsigned int nr_bytes;
+
+		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
+		if (!entry)
+			goto Error;
+
+		list_add_tail(&entry->node, &nvs_list);
+		entry->phys_start = start;
+		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
+		entry->size = (size < nr_bytes) ? size : nr_bytes;
+
+		start += entry->size;
+		size -= entry->size;
+	}
+	return 0;
+
+ Error:
+	list_for_each_entry_safe(entry, next, &nvs_list, node) {
+		list_del(&entry->node);
+		kfree(entry);
+	}
+	return -ENOMEM;
+}
+
+/**
+ *	suspend_nvs_free - free data pages allocated for saving NVS regions
+ */
+void suspend_nvs_free(void)
+{
+	struct nvs_page *entry;
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data) {
+			free_page((unsigned long)entry->data);
+			entry->data = NULL;
+			if (entry->kaddr) {
+				iounmap(entry->kaddr);
+				entry->kaddr = NULL;
+			}
+		}
+}
+
+/**
+ *	suspend_nvs_alloc - allocate memory necessary for saving NVS regions
+ */
+int suspend_nvs_alloc(void)
+{
+	struct nvs_page *entry;
+
+	list_for_each_entry(entry, &nvs_list, node) {
+		entry->data = (void *)__get_free_page(GFP_KERNEL);
+		if (!entry->data) {
+			suspend_nvs_free();
+			return -ENOMEM;
+		}
+	}
+	return 0;
+}
+
+/**
+ *	suspend_nvs_save - save NVS memory regions
+ */
+int suspend_nvs_save(void)
+{
+	struct nvs_page *entry;
+
+	printk(KERN_INFO "PM: Saving platform NVS memory\n");
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data) {
+			entry->kaddr = ioremap(entry->phys_start, entry->size);
+			if (!entry->kaddr) {
+				suspend_nvs_free();
+				return -ENOMEM;
+			}
+			memcpy(entry->data, entry->kaddr, entry->size);
+		}
+
+	return 0;
+}
+
+/**
+ *	suspend_nvs_restore - restore NVS memory regions
+ *
+ *	This function is going to be called with interrupts disabled, so it
+ *	cannot iounmap the virtual addresses used to access the NVS region.
+ */
+void suspend_nvs_restore(void)
+{
+	struct nvs_page *entry;
+
+	printk(KERN_INFO "PM: Restoring platform NVS memory\n");
+
+	list_for_each_entry(entry, &nvs_list, node)
+		if (entry->data)
+			memcpy(entry->kaddr, entry->data, entry->size);
+}
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 67c91b4418b0..fa7ed6a983d0 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -254,6 +254,15 @@ void __init acpi_old_suspend_ordering(void);
 void __init acpi_nvs_nosave(void);
 #endif /* CONFIG_PM_SLEEP */
 
+#ifdef CONFIG_ACPI_SLEEP
+int suspend_nvs_register(unsigned long start, unsigned long size);
+#else
+static inline int suspend_nvs_register(unsigned long a, unsigned long b)
+{
+	return 0;
+}
+#endif
+
 struct acpi_osc_context {
 	char *uuid_str; /* uuid string */
 	int rev;
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index acb7d911bb0c..0e288e3c37be 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -258,23 +258,6 @@ static inline int hibernate(void) { return -ENOSYS; }
 static inline bool system_entering_hibernation(void) { return false; }
 #endif /* CONFIG_HIBERNATION */
 
-#ifdef CONFIG_SUSPEND_NVS
-extern int suspend_nvs_register(unsigned long start, unsigned long size);
-extern int suspend_nvs_alloc(void);
-extern void suspend_nvs_free(void);
-extern int suspend_nvs_save(void);
-extern void suspend_nvs_restore(void);
-#else /* CONFIG_SUSPEND_NVS */
-static inline int suspend_nvs_register(unsigned long a, unsigned long b)
-{
-	return 0;
-}
-static inline int suspend_nvs_alloc(void) { return 0; }
-static inline void suspend_nvs_free(void) {}
-static inline int suspend_nvs_save(void) {}
-static inline void suspend_nvs_restore(void) {}
-#endif /* CONFIG_SUSPEND_NVS */
-
 #ifdef CONFIG_PM_SLEEP
 void save_processor_state(void);
 void restore_processor_state(void);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index a5aff3ebad38..265729966ece 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG
 	depends on PM_ADVANCED_DEBUG
 	default n
 
-config SUSPEND_NVS
-       bool
-
 config SUSPEND
 	bool "Suspend to RAM and standby"
 	depends on PM && ARCH_SUSPEND_POSSIBLE
-	select SUSPEND_NVS if HAS_IOMEM
 	default y
 	---help---
 	  Allow the system to enter sleep states in which main memory is
@@ -140,7 +136,6 @@ config HIBERNATION
 	depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
-	select SUSPEND_NVS if HAS_IOMEM
 	---help---
 	  Enable the suspend to disk (STD) functionality, which is usually
 	  called "hibernation" in user interfaces.  STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index f9063c6b185d..120a15823325 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -10,6 +10,5 @@ obj-$(CONFIG_SUSPEND)		+= suspend.o
 obj-$(CONFIG_PM_TEST_SUSPEND)	+= suspend_test.o
 obj-$(CONFIG_HIBERNATION)	+= hibernate.o snapshot.o swap.o user.o \
 				   block_io.o
-obj-$(CONFIG_SUSPEND_NVS)	+= nvs.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c
deleted file mode 100644
index 57c6fabbb6b6..000000000000
--- a/kernel/power/nvs.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory
- *
- * Copyright (C) 2008,2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
- *
- * This file is released under the GPLv2.
- */
-
-#include <linux/io.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/suspend.h>
-
-/*
- * Platforms, like ACPI, may want us to save some memory used by them during
- * suspend and to restore the contents of this memory during the subsequent
- * resume.  The code below implements a mechanism allowing us to do that.
- */
-
-struct nvs_page {
-	unsigned long phys_start;
-	unsigned int size;
-	void *kaddr;
-	void *data;
-	struct list_head node;
-};
-
-static LIST_HEAD(nvs_list);
-
-/**
- *	suspend_nvs_register - register platform NVS memory region to save
- *	@start - physical address of the region
- *	@size - size of the region
- *
- *	The NVS region need not be page-aligned (both ends) and we arrange
- *	things so that the data from page-aligned addresses in this region will
- *	be copied into separate RAM pages.
- */
-int suspend_nvs_register(unsigned long start, unsigned long size)
-{
-	struct nvs_page *entry, *next;
-
-	while (size > 0) {
-		unsigned int nr_bytes;
-
-		entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL);
-		if (!entry)
-			goto Error;
-
-		list_add_tail(&entry->node, &nvs_list);
-		entry->phys_start = start;
-		nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK);
-		entry->size = (size < nr_bytes) ? size : nr_bytes;
-
-		start += entry->size;
-		size -= entry->size;
-	}
-	return 0;
-
- Error:
-	list_for_each_entry_safe(entry, next, &nvs_list, node) {
-		list_del(&entry->node);
-		kfree(entry);
-	}
-	return -ENOMEM;
-}
-
-/**
- *	suspend_nvs_free - free data pages allocated for saving NVS regions
- */
-void suspend_nvs_free(void)
-{
-	struct nvs_page *entry;
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data) {
-			free_page((unsigned long)entry->data);
-			entry->data = NULL;
-			if (entry->kaddr) {
-				iounmap(entry->kaddr);
-				entry->kaddr = NULL;
-			}
-		}
-}
-
-/**
- *	suspend_nvs_alloc - allocate memory necessary for saving NVS regions
- */
-int suspend_nvs_alloc(void)
-{
-	struct nvs_page *entry;
-
-	list_for_each_entry(entry, &nvs_list, node) {
-		entry->data = (void *)__get_free_page(GFP_KERNEL);
-		if (!entry->data) {
-			suspend_nvs_free();
-			return -ENOMEM;
-		}
-	}
-	return 0;
-}
-
-/**
- *	suspend_nvs_save - save NVS memory regions
- */
-int suspend_nvs_save(void)
-{
-	struct nvs_page *entry;
-
-	printk(KERN_INFO "PM: Saving platform NVS memory\n");
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data) {
-			entry->kaddr = ioremap(entry->phys_start, entry->size);
-			if (!entry->kaddr) {
-				suspend_nvs_free();
-				return -ENOMEM;
-			}
-			memcpy(entry->data, entry->kaddr, entry->size);
-		}
-
-	return 0;
-}
-
-/**
- *	suspend_nvs_restore - restore NVS memory regions
- *
- *	This function is going to be called with interrupts disabled, so it
- *	cannot iounmap the virtual addresses used to access the NVS region.
- */
-void suspend_nvs_restore(void)
-{
-	struct nvs_page *entry;
-
-	printk(KERN_INFO "PM: Restoring platform NVS memory\n");
-
-	list_for_each_entry(entry, &nvs_list, node)
-		if (entry->data)
-			memcpy(entry->kaddr, entry->data, entry->size);
-}
-- 
cgit v1.3-8-gc7d7


From 23036f1a340beec19cc451ba9719526c4ffb3a57 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 7 Jan 2011 08:53:58 +0100
Subject: blktrace: add missing probe argument to block_bio_complete

blktrace.c block bio complete callback needs to gain a new argument to reflect
the newly added "error" tracepoint argument. This is needed to match the new
block_bio_complete TRACE_EVENT as of
commit de983a7bfcb7c020901ca6e2314cf55a4207ab5a.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Jeff Moyer <jmoyer@redhat.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2b8e2ee7c0ef..fab31aff9d97 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -782,7 +782,8 @@ static void blk_add_trace_bio_bounce(void *ignore,
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
-				       struct request_queue *q, struct bio *bio)
+				       struct request_queue *q, struct bio *bio,
+				       int error)
 {
 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
 }
-- 
cgit v1.3-8-gc7d7


From 0b3fcf178deefd7b64154c2c0760a2c63df0b74f Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 3 Jan 2011 18:20:01 +0200
Subject: perf_events: Move code around to prepare for cgroup

In particular this patch move perf_event_exit_task() before
cgroup_exit() to allow for cgroup support. The cgroup_exit()
function detaches the cgroups attached to a task.

Other movements include hoisting some definitions and inlines
at the top of perf_event.c

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d22058b.cdace30a.4657.ffff95b1@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c       | 14 +++++++++-----
 kernel/perf_event.c | 28 +++++++++++++++++-----------
 2 files changed, 26 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 676149a4ac5f..8cb89045ecf3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code)
 	exit_fs(tsk);
 	check_stack_usage();
 	exit_thread();
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 *
+	 * because of cgroup mode, must be called before cgroup_exit()
+	 */
+	perf_event_exit_task(tsk);
+
 	cgroup_exit(tsk, 1);
 
 	if (group_dead)
@@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code)
 	 * FIXME: do that only when needed, using sched_exit tracepoint
 	 */
 	flush_ptrace_hw_breakpoint(tsk);
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_event_exit_task(tsk);
 
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 11847bf1e8cc..2c14e3afdf0d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -38,6 +38,12 @@
 
 #include <asm/irq_regs.h>
 
+enum event_type_t {
+	EVENT_FLEXIBLE = 0x1,
+	EVENT_PINNED = 0x2,
+	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
 atomic_t perf_task_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000;
 
 static atomic64_t perf_event_id;
 
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			      enum event_type_t event_type);
+
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+			     enum event_type_t event_type);
+
 void __weak perf_event_print_debug(void)	{ }
 
 extern __weak const char *perf_pmu_name(void)
@@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void)
 	return "pmu";
 }
 
+static inline u64 perf_clock(void)
+{
+	return local_clock();
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 	put_ctx(ctx);
 }
 
-static inline u64 perf_clock(void)
-{
-	return local_clock();
-}
-
 /*
  * Update the record of the current time in a context.
  */
@@ -1193,12 +1205,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	return 0;
 }
 
-enum event_type_t {
-	EVENT_FLEXIBLE = 0x1,
-	EVENT_PINNED = 0x2,
-	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
-};
-
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
 			  enum event_type_t event_type)
-- 
cgit v1.3-8-gc7d7


From 5632ab12e9e1fcd7e94058567e181d8f35e83798 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 3 Jan 2011 18:20:01 +0200
Subject: perf_events: Generalize use of event_filter_match()

Replace all occurrences of:
	event->cpu != -1 && event->cpu == smp_processor_id()
by a call to:
	event_filter_match(event)

This makes the code more consistent and will make the cgroup
patch smaller.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d220593.2308e30a.48c5.ffff8ae9@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2c14e3afdf0d..dcdb19ed83a6 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -949,7 +949,7 @@ static void __perf_install_in_context(void *info)
 
 	add_event_to_ctx(event, ctx);
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		goto unlock;
 
 	/*
@@ -1094,7 +1094,7 @@ static void __perf_event_enable(void *info)
 		goto unlock;
 	__perf_event_mark_enabled(event, ctx);
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		goto unlock;
 
 	/*
@@ -1441,7 +1441,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event))
 			continue;
 
 		if (group_can_go_on(event, cpuctx, 1))
@@ -1473,7 +1473,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event))
 			continue;
 
 		if (group_can_go_on(event, cpuctx, can_add_hw)) {
@@ -1700,7 +1700,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
-		if (event->cpu != -1 && event->cpu != smp_processor_id())
+		if (!event_filter_match(event))
 			continue;
 
 		hwc = &event->hw;
@@ -3899,7 +3899,7 @@ static int perf_event_task_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		return 0;
 
 	if (event->attr.comm || event->attr.mmap ||
@@ -4036,7 +4036,7 @@ static int perf_event_comm_match(struct perf_event *event)
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		return 0;
 
 	if (event->attr.comm)
@@ -4184,7 +4184,7 @@ static int perf_event_mmap_match(struct perf_event *event,
 	if (event->state < PERF_EVENT_STATE_INACTIVE)
 		return 0;
 
-	if (event->cpu != -1 && event->cpu != smp_processor_id())
+	if (!event_filter_match(event))
 		return 0;
 
 	if ((!executable && event->attr.mmap_data) ||
-- 
cgit v1.3-8-gc7d7


From 4158755d3136f4cb05c1a8a260e9c06f93baeb48 Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Mon, 3 Jan 2011 18:20:01 +0200
Subject: perf_events: Add perf_event_time()

Adds perf_event_time() to try and centralize access to event
timing and in particular ctx->time. Prepares for cgroup support.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4d22059c.122ae30a.5e0e.ffff8b8b@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index dcdb19ed83a6..b782b7a79f00 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -268,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx)
 	ctx->timestamp = now;
 }
 
+static u64 perf_event_time(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+	return ctx ? ctx->time : 0;
+}
+
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
  */
@@ -281,7 +287,7 @@ static void update_event_times(struct perf_event *event)
 		return;
 
 	if (ctx->is_active)
-		run_end = ctx->time;
+		run_end = perf_event_time(event);
 	else
 		run_end = event->tstamp_stopped;
 
@@ -290,7 +296,7 @@ static void update_event_times(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_INACTIVE)
 		run_end = event->tstamp_stopped;
 	else
-		run_end = ctx->time;
+		run_end = perf_event_time(event);
 
 	event->total_time_running = run_end - event->tstamp_running;
 }
@@ -546,6 +552,7 @@ event_sched_out(struct perf_event *event,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_event_context *ctx)
 {
+	u64 tstamp = perf_event_time(event);
 	u64 delta;
 	/*
 	 * An event which could not be activated because of
@@ -557,7 +564,7 @@ event_sched_out(struct perf_event *event,
 	    && !event_filter_match(event)) {
 		delta = ctx->time - event->tstamp_stopped;
 		event->tstamp_running += delta;
-		event->tstamp_stopped = ctx->time;
+		event->tstamp_stopped = tstamp;
 	}
 
 	if (event->state != PERF_EVENT_STATE_ACTIVE)
@@ -568,7 +575,7 @@ event_sched_out(struct perf_event *event,
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = ctx->time;
+	event->tstamp_stopped = tstamp;
 	event->pmu->del(event, 0);
 	event->oncpu = -1;
 
@@ -780,6 +787,8 @@ event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_event_context *ctx)
 {
+	u64 tstamp = perf_event_time(event);
+
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
@@ -796,9 +805,9 @@ event_sched_in(struct perf_event *event,
 		return -EAGAIN;
 	}
 
-	event->tstamp_running += ctx->time - event->tstamp_stopped;
+	event->tstamp_running += tstamp - event->tstamp_stopped;
 
-	event->shadow_ctx_time = ctx->time - ctx->timestamp;
+	event->shadow_ctx_time = tstamp - ctx->timestamp;
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu++;
@@ -910,11 +919,13 @@ static int group_can_go_on(struct perf_event *event,
 static void add_event_to_ctx(struct perf_event *event,
 			       struct perf_event_context *ctx)
 {
+	u64 tstamp = perf_event_time(event);
+
 	list_add_event(event, ctx);
 	perf_group_attach(event);
-	event->tstamp_enabled = ctx->time;
-	event->tstamp_running = ctx->time;
-	event->tstamp_stopped = ctx->time;
+	event->tstamp_enabled = tstamp;
+	event->tstamp_running = tstamp;
+	event->tstamp_stopped = tstamp;
 }
 
 /*
@@ -1054,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event,
 					struct perf_event_context *ctx)
 {
 	struct perf_event *sub;
+	u64 tstamp = perf_event_time(event);
 
 	event->state = PERF_EVENT_STATE_INACTIVE;
-	event->tstamp_enabled = ctx->time - event->total_time_enabled;
+	event->tstamp_enabled = tstamp - event->total_time_enabled;
 	list_for_each_entry(sub, &event->sibling_list, group_entry) {
-		if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
-			sub->tstamp_enabled =
-				ctx->time - sub->total_time_enabled;
-		}
+		if (sub->state >= PERF_EVENT_STATE_INACTIVE)
+			sub->tstamp_enabled = tstamp - sub->total_time_enabled;
 	}
 }
 
-- 
cgit v1.3-8-gc7d7


From 1dbd1951f39e13da579ffe879cce19586d0462de Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 9 Dec 2010 15:47:56 +0800
Subject: tracing: Fix preempt count leak

While running my ftrace stress test, this showed up:

BUG: sleeping function called from invalid context at mm/mmap.c:233
...
note: cat[3293] exited with preempt_count 1

The bug was introduced by commit 91e86e560d0b3ce4c5fc64fd2bbb99f856a30a4e
("tracing: Fix recursive user stack trace")

Cc: <stable@kernel.org>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4D0089AC.1020802@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f8cf959bad45..dc53ecb80589 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 
 	__this_cpu_inc(user_stack_count);
 
-
-
 	event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK,
 					  sizeof(*entry), flags, pc);
 	if (!event)
-		return;
+		goto out_drop_count;
 	entry	= ring_buffer_event_data(event);
 
 	entry->tgid		= current->tgid;
@@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
 	if (!filter_check_discard(call, entry, buffer, event))
 		ring_buffer_unlock_commit(buffer, event);
 
+ out_drop_count:
 	__this_cpu_dec(user_stack_count);
-
  out:
 	preempt_enable();
 }
-- 
cgit v1.3-8-gc7d7


From 870915e047a2da695be118d32dd5a900f0c3e933 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 28 Oct 2010 11:31:17 -0400
Subject: tracing: Fix TRACE_EVENT power tracepoint creation

DEFINE_TRACE should also exist when CONFIG_EVENT_TRACING=n. Otherwise, setting
only TRACEPOINTS=y is broken.

Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
LKML-Reference: <20101028153117.GA4051@Krystal>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/Makefile       | 1 +
 kernel/trace/Makefile | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 0b5ff083fa22..e0f2831634b4 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 53f338190b26..761c510a06c5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
-obj-$(CONFIG_EVENT_TRACING) += power-traces.o
+obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_TRACING),y)
 obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
-- 
cgit v1.3-8-gc7d7


From 797a455d2c682476c3797dbfecf5bf84c1e3b9d3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Mon, 10 Jan 2011 09:06:44 +0100
Subject: block: ensure that completion error gets properly traced

We normally just use the BIO_UPTODATE flag to signal 0/-EIO. If
we have more information available, we should pass that along to
the trace output.

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fab31aff9d97..153562d0b93c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -758,54 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore,
  * @q:		queue the io is for
  * @bio:	the source bio
  * @what:	the action
+ * @error:	error, if any
  *
  * Description:
  *     Records an action against a bio. Will log the bio offset + size.
  *
  **/
 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
+			      u32 what, int error)
 {
 	struct blk_trace *bt = q->blk_trace;
 
 	if (likely(!bt))
 		return;
 
+	if (!error && !bio_flagged(bio, BIO_UPTODATE))
+		error = EIO;
+
 	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
-			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+			error, 0, NULL);
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
 				     struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
 				       struct request_queue *q, struct bio *bio,
 				       int error)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
 static void blk_add_trace_bio_backmerge(void *ignore,
 					struct request_queue *q,
 					struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
 }
 
 static void blk_add_trace_bio_frontmerge(void *ignore,
 					 struct request_queue *q,
 					 struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
 }
 
 static void blk_add_trace_bio_queue(void *ignore,
 				    struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
 }
 
 static void blk_add_trace_getrq(void *ignore,
@@ -813,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore,
 				struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
@@ -828,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore,
 				  struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
-- 
cgit v1.3-8-gc7d7


From f123c98e7f168e949b283690693695f988332c3d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 6 Jan 2011 15:08:29 -0500
Subject: rtmutex: Fix comment about why new_owner can be NULL in
 wake_futex_pi()

The comment about why rt_mutex_next_owner() can return NULL in
wake_futex_pi() is not the normal case.

Tracing the cause of why this occurs is more likely that waiter
simply timedout. But because it originally caused contention with
the futex, the owner will go into the kernel when it unlocks
the lock. Then it will hit this code path and
rt_mutex_next_owner() will return NULL.

Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..5696d38cc71d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -791,10 +791,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
 	/*
-	 * This happens when we have stolen the lock and the original
-	 * pending owner did not enqueue itself back on the rt_mutex.
-	 * Thats not a tragedy. We know that way, that a lock waiter
-	 * is on the fly. We make the futex_q waiter the pending owner.
+	 * It is possible that the next waiter (the one that brought
+	 * this owner to the kernel) timed out and is no longer
+	 * waiting on the lock.
 	 */
 	if (!new_owner)
 		new_owner = this->task;
-- 
cgit v1.3-8-gc7d7


From e159489baa717dbae70f9903770a6a4990865887 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 9 Jan 2011 23:32:15 +0100
Subject: workqueue: relax lockdep annotation on flush_work()

Currently, the lockdep annotation in flush_work() requires exclusive
access on the workqueue the target work is queued on and triggers
warning if a work is trying to flush another work on the same
workqueue; however, this is no longer true as workqueues can now
execute multiple works concurrently.

This patch adds lock_map_acquire_read() and make process_one_work()
hold read access to the workqueue while executing a work and
start_flush_work() check for write access if concurrnecy level is one
or the workqueue has a rescuer (as only one execution resource - the
rescuer - is guaranteed to be available under memory pressure), and
read access if higher.

This better represents what's going on and removes spurious lockdep
warnings which are triggered by fake dependency chain created through
flush_work().

* Peter pointed out that flushing another work from a WQ_MEM_RECLAIM
  wq breaks forward progress guarantee under memory pressure.
  Condition check accordingly updated.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Tested-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@kernel.org
---
 include/linux/lockdep.h |  3 +++
 kernel/workqueue.c      | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 71c09b26c759..9f19430c7d07 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -522,12 +522,15 @@ static inline void print_irqtrace_events(struct task_struct *curr)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # ifdef CONFIG_PROVE_LOCKING
 #  define lock_map_acquire(l)		lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_)
+#  define lock_map_acquire_read(l)	lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_)
 # else
 #  define lock_map_acquire(l)		lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_)
+#  define lock_map_acquire_read(l)	lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_)
 # endif
 # define lock_map_release(l)			lock_release(l, 1, _THIS_IP_)
 #else
 # define lock_map_acquire(l)			do { } while (0)
+# define lock_map_acquire_read(l)		do { } while (0)
 # define lock_map_release(l)			do { } while (0)
 #endif
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8ee6ec82f88a..930c2390b77e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1840,7 +1840,7 @@ __acquires(&gcwq->lock)
 	spin_unlock_irq(&gcwq->lock);
 
 	work_clear_pending(work);
-	lock_map_acquire(&cwq->wq->lockdep_map);
+	lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_acquire(&lockdep_map);
 	trace_workqueue_execute_start(work);
 	f(work);
@@ -2384,8 +2384,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
 	insert_wq_barrier(cwq, barr, work, worker);
 	spin_unlock_irq(&gcwq->lock);
 
-	lock_map_acquire(&cwq->wq->lockdep_map);
+	/*
+	 * If @max_active is 1 or rescuer is in use, flushing another work
+	 * item on the same workqueue may lead to deadlock.  Make sure the
+	 * flusher is not running on the same workqueue by verifying write
+	 * access.
+	 */
+	if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
+		lock_map_acquire(&cwq->wq->lockdep_map);
+	else
+		lock_map_acquire_read(&cwq->wq->lockdep_map);
 	lock_map_release(&cwq->wq->lockdep_map);
+
 	return true;
 already_gone:
 	spin_unlock_irq(&gcwq->lock);
-- 
cgit v1.3-8-gc7d7


From 42c025f3de9042d9c9abd9a6f6205d1a0f4bcadf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Jan 2011 15:58:49 +0100
Subject: workqueue: note the nested NOT_RUNNING test in worker_clr_flags()
 isn't a noop

The nested NOT_RUNNING test in worker_clr_flags() is slightly
misleading in that if NOT_RUNNING were a single flag the nested test
would be always %true and thus noop.  Add a comment noting that the
test isn't a noop.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Hillf Danton <dhillf@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 kernel/workqueue.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 930c2390b77e..11869faa6819 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
 
 	worker->flags &= ~flags;
 
-	/* if transitioning out of NOT_RUNNING, increment nr_running */
+	/*
+	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
+	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
+	 * of multiple flags, not a single flag.
+	 */
 	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
 		if (!(worker->flags & WORKER_NOT_RUNNING))
 			atomic_inc(get_gcwq_nr_running(gcwq->cpu));
-- 
cgit v1.3-8-gc7d7


From 81e88fdc432a1552401d6e91a984dcccce72b8dc Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 12 Jan 2011 14:44:55 +0800
Subject: ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification
 type support

Generic Hardware Error Source provides a way to report platform
hardware errors (such as that from chipset). It works in so called
"Firmware First" mode, that is, hardware errors are reported to
firmware firstly, then reported to Linux by firmware. This way, some
non-standard hardware error registers or non-standard hardware link
can be checked by firmware to produce more valuable hardware error
information for Linux.

This patch adds POLL/IRQ/NMI notification types support.

Because the memory area used to transfer hardware error information
from BIOS to Linux can be determined only in NMI, IRQ or timer
handler, but general ioremap can not be used in atomic context, so a
special version of atomic ioremap is implemented for that.

Known issue:

- Error information can not be printed for recoverable errors notified
  via NMI, because printk is not NMI-safe. Will fix this via delay
  printing to IRQ context via irq_work or make printk NMI-safe.

v2:

- adjust printk format per comments.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Reviewed-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/acpi/boot.c |   1 +
 arch/x86/kernel/dumpstack.c |   1 +
 drivers/acpi/apei/ghes.c    | 406 +++++++++++++++++++++++++++++++++++---------
 kernel/panic.c              |   1 +
 lib/ioremap.c               |   2 +
 mm/vmalloc.c                |   1 +
 6 files changed, 328 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 71232b941b6c..c2d0baa48d8c 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -504,6 +504,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
 
 int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
 {
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6e8752c1bd52..d34cf80ec402 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -240,6 +240,7 @@ unsigned __kprobes long oops_begin(void)
 	bust_spinlocks(1);
 	return flags;
 }
+EXPORT_SYMBOL_GPL(oops_begin);
 
 void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 {
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 51905d07a4d9..d1d484d4a06a 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -12,10 +12,6 @@
  * For more information about Generic Hardware Error Source, please
  * refer to ACPI Specification version 4.0, section 17.3.2.6
  *
- * Now, only SCI notification type and memory errors are
- * supported. More notification type and hardware error type will be
- * added later.
- *
  * Copyright 2010 Intel Corp.
  *   Author: Huang Ying <ying.huang@intel.com>
  *
@@ -39,15 +35,18 @@
 #include <linux/acpi.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
+#include <linux/timer.h>
 #include <linux/cper.h>
 #include <linux/kdebug.h>
 #include <linux/platform_device.h>
 #include <linux/mutex.h>
 #include <linux/ratelimit.h>
+#include <linux/vmalloc.h>
 #include <acpi/apei.h>
 #include <acpi/atomicio.h>
 #include <acpi/hed.h>
 #include <asm/mce.h>
+#include <asm/tlbflush.h>
 
 #include "apei-internal.h"
 
@@ -56,42 +55,131 @@
 #define GHES_ESTATUS_MAX_SIZE		65536
 
 /*
- * One struct ghes is created for each generic hardware error
- * source.
- *
+ * One struct ghes is created for each generic hardware error source.
  * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
- * handler. Handler for one generic hardware error source is only
- * triggered after the previous one is done. So handler can uses
- * struct ghes without locking.
+ * handler.
  *
  * estatus: memory buffer for error status block, allocated during
  * HEST parsing.
  */
 #define GHES_TO_CLEAR		0x0001
+#define GHES_EXITING		0x0002
 
 struct ghes {
 	struct acpi_hest_generic *generic;
 	struct acpi_hest_generic_status *estatus;
-	struct list_head list;
 	u64 buffer_paddr;
 	unsigned long flags;
+	union {
+		struct list_head list;
+		struct timer_list timer;
+		unsigned int irq;
+	};
 };
 
+static int ghes_panic_timeout	__read_mostly = 30;
+
 /*
- * Error source lists, one list for each notification method. The
- * members in lists are struct ghes.
+ * All error sources notified with SCI shares one notifier function,
+ * so they need to be linked and checked one by one.  This is applied
+ * to NMI too.
  *
- * The list members are only added in HEST parsing and deleted during
- * module_exit, that is, single-threaded. So no lock is needed for
- * that.
- *
- * But the mutual exclusion is needed between members adding/deleting
- * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is
- * used for that.
+ * RCU is used for these lists, so ghes_list_mutex is only used for
+ * list changing, not for traversing.
  */
 static LIST_HEAD(ghes_sci);
+static LIST_HEAD(ghes_nmi);
 static DEFINE_MUTEX(ghes_list_mutex);
 
+/*
+ * NMI may be triggered on any CPU, so ghes_nmi_lock is used for
+ * mutual exclusion.
+ */
+static DEFINE_RAW_SPINLOCK(ghes_nmi_lock);
+
+/*
+ * Because the memory area used to transfer hardware error information
+ * from BIOS to Linux can be determined only in NMI, IRQ or timer
+ * handler, but general ioremap can not be used in atomic context, so
+ * a special version of atomic ioremap is implemented for that.
+ */
+
+/*
+ * Two virtual pages are used, one for NMI context, the other for
+ * IRQ/PROCESS context
+ */
+#define GHES_IOREMAP_PAGES		2
+#define GHES_IOREMAP_NMI_PAGE(base)	(base)
+#define GHES_IOREMAP_IRQ_PAGE(base)	((base) + PAGE_SIZE)
+
+/* virtual memory area for atomic ioremap */
+static struct vm_struct *ghes_ioremap_area;
+/*
+ * These 2 spinlock is used to prevent atomic ioremap virtual memory
+ * area from being mapped simultaneously.
+ */
+static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
+static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
+
+static int ghes_ioremap_init(void)
+{
+	ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
+		VM_IOREMAP, VMALLOC_START, VMALLOC_END);
+	if (!ghes_ioremap_area) {
+		pr_err(GHES_PFX "Failed to allocate virtual memory area for atomic ioremap.\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void ghes_ioremap_exit(void)
+{
+	free_vm_area(ghes_ioremap_area);
+}
+
+static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
+{
+	unsigned long vaddr;
+
+	vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr);
+	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
+			   pfn << PAGE_SHIFT, PAGE_KERNEL);
+
+	return (void __iomem *)vaddr;
+}
+
+static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
+{
+	unsigned long vaddr;
+
+	vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr);
+	ioremap_page_range(vaddr, vaddr + PAGE_SIZE,
+			   pfn << PAGE_SHIFT, PAGE_KERNEL);
+
+	return (void __iomem *)vaddr;
+}
+
+static void ghes_iounmap_nmi(void __iomem *vaddr_ptr)
+{
+	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
+	void *base = ghes_ioremap_area->addr;
+
+	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base));
+	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
+	__flush_tlb_one(vaddr);
+}
+
+static void ghes_iounmap_irq(void __iomem *vaddr_ptr)
+{
+	unsigned long vaddr = (unsigned long __force)vaddr_ptr;
+	void *base = ghes_ioremap_area->addr;
+
+	BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base));
+	unmap_kernel_range_noflush(vaddr, PAGE_SIZE);
+	__flush_tlb_one(vaddr);
+}
+
 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 {
 	struct ghes *ghes;
@@ -102,7 +190,6 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 	if (!ghes)
 		return ERR_PTR(-ENOMEM);
 	ghes->generic = generic;
-	INIT_LIST_HEAD(&ghes->list);
 	rc = acpi_pre_map_gar(&generic->error_status_address);
 	if (rc)
 		goto err_free;
@@ -159,22 +246,41 @@ static inline int ghes_severity(int severity)
 	}
 }
 
-/* SCI handler run in work queue, so ioremap can be used here */
-static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
-				 int from_phys)
+static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
+				  int from_phys)
 {
-	void *vaddr;
-
-	vaddr = ioremap_cache(paddr, len);
-	if (!vaddr)
-		return -ENOMEM;
-	if (from_phys)
-		memcpy(buffer, vaddr, len);
-	else
-		memcpy(vaddr, buffer, len);
-	iounmap(vaddr);
-
-	return 0;
+	void __iomem *vaddr;
+	unsigned long flags = 0;
+	int in_nmi = in_nmi();
+	u64 offset;
+	u32 trunk;
+
+	while (len > 0) {
+		offset = paddr - (paddr & PAGE_MASK);
+		if (in_nmi) {
+			raw_spin_lock(&ghes_ioremap_lock_nmi);
+			vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT);
+		} else {
+			spin_lock_irqsave(&ghes_ioremap_lock_irq, flags);
+			vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
+		}
+		trunk = PAGE_SIZE - offset;
+		trunk = min(trunk, len);
+		if (from_phys)
+			memcpy_fromio(buffer, vaddr + offset, trunk);
+		else
+			memcpy_toio(vaddr + offset, buffer, trunk);
+		len -= trunk;
+		paddr += trunk;
+		buffer += trunk;
+		if (in_nmi) {
+			ghes_iounmap_nmi(vaddr);
+			raw_spin_unlock(&ghes_ioremap_lock_nmi);
+		} else {
+			ghes_iounmap_irq(vaddr);
+			spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags);
+		}
+	}
 }
 
 static int ghes_read_estatus(struct ghes *ghes, int silent)
@@ -195,10 +301,8 @@ static int ghes_read_estatus(struct ghes *ghes, int silent)
 	if (!buf_paddr)
 		return -ENOENT;
 
-	rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
-				   sizeof(*ghes->estatus), 1);
-	if (rc)
-		return rc;
+	ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
+			      sizeof(*ghes->estatus), 1);
 	if (!ghes->estatus->block_status)
 		return -ENOENT;
 
@@ -213,17 +317,15 @@ static int ghes_read_estatus(struct ghes *ghes, int silent)
 		goto err_read_block;
 	if (apei_estatus_check_header(ghes->estatus))
 		goto err_read_block;
-	rc = ghes_copy_tofrom_phys(ghes->estatus + 1,
-				   buf_paddr + sizeof(*ghes->estatus),
-				   len - sizeof(*ghes->estatus), 1);
-	if (rc)
-		return rc;
+	ghes_copy_tofrom_phys(ghes->estatus + 1,
+			      buf_paddr + sizeof(*ghes->estatus),
+			      len - sizeof(*ghes->estatus), 1);
 	if (apei_estatus_check(ghes->estatus))
 		goto err_read_block;
 	rc = 0;
 
 err_read_block:
-	if (rc && !silent)
+	if (rc && !silent && printk_ratelimit())
 		pr_warning(FW_WARN GHES_PFX
 			   "Failed to read error status block!\n");
 	return rc;
@@ -293,6 +395,42 @@ out:
 	return 0;
 }
 
+static void ghes_add_timer(struct ghes *ghes)
+{
+	struct acpi_hest_generic *g = ghes->generic;
+	unsigned long expire;
+
+	if (!g->notify.poll_interval) {
+		pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for generic hardware error source: %d, disabled.\n",
+			   g->header.source_id);
+		return;
+	}
+	expire = jiffies + msecs_to_jiffies(g->notify.poll_interval);
+	ghes->timer.expires = round_jiffies_relative(expire);
+	add_timer(&ghes->timer);
+}
+
+static void ghes_poll_func(unsigned long data)
+{
+	struct ghes *ghes = (void *)data;
+
+	ghes_proc(ghes);
+	if (!(ghes->flags & GHES_EXITING))
+		ghes_add_timer(ghes);
+}
+
+static irqreturn_t ghes_irq_func(int irq, void *data)
+{
+	struct ghes *ghes = data;
+	int rc;
+
+	rc = ghes_proc(ghes);
+	if (rc)
+		return IRQ_NONE;
+
+	return IRQ_HANDLED;
+}
+
 static int ghes_notify_sci(struct notifier_block *this,
 				  unsigned long event, void *data)
 {
@@ -309,10 +447,63 @@ static int ghes_notify_sci(struct notifier_block *this,
 	return ret;
 }
 
+static int ghes_notify_nmi(struct notifier_block *this,
+				  unsigned long cmd, void *data)
+{
+	struct ghes *ghes, *ghes_global = NULL;
+	int sev, sev_global = -1;
+	int ret = NOTIFY_DONE;
+
+	if (cmd != DIE_NMI)
+		return ret;
+
+	raw_spin_lock(&ghes_nmi_lock);
+	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+		if (ghes_read_estatus(ghes, 1)) {
+			ghes_clear_estatus(ghes);
+			continue;
+		}
+		sev = ghes_severity(ghes->estatus->error_severity);
+		if (sev > sev_global) {
+			sev_global = sev;
+			ghes_global = ghes;
+		}
+		ret = NOTIFY_STOP;
+	}
+
+	if (ret == NOTIFY_DONE)
+		goto out;
+
+	if (sev_global >= GHES_SEV_PANIC) {
+		oops_begin();
+		ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global);
+		/* reboot to log the error! */
+		if (panic_timeout == 0)
+			panic_timeout = ghes_panic_timeout;
+		panic("Fatal hardware error!");
+	}
+
+	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+		if (!(ghes->flags & GHES_TO_CLEAR))
+			continue;
+		/* Do not print estatus because printk is not NMI safe */
+		ghes_do_proc(ghes);
+		ghes_clear_estatus(ghes);
+	}
+
+out:
+	raw_spin_unlock(&ghes_nmi_lock);
+	return ret;
+}
+
 static struct notifier_block ghes_notifier_sci = {
 	.notifier_call = ghes_notify_sci,
 };
 
+static struct notifier_block ghes_notifier_nmi = {
+	.notifier_call = ghes_notify_nmi,
+};
+
 static int __devinit ghes_probe(struct platform_device *ghes_dev)
 {
 	struct acpi_hest_generic *generic;
@@ -323,18 +514,27 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev)
 	if (!generic->enabled)
 		return -ENODEV;
 
-	if (generic->error_block_length <
-	    sizeof(struct acpi_hest_generic_status)) {
-		pr_warning(FW_BUG GHES_PFX
-"Invalid error block length: %u for generic hardware error source: %d\n",
-			   generic->error_block_length,
+	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+	case ACPI_HEST_NOTIFY_SCI:
+	case ACPI_HEST_NOTIFY_NMI:
+		break;
+	case ACPI_HEST_NOTIFY_LOCAL:
+		pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n",
 			   generic->header.source_id);
 		goto err;
+	default:
+		pr_warning(FW_WARN GHES_PFX "Unknown notification type: %u for generic hardware error source: %d\n",
+			   generic->notify.type, generic->header.source_id);
+		goto err;
 	}
-	if (generic->records_to_preallocate == 0) {
-		pr_warning(FW_BUG GHES_PFX
-"Invalid records to preallocate: %u for generic hardware error source: %d\n",
-			   generic->records_to_preallocate,
+
+	rc = -EIO;
+	if (generic->error_block_length <
+	    sizeof(struct acpi_hest_generic_status)) {
+		pr_warning(FW_BUG GHES_PFX "Invalid error block length: %u for generic hardware error source: %d\n",
+			   generic->error_block_length,
 			   generic->header.source_id);
 		goto err;
 	}
@@ -344,38 +544,43 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev)
 		ghes = NULL;
 		goto err;
 	}
-	if (generic->notify.type == ACPI_HEST_NOTIFY_SCI) {
+	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+		ghes->timer.function = ghes_poll_func;
+		ghes->timer.data = (unsigned long)ghes;
+		init_timer_deferrable(&ghes->timer);
+		ghes_add_timer(ghes);
+		break;
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+		/* External interrupt vector is GSI */
+		if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) {
+			pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n",
+			       generic->header.source_id);
+			goto err;
+		}
+		if (request_irq(ghes->irq, ghes_irq_func,
+				0, "GHES IRQ", ghes)) {
+			pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n",
+			       generic->header.source_id);
+			goto err;
+		}
+		break;
+	case ACPI_HEST_NOTIFY_SCI:
 		mutex_lock(&ghes_list_mutex);
 		if (list_empty(&ghes_sci))
 			register_acpi_hed_notifier(&ghes_notifier_sci);
 		list_add_rcu(&ghes->list, &ghes_sci);
 		mutex_unlock(&ghes_list_mutex);
-	} else {
-		unsigned char *notify = NULL;
-
-		switch (generic->notify.type) {
-		case ACPI_HEST_NOTIFY_POLLED:
-			notify = "POLL";
-			break;
-		case ACPI_HEST_NOTIFY_EXTERNAL:
-		case ACPI_HEST_NOTIFY_LOCAL:
-			notify = "IRQ";
-			break;
-		case ACPI_HEST_NOTIFY_NMI:
-			notify = "NMI";
-			break;
-		}
-		if (notify) {
-			pr_warning(GHES_PFX
-"Generic hardware error source: %d notified via %s is not supported!\n",
-				   generic->header.source_id, notify);
-		} else {
-			pr_warning(FW_WARN GHES_PFX
-"Unknown notification type: %u for generic hardware error source: %d\n",
-			generic->notify.type, generic->header.source_id);
-		}
-		rc = -ENODEV;
-		goto err;
+		break;
+	case ACPI_HEST_NOTIFY_NMI:
+		mutex_lock(&ghes_list_mutex);
+		if (list_empty(&ghes_nmi))
+			register_die_notifier(&ghes_notifier_nmi);
+		list_add_rcu(&ghes->list, &ghes_nmi);
+		mutex_unlock(&ghes_list_mutex);
+		break;
+	default:
+		BUG();
 	}
 	platform_set_drvdata(ghes_dev, ghes);
 
@@ -396,7 +601,14 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev)
 	ghes = platform_get_drvdata(ghes_dev);
 	generic = ghes->generic;
 
+	ghes->flags |= GHES_EXITING;
 	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+		del_timer_sync(&ghes->timer);
+		break;
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+		free_irq(ghes->irq, ghes);
+		break;
 	case ACPI_HEST_NOTIFY_SCI:
 		mutex_lock(&ghes_list_mutex);
 		list_del_rcu(&ghes->list);
@@ -404,12 +616,23 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev)
 			unregister_acpi_hed_notifier(&ghes_notifier_sci);
 		mutex_unlock(&ghes_list_mutex);
 		break;
+	case ACPI_HEST_NOTIFY_NMI:
+		mutex_lock(&ghes_list_mutex);
+		list_del_rcu(&ghes->list);
+		if (list_empty(&ghes_nmi))
+			unregister_die_notifier(&ghes_notifier_nmi);
+		mutex_unlock(&ghes_list_mutex);
+		/*
+		 * To synchronize with NMI handler, ghes can only be
+		 * freed after NMI handler finishes.
+		 */
+		synchronize_rcu();
+		break;
 	default:
 		BUG();
 		break;
 	}
 
-	synchronize_rcu();
 	ghes_fini(ghes);
 	kfree(ghes);
 
@@ -429,6 +652,8 @@ static struct platform_driver ghes_platform_driver = {
 
 static int __init ghes_init(void)
 {
+	int rc;
+
 	if (acpi_disabled)
 		return -ENODEV;
 
@@ -437,12 +662,25 @@ static int __init ghes_init(void)
 		return -EINVAL;
 	}
 
-	return platform_driver_register(&ghes_platform_driver);
+	rc = ghes_ioremap_init();
+	if (rc)
+		goto err;
+
+	rc = platform_driver_register(&ghes_platform_driver);
+	if (rc)
+		goto err_ioremap_exit;
+
+	return 0;
+err_ioremap_exit:
+	ghes_ioremap_exit();
+err:
+	return rc;
 }
 
 static void __exit ghes_exit(void)
 {
 	platform_driver_unregister(&ghes_platform_driver);
+	ghes_ioremap_exit();
 }
 
 module_init(ghes_init);
diff --git a/kernel/panic.c b/kernel/panic.c
index 4c13b1a88ebb..991bb87a1704 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -34,6 +34,7 @@ static int pause_on_oops_flag;
 static DEFINE_SPINLOCK(pause_on_oops_lock);
 
 int panic_timeout;
+EXPORT_SYMBOL_GPL(panic_timeout);
 
 ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
 
diff --git a/lib/ioremap.c b/lib/ioremap.c
index 5730ecd3eb66..da4e2ad74b68 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/io.h>
+#include <linux/module.h>
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 
@@ -90,3 +91,4 @@ int ioremap_page_range(unsigned long addr,
 
 	return err;
 }
+EXPORT_SYMBOL_GPL(ioremap_page_range);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index eb5cc7d00c5a..816f074fb4e1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1175,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
 {
 	vunmap_page_range(addr, addr + size);
 }
+EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
 
 /**
  * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
-- 
cgit v1.3-8-gc7d7


From 5fdade95f26372154459347dfb9f60721d22cfc7 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@fluxnic.net>
Date: Tue, 11 Jan 2011 12:18:12 -0500
Subject: time: Rename misnamed minsec argument of clocks_calc_mult_shift()

The minsec argument to clocks_calc_mult_shift() is misnamed. It is used
to clamp the magnitude of the mult factor so that a multiplication with
any value in the given range won't overflow a 64 bit result.  Let's
rename it to match the actual usage.

Signed-off-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Acked-by: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <alpine.LFD.2.00.1101111207140.17086@xanadu.home>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c18d7efa1b4b..8588abcac07b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
  * @shift:	pointer to shift variable
  * @from:	frequency to convert from
  * @to:		frequency to convert to
- * @minsec:	guaranteed runtime conversion range in seconds
+ * @maxsec:	guaranteed runtime conversion range in seconds
  *
  * The function evaluates the shift/mult pair for the scaled math
  * operations of clocksources and clockevents.
@@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
  * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
  * event @to is the counter frequency and @from is NSEC_PER_SEC.
  *
- * The @minsec conversion range argument controls the time frame in
+ * The @maxsec conversion range argument controls the time frame in
  * seconds which must be covered by the runtime conversion with the
  * calculated mult and shift factors. This guarantees that no 64bit
  * overflow happens when the input value of the conversion is
@@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time);
  * factors.
  */
 void
-clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
 {
 	u64 tmp;
 	u32 sft, sftacc= 32;
@@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
 	 * Calculate the shift factor which is limiting the conversion
 	 * range:
 	 */
-	tmp = ((u64)minsec * from) >> 32;
+	tmp = ((u64)maxsec * from) >> 32;
 	while (tmp) {
 		tmp >>=1;
 		sftacc--;
-- 
cgit v1.3-8-gc7d7


From afa14e7c553ebe45844d76208f66017a43abd0e2 Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Tue, 11 Jan 2011 17:59:38 -0600
Subject: timekeeping: Make local variables static

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Cc: John Stultz <johnstul@us.ibm.com>
LKML-Reference: <0D753D10438DA54287A00B027084269764CE0E54B7@AUSP01VMBX24.collaborationhost.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timekeeping.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5bb86da82003..eef7452bd8a9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -49,7 +49,7 @@ struct timekeeper {
 	u32	mult;
 };
 
-struct timekeeper timekeeper;
+static struct timekeeper timekeeper;
 
 /**
  * timekeeper_setup_internals - Set up internals to use clocksource clock.
@@ -164,7 +164,7 @@ static struct timespec total_sleep_time;
 /*
  * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
  */
-struct timespec raw_time;
+static struct timespec raw_time;
 
 /* flag for if timekeeping is suspended */
 int __read_mostly timekeeping_suspended;
-- 
cgit v1.3-8-gc7d7


From 0df6a63f8735a7c8a877878bc215d4312e41ef81 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Dec 2010 13:29:29 -0500
Subject: switch cgroup

switching it to s_d_op allows to kill the cgroup_lookup() kludge.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/cgroup.c | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 51cddc11cd85..5c5f4cc2e99a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -763,8 +763,6 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  * -> cgroup_mkdir.
  */
 
-static struct dentry *cgroup_lookup(struct inode *dir,
-			struct dentry *dentry, struct nameidata *nd);
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -1451,6 +1449,10 @@ static int cgroup_set_super(struct super_block *sb, void *data)
 
 static int cgroup_get_rootdir(struct super_block *sb)
 {
+	static const struct dentry_operations cgroup_dops = {
+		.d_iput = cgroup_diput,
+	};
+
 	struct inode *inode =
 		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
 	struct dentry *dentry;
@@ -1468,6 +1470,8 @@ static int cgroup_get_rootdir(struct super_block *sb)
 		return -ENOMEM;
 	}
 	sb->s_root = dentry;
+	/* for everything else we want ->d_op set */
+	sb->s_d_op = &cgroup_dops;
 	return 0;
 }
 
@@ -2191,7 +2195,7 @@ static const struct file_operations cgroup_file_operations = {
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = cgroup_lookup,
+	.lookup = simple_lookup,
 	.mkdir = cgroup_mkdir,
 	.rmdir = cgroup_rmdir,
 	.rename = cgroup_rename,
@@ -2207,26 +2211,6 @@ static inline struct cftype *__file_cft(struct file *file)
 	return __d_cft(file->f_dentry);
 }
 
-static int cgroup_delete_dentry(const struct dentry *dentry)
-{
-	return 1;
-}
-
-static struct dentry *cgroup_lookup(struct inode *dir,
-			struct dentry *dentry, struct nameidata *nd)
-{
-	static const struct dentry_operations cgroup_dentry_operations = {
-		.d_delete = cgroup_delete_dentry,
-		.d_iput = cgroup_diput,
-	};
-
-	if (dentry->d_name.len > NAME_MAX)
-		return ERR_PTR(-ENAMETOOLONG);
-	d_set_d_op(dentry, &cgroup_dentry_operations);
-	d_add(dentry, NULL);
-	return NULL;
-}
-
 static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 				struct super_block *sb)
 {
-- 
cgit v1.3-8-gc7d7


From 04c6862c055fb687c90d9652f32c11a063df15cf Mon Sep 17 00:00:00 2001
From: Seiji Aguchi <seiji.aguchi@hds.com>
Date: Wed, 12 Jan 2011 16:59:30 -0800
Subject: kmsg_dump: add kmsg_dump() calls to the reboot, halt, poweroff and
 emergency_restart paths

We need to know the reason why system rebooted in support service.
However, we can't inform our customers of the reason because final
messages are lost on current Linux kernel.

This patch improves the situation above because the final messages are
saved by adding kmsg_dump() to reboot, halt, poweroff and
emergency_restart path.

Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Marco Stornelli <marco.stornelli@gmail.com>
Reviewed-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kmsg_dump.h | 4 ++++
 kernel/printk.c           | 4 ++++
 kernel/sys.c              | 6 ++++++
 3 files changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h
index 24b44145a886..2a0d7d651dc3 100644
--- a/include/linux/kmsg_dump.h
+++ b/include/linux/kmsg_dump.h
@@ -18,6 +18,10 @@ enum kmsg_dump_reason {
 	KMSG_DUMP_OOPS,
 	KMSG_DUMP_PANIC,
 	KMSG_DUMP_KEXEC,
+	KMSG_DUMP_RESTART,
+	KMSG_DUMP_HALT,
+	KMSG_DUMP_POWEROFF,
+	KMSG_DUMP_EMERG,
 };
 
 /**
diff --git a/kernel/printk.c b/kernel/printk.c
index f64b8997fc76..0b0c9aa71e89 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1539,6 +1539,10 @@ static const char * const kmsg_reasons[] = {
 	[KMSG_DUMP_OOPS]	= "oops",
 	[KMSG_DUMP_PANIC]	= "panic",
 	[KMSG_DUMP_KEXEC]	= "kexec",
+	[KMSG_DUMP_RESTART]	= "restart",
+	[KMSG_DUMP_HALT]	= "halt",
+	[KMSG_DUMP_POWEROFF]	= "poweroff",
+	[KMSG_DUMP_EMERG]	= "emergency_restart",
 };
 
 static const char *kmsg_to_str(enum kmsg_dump_reason reason)
diff --git a/kernel/sys.c b/kernel/sys.c
index 2745dcdb6c6c..31b71a276b40 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -43,6 +43,8 @@
 #include <linux/kprobes.h>
 #include <linux/user_namespace.h>
 
+#include <linux/kmsg_dump.h>
+
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/unistd.h>
@@ -285,6 +287,7 @@ out_unlock:
  */
 void emergency_restart(void)
 {
+	kmsg_dump(KMSG_DUMP_EMERG);
 	machine_emergency_restart();
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
@@ -312,6 +315,7 @@ void kernel_restart(char *cmd)
 		printk(KERN_EMERG "Restarting system.\n");
 	else
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+	kmsg_dump(KMSG_DUMP_RESTART);
 	machine_restart(cmd);
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
@@ -333,6 +337,7 @@ void kernel_halt(void)
 	kernel_shutdown_prepare(SYSTEM_HALT);
 	sysdev_shutdown();
 	printk(KERN_EMERG "System halted.\n");
+	kmsg_dump(KMSG_DUMP_HALT);
 	machine_halt();
 }
 
@@ -351,6 +356,7 @@ void kernel_power_off(void)
 	disable_nonboot_cpus();
 	sysdev_shutdown();
 	printk(KERN_EMERG "Power down.\n");
+	kmsg_dump(KMSG_DUMP_POWEROFF);
 	machine_power_off();
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
-- 
cgit v1.3-8-gc7d7


From 351f8f8e6499ae4fff40f5e3a8fe16d9e1903646 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Wed, 12 Jan 2011 16:59:39 -0800
Subject: kernel: clean up USE_GENERIC_SMP_HELPERS

For arch which needs USE_GENERIC_SMP_HELPERS, it has to select
USE_GENERIC_SMP_HELPERS, rather than leaving a choice to user, since they
don't provide their own implementions.

Also, move on_each_cpu() to kernel/smp.c, it is strange to put it in
kernel/softirq.c.

For arch which doesn't use USE_GENERIC_SMP_HELPERS, e.g.  blackfin, only
on_each_cpu() is compiled.

Signed-off-by: Amerigo Wang <amwang@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mn10300/Kconfig |  6 +-----
 arch/x86/Kconfig     |  5 +----
 kernel/Makefile      |  2 +-
 kernel/smp.c         | 19 +++++++++++++++++++
 kernel/softirq.c     | 19 -------------------
 5 files changed, 22 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 41ba38513c89..8ed41cf2b08d 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -203,6 +203,7 @@ endmenu
 config SMP
 	bool "Symmetric multi-processing support"
 	default y
+	select USE_GENERIC_SMP_HELPERS
 	depends on MN10300_PROC_MN2WS0038 || MN10300_PROC_MN2WS0050
 	---help---
 	  This enables support for systems with more than one CPU. If you have
@@ -226,11 +227,6 @@ config NR_CPUS
 	depends on SMP
 	default "2"
 
-config USE_GENERIC_SMP_HELPERS
-	bool
-	depends on SMP
-	default y
-
 source "kernel/Kconfig.preempt"
 
 config MN10300_CURRENT_IN_E2
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b6fccb07123e..8734db3c5830 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -65,6 +65,7 @@ config X86
 	select HAVE_SPARSE_IRQ
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
+	select USE_GENERIC_SMP_HELPERS if SMP
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
@@ -203,10 +204,6 @@ config HAVE_INTEL_TXT
 	def_bool y
 	depends on EXPERIMENTAL && DMAR && ACPI
 
-config USE_GENERIC_SMP_HELPERS
-	def_bool y
-	depends on SMP
-
 config X86_32_SMP
 	def_bool y
 	depends on X86_32 && SMP
diff --git a/kernel/Makefile b/kernel/Makefile
index 5669f71dfdd5..353d3fe8ba33 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += smp.o
 ifneq ($(CONFIG_SMP),y)
 obj-y += up.o
 endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 12ed8b013e2d..4ec30e069987 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
 #include <linux/smp.h>
 #include <linux/cpu.h>
 
+#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
 static struct {
 	struct list_head	queue;
 	raw_spinlock_t		lock;
@@ -529,3 +530,21 @@ void ipi_call_unlock_irq(void)
 {
 	raw_spin_unlock_irq(&call_function.lock);
 }
+#endif /* USE_GENERIC_SMP_HELPERS */
+
+/*
+ * Call a function on all processors
+ */
+int on_each_cpu(void (*func) (void *info), void *info, int wait)
+{
+	int ret = 0;
+
+	preempt_disable();
+	ret = smp_call_function(func, info, wait);
+	local_irq_disable();
+	func(info);
+	local_irq_enable();
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL(on_each_cpu);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0823778f87fc..68eb5efec388 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -885,25 +885,6 @@ static __init int spawn_ksoftirqd(void)
 }
 early_initcall(spawn_ksoftirqd);
 
-#ifdef CONFIG_SMP
-/*
- * Call a function on all processors
- */
-int on_each_cpu(void (*func) (void *info), void *info, int wait)
-{
-	int ret = 0;
-
-	preempt_disable();
-	ret = smp_call_function(func, info, wait);
-	local_irq_disable();
-	func(info);
-	local_irq_enable();
-	preempt_enable();
-	return ret;
-}
-EXPORT_SYMBOL(on_each_cpu);
-#endif
-
 /*
  * [ These __weak aliases are kept in a separate compilation unit, so that
  *   GCC does not inline them incorrectly. ]
-- 
cgit v1.3-8-gc7d7


From 455cd5ab305c90ffc422dd2e0fb634730942b257 Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <drosenberg@vsecurity.com>
Date: Wed, 12 Jan 2011 16:59:41 -0800
Subject: kptr_restrict for hiding kernel pointers from unprivileged users

Add the %pK printk format specifier and the /proc/sys/kernel/kptr_restrict
sysctl.

The %pK format specifier is designed to hide exposed kernel pointers,
specifically via /proc interfaces.  Exposing these pointers provides an
easy target for kernel write vulnerabilities, since they reveal the
locations of writable structures containing easily triggerable function
pointers.  The behavior of %pK depends on the kptr_restrict sysctl.

If kptr_restrict is set to 0, no deviation from the standard %p behavior
occurs.  If kptr_restrict is set to 1, the default, if the current user
(intended to be a reader via seq_printf(), etc.) does not have CAP_SYSLOG
(currently in the LSM tree), kernel pointers using %pK are printed as 0's.
 If kptr_restrict is set to 2, kernel pointers using %pK are printed as
0's regardless of privileges.  Replacing with 0's was chosen over the
default "(null)", which cannot be parsed by userland %p, which expects
"(nil)".

[akpm@linux-foundation.org: check for IRQ context when !kptr_restrict, save an indent level, s/WARN/WARN_ONCE/]
[akpm@linux-foundation.org: coding-style fixup]
[randy.dunlap@oracle.com: fix kernel/sysctl.c warning]
Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: James Morris <jmorris@namei.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Thomas Graf <tgraf@infradead.org>
Cc: Eugene Teo <eugeneteo@kernel.org>
Cc: Kees Cook <kees.cook@canonical.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David S. Miller <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Eric Paris <eparis@parisplace.org>

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/kernel.txt | 14 ++++++++++++++
 include/linux/printk.h          |  1 +
 kernel/sysctl.c                 | 10 ++++++++++
 lib/vsprintf.c                  | 22 ++++++++++++++++++++++
 4 files changed, 47 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 574067194f38..11d5ceda5bb0 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
 - hotplug
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
+- kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
@@ -261,6 +262,19 @@ This flag controls the L2 cache of G3 processor boards. If
 
 ==============================================================
 
+kptr_restrict:
+
+This toggle indicates whether restrictions are placed on
+exposing kernel addresses via /proc and other interfaces.  When
+kptr_restrict is set to (0), there are no restrictions.  When
+kptr_restrict is set to (1), the default, kernel pointers
+printed using the %pK format specifier will be replaced with 0's
+unless the user has CAP_SYSLOG.  When kptr_restrict is set to
+(2), kernel pointers printed using %pK will be replaced with 0's
+regardless of privileges.
+
+==============================================================
+
 kstack_depth_to_print: (X86 only)
 
 Controls the number of words to print when dumping the raw
diff --git a/include/linux/printk.h b/include/linux/printk.h
index b772ca5fbdf0..9adfba6ec28a 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -83,6 +83,7 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 
 extern int printk_delay_msec;
 extern int dmesg_restrict;
+extern int kptr_restrict;
 
 /*
  * Print a one-time message (analogous to WARN_ONCE() et al):
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae5cbb1e3ced..c6811ee2092b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/signal.h>
+#include <linux/printk.h>
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
@@ -710,6 +711,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+	{
+		.procname	= "kptr_restrict",
+		.data		= &kptr_restrict,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
+	},
 #endif
 	{
 		.procname	= "ngroups_max",
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index c150d3dafff4..6ff38524ec16 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -936,6 +936,8 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
 	return string(buf, end, uuid, spec);
 }
 
+int kptr_restrict = 1;
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
@@ -979,6 +981,7 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
  *       Implements a "recursive vsnprintf".
  *       Do not use this feature without some mechanism to verify the
  *       correctness of the format string and va_list arguments.
+ * - 'K' For a kernel pointer that should be hidden from unprivileged users
  *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
@@ -1035,6 +1038,25 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 		return buf + vsnprintf(buf, end - buf,
 				       ((struct va_format *)ptr)->fmt,
 				       *(((struct va_format *)ptr)->va));
+	case 'K':
+		/*
+		 * %pK cannot be used in IRQ context because its test
+		 * for CAP_SYSLOG would be meaningless.
+		 */
+		if (in_irq() || in_serving_softirq() || in_nmi()) {
+			if (spec.field_width == -1)
+				spec.field_width = 2 * sizeof(void *);
+			return string(buf, end, "pK-error", spec);
+		} else if ((kptr_restrict == 0) ||
+			 (kptr_restrict == 1 &&
+			  has_capability_noaudit(current, CAP_SYSLOG)))
+			break;
+
+		if (spec.field_width == -1) {
+			spec.field_width = 2 * sizeof(void *);
+			spec.flags |= ZEROPAD;
+		}
+		return number(buf, end, 0, spec);
 	}
 	spec.flags |= SMALL;
 	if (spec.field_width == -1) {
-- 
cgit v1.3-8-gc7d7


From fb842b00c5eab66ec361b31550aa8a922745ce9e Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 12 Jan 2011 16:59:43 -0800
Subject: printk: use RCU to prevent potential lock contention in kmsg_dump

dump_list_lock is used to protect dump_list in kmsg_dumper implementation,
kmsg_dump() uses it to traverse dump_list too.  But if there is contention
on the lock, kmsg_dump() will fail, and the valuable kernel message may be
lost.

This patch solves this issue with RCU.  Because kmsg_dump() only read the
list, no lock is needed in kmsg_dump().  So that kmsg_dump() will never
fail because of lock contention.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 34 +++++++---------------------------
 1 file changed, 7 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 0b0c9aa71e89..53d9a9ec88e6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -39,6 +39,7 @@
 #include <linux/syslog.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/rculist.h>
 
 #include <asm/uaccess.h>
 
@@ -1502,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper)
 	/* Don't allow registering multiple times */
 	if (!dumper->registered) {
 		dumper->registered = 1;
-		list_add_tail(&dumper->list, &dump_list);
+		list_add_tail_rcu(&dumper->list, &dump_list);
 		err = 0;
 	}
 	spin_unlock_irqrestore(&dump_list_lock, flags);
@@ -1526,33 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
 	spin_lock_irqsave(&dump_list_lock, flags);
 	if (dumper->registered) {
 		dumper->registered = 0;
-		list_del(&dumper->list);
+		list_del_rcu(&dumper->list);
 		err = 0;
 	}
 	spin_unlock_irqrestore(&dump_list_lock, flags);
+	synchronize_rcu();
 
 	return err;
 }
 EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
 
-static const char * const kmsg_reasons[] = {
-	[KMSG_DUMP_OOPS]	= "oops",
-	[KMSG_DUMP_PANIC]	= "panic",
-	[KMSG_DUMP_KEXEC]	= "kexec",
-	[KMSG_DUMP_RESTART]	= "restart",
-	[KMSG_DUMP_HALT]	= "halt",
-	[KMSG_DUMP_POWEROFF]	= "poweroff",
-	[KMSG_DUMP_EMERG]	= "emergency_restart",
-};
-
-static const char *kmsg_to_str(enum kmsg_dump_reason reason)
-{
-	if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
-		return "unknown";
-
-	return kmsg_reasons[reason];
-}
-
 /**
  * kmsg_dump - dump kernel log to kernel message dumpers.
  * @reason: the reason (oops, panic etc) for dumping
@@ -1591,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 		l2 = chars;
 	}
 
-	if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
-		printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
-				kmsg_to_str(reason));
-		return;
-	}
-	list_for_each_entry(dumper, &dump_list, list)
+	rcu_read_lock();
+	list_for_each_entry_rcu(dumper, &dump_list, list)
 		dumper->dump(dumper, reason, s1, l1, s2, l2);
-	spin_unlock_irqrestore(&dump_list_lock, flags);
+	rcu_read_unlock();
 }
 #endif
-- 
cgit v1.3-8-gc7d7


From 34e49d4f635d6a800c4089c40fd254e12e451449 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Wed, 12 Jan 2011 17:00:30 -0800
Subject: fs/proc/base.c, kernel/latencytop.c: convert sprintf_symbol() to %ps

Use temporary lr for struct latency_record for improved readability and
fewer columns used.  Removed trailing space from output.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Joe Perches <joe@perches.com>
Cc: Jiri Kosina <trivial@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/base.c      | 22 ++++++++--------------
 kernel/latencytop.c | 23 +++++++++--------------
 2 files changed, 17 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index b20962c71a52..336b79803e82 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -373,24 +373,18 @@ static int lstats_show_proc(struct seq_file *m, void *v)
 		return -ESRCH;
 	seq_puts(m, "Latency Top version : v0.1\n");
 	for (i = 0; i < 32; i++) {
-		if (task->latency_record[i].backtrace[0]) {
+		struct latency_record *lr = &task->latency_record[i];
+		if (lr->backtrace[0]) {
 			int q;
-			seq_printf(m, "%i %li %li ",
-				task->latency_record[i].count,
-				task->latency_record[i].time,
-				task->latency_record[i].max);
+			seq_printf(m, "%i %li %li",
+				   lr->count, lr->time, lr->max);
 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-				char sym[KSYM_SYMBOL_LEN];
-				char *c;
-				if (!task->latency_record[i].backtrace[q])
+				unsigned long bt = lr->backtrace[q];
+				if (!bt)
 					break;
-				if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+				if (bt == ULONG_MAX)
 					break;
-				sprint_symbol(sym, task->latency_record[i].backtrace[q]);
-				c = strchr(sym, '+');
-				if (c)
-					*c = 0;
-				seq_printf(m, "%s ", sym);
+				seq_printf(m, " %ps", (void *)bt);
 			}
 			seq_printf(m, "\n");
 		}
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 17110a4a4fc2..ee74b35e528d 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v)
 	seq_puts(m, "Latency Top version : v0.1\n");
 
 	for (i = 0; i < MAXLR; i++) {
-		if (latency_record[i].backtrace[0]) {
+		struct latency_record *lr = &latency_record[i];
+
+		if (lr->backtrace[0]) {
 			int q;
-			seq_printf(m, "%i %lu %lu ",
-				latency_record[i].count,
-				latency_record[i].time,
-				latency_record[i].max);
+			seq_printf(m, "%i %lu %lu",
+				   lr->count, lr->time, lr->max);
 			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
-				char sym[KSYM_SYMBOL_LEN];
-				char *c;
-				if (!latency_record[i].backtrace[q])
+				unsigned long bt = lr->backtrace[q];
+				if (!bt)
 					break;
-				if (latency_record[i].backtrace[q] == ULONG_MAX)
+				if (bt == ULONG_MAX)
 					break;
-				sprint_symbol(sym, latency_record[i].backtrace[q]);
-				c = strchr(sym, '+');
-				if (c)
-					*c = 0;
-				seq_printf(m, "%s ", sym);
+				seq_printf(m, " %ps", (void *)bt);
 			}
 			seq_printf(m, "\n");
 		}
-- 
cgit v1.3-8-gc7d7


From 556105000334cb440636ef61b862d22b03c24f70 Mon Sep 17 00:00:00 2001
From: Jovi Zhang <bookjovi@gmail.com>
Date: Wed, 12 Jan 2011 17:00:45 -0800
Subject: sysctl: fix #ifdef guard comment

Signed-off-by: Jovi Zhang <bookjovi@gmail.com>
Acked-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c6811ee2092b..a05605a4c369 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2909,7 +2909,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 	}
 }
 
-#else /* CONFIG_PROC_FS */
+#else /* CONFIG_PROC_SYSCTL */
 
 int proc_dostring(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -2961,7 +2961,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
 }
 
 
-#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_PROC_SYSCTL */
 
 /*
  * No sense putting this after each symbol definition, twice,
-- 
cgit v1.3-8-gc7d7


From e020e742e5dbd8c44d31706995dc13ddc732e274 Mon Sep 17 00:00:00 2001
From: Jovi Zhang <bookjovi@gmail.com>
Date: Wed, 12 Jan 2011 17:00:45 -0800
Subject: sysctl: remove obsolete comments

ctl_unnumbered.txt have been removed in Documentation directory so just
also remove this invalid comments

[akpm@linux-foundation.org: fix Documentation/sysctl/00-INDEX, per Dave]
Signed-off-by: Jovi Zhang <bookjovi@gmail.com>
Cc: Dave Young <hidave.darkstar@gmail.com>
Acked-by: WANG Cong <xiyou.wangcong@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/00-INDEX |  2 --
 kernel/sysctl.c               | 17 -----------------
 2 files changed, 19 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/00-INDEX b/Documentation/sysctl/00-INDEX
index 1286f455992f..8cf5d493fd03 100644
--- a/Documentation/sysctl/00-INDEX
+++ b/Documentation/sysctl/00-INDEX
@@ -4,8 +4,6 @@ README
 	- general information about /proc/sys/ sysctl files.
 abi.txt
 	- documentation for /proc/sys/abi/*.
-ctl_unnumbered.txt
-	- explanation of why one should not add new binary sysctl numbers.
 fs.txt
 	- documentation for /proc/sys/fs/*.
 kernel.txt
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a05605a4c369..bc86bb32e126 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -246,10 +246,6 @@ static struct ctl_table root_table[] = {
 		.mode		= 0555,
 		.child		= dev_table,
 	},
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
 	{ }
 };
 
@@ -972,10 +968,6 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
 	{ }
 };
 
@@ -1336,11 +1328,6 @@ static struct ctl_table vm_table[] = {
 		.extra2		= &one,
 	},
 #endif
-
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
 	{ }
 };
 
@@ -1496,10 +1483,6 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &pipe_proc_fn,
 		.extra1		= &pipe_min_size,
 	},
-/*
- * NOTE: do not add new entries to this table unless you have read
- * Documentation/sysctl/ctl_unnumbered.txt
- */
 	{ }
 };
 
-- 
cgit v1.3-8-gc7d7


From 6164281ab7a4d3bd42588d6b25984e960a2e032f Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Wed, 12 Jan 2011 17:00:46 -0800
Subject: user_ns: improve the user_ns on-the-slab packaging

Currently on 64-bit arch the user_namespace is 2096 and when being
kmalloc-ed it resides on a 4k slab wasting 2003 bytes.

If we allocate a separate cache for it and reduce the hash size from 128
to 64 chains the packaging becomes *much* better - the struct is 1072
bytes and the hole between is 98 bytes.

[akpm@linux-foundation.org: s/__initcall/module_init/]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Acked-by: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/user_namespace.h |  2 +-
 kernel/user_namespace.c        | 15 ++++++++++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 8178156711f9..faf467944baf 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -6,7 +6,7 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 
-#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 8)
+#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 7)
 #define UIDHASH_SZ	(1 << UIDHASH_BITS)
 
 struct user_namespace {
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 25915832291a..9da289c34f22 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,6 +12,8 @@
 #include <linux/highuid.h>
 #include <linux/cred.h>
 
+static struct kmem_cache *user_ns_cachep __read_mostly;
+
 /*
  * Create a new user namespace, deriving the creator from the user in the
  * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +28,7 @@ int create_user_ns(struct cred *new)
 	struct user_struct *root_user;
 	int n;
 
-	ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL);
+	ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL);
 	if (!ns)
 		return -ENOMEM;
 
@@ -38,7 +40,7 @@ int create_user_ns(struct cred *new)
 	/* Alloc new root user.  */
 	root_user = alloc_uid(ns, 0);
 	if (!root_user) {
-		kfree(ns);
+		kmem_cache_free(user_ns_cachep, ns);
 		return -ENOMEM;
 	}
 
@@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work)
 	struct user_namespace *ns =
 		container_of(work, struct user_namespace, destroyer);
 	free_uid(ns->creator);
-	kfree(ns);
+	kmem_cache_free(user_ns_cachep, ns);
 }
 
 void free_user_ns(struct kref *kref)
@@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t
 	/* No useful relationship so no mapping */
 	return overflowgid;
 }
+
+static __init int user_namespaces_init(void)
+{
+	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
+	return 0;
+}
+module_init(user_namespaces_init);
-- 
cgit v1.3-8-gc7d7


From 9ab020cf07e457a8b425bf5af17e704f90f86d8b Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 12 Jan 2011 17:00:48 -0800
Subject: taskstats: use better ifdef for alignment

Commit 4be2c95d ("taskstats: pad taskstats netlink response for aligment
issues on ia64") added a null field to align the taskstats structure but
the discussion centered around ia64.  The issue exists on other platforms
with inefficient unaligned access and adding them piecemeal would be an
unmaintainable mess.

This patch uses Dave Miller's suggestion of using a combination of
CONFIG_64BIT && !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS to determine
whether alignment is needed.

Note that this will cause breakage on those platforms with applications
like iotop which had hard-coded offsets into the packet to access the
taskstats structure.

The message seen on systems without the alignment fixes looks like: kernel
unaligned access to 0xe000023879dca9bc, ip=0xa000000100133d10

The addresses may vary but resolve to locations inside __delayacct_add_tsk.

iotop makes what I'd call unreasonable assumptions about the contents of a
netlink genetlink packet containing generic attributes.  They're typed and
have headers that specify value lengths, so the client can (should)
identify and skip the ones the client doesn't understand.

The kernel, as of version 2.6.36, presented a packet like so:
+--------------------------------+
| genlmsghdr - 4 bytes           |
+--------------------------------+
| NLA header - 4 bytes           | /* Aggregate header */
+-+------------------------------+
| | NLA header - 4 bytes         | /* PID header */
| +------------------------------+
| | pid/tgid   - 4 bytes         |
| +------------------------------+
| | NLA header - 4 bytes         | /* stats header */
| + -----------------------------+ <- oops. aligned on 4 byte boundary
| | struct taskstats - 328 bytes |
+-+------------------------------+

The iotop code expects that the kernel will behave as it did then,
assuming that the packet format is set in stone.  The format is set in
stone, but the packet offsets are not.  There's nothing in the packet
format that guarantees that the packet will be sent in exactly the same
way.  The attribute contents are set (or versioned) and the aggregate
contents are set but they can be anywhere in the packet.

The issue here isn't that an unaligned structure gets passed to userspace,
it's that the NLA infrastructure has something of a weakness: The 4 byte
attribute header may force the payload to be unaligned.  The taskstats
structure is created at an unaligned location and then 64-bit values are
operated on inside the kernel, so the unaligned access warnings gets
spewed everywhere.

It's possible to use the unaligned access API to operate on the structure
in the kernel but it seems like a wasted effort to work around userspace
code that isn't following the packet format.  Any new additions would also
need the be worked around.  It's a maintenance nightmare.

The conclusion of the earlier discussion seemed to be "ok fine, if we have
to break it, don't break it on arches that don't have the problem." Dave
pointed out that the unaligned access problem doesn't only exist on ia64,
but also on other 64-bit arches that don't have efficient unaligned access
and it should be fixed there as well.  The committed version of the patch
and this addition keep with the conclusion of that discussion not to break
it unnecessarily, which the pid padding and the packet padding fixes did
do.  x86_64 and powerpc don't suffer this problem so they shouldn't suffer
the solution.  Other 64-bit architectures do and will, though.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reported-by: David S. Miller <davem@davemloft.net>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Dan Carpenter <error27@gmail.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Florian Mickler <florian@mickler.org>
Cc: Guillaume Chazarain <guichaz@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/taskstats.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 69691eb4b715..3971c6b9d58d 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -348,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask)
 	return ret;
 }
 
-#ifdef CONFIG_IA64
+#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
 #define TASKSTATS_NEEDS_PADDING 1
 #endif
 
-- 
cgit v1.3-8-gc7d7


From 025b40abe715d638e60516a657d354e8560c1a85 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <lasaine@lvk.cs.msu.su>
Date: Wed, 12 Jan 2011 17:00:56 -0800
Subject: ntp: add hardpps implementation

This commit adds hardpps() implementation based upon the original one from
the NTPv4 reference kernel code from David Mills.  However, it is highly
optimized towards very fast syncronization and maximum stickness to PPS
signal.  The typical error is less then a microsecond.

To make it sync faster I had to throw away exponential phase filter so
that the full phase offset is corrected immediately.  Then I also had to
throw away median phase filter because it gives a bigger error itself if
used without exponential filter.

Maybe we will find an appropriate filtering scheme in the future but it's
not necessary if the signal quality is ok.

Signed-off-by: Alexander Gordeev <lasaine@lvk.cs.msu.su>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Rodolfo Giometti <giometti@enneenne.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/pps/Kconfig   |   9 ++
 include/linux/timex.h |   1 +
 kernel/time/ntp.c     | 425 ++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 420 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/drivers/pps/Kconfig b/drivers/pps/Kconfig
index 1afe4e03440f..0ad5ff38bfec 100644
--- a/drivers/pps/Kconfig
+++ b/drivers/pps/Kconfig
@@ -30,6 +30,15 @@ config PPS_DEBUG
 	  messages to the system log.  Select this if you are having a
 	  problem with PPS support and want to see more of what is going on.
 
+config NTP_PPS
+	bool "PPS kernel consumer support"
+	depends on PPS && !NO_HZ
+	help
+	  This option adds support for direct in-kernel time
+	  syncronization using an external PPS signal.
+
+	  It doesn't work on tickless systems at the moment.
+
 source drivers/pps/clients/Kconfig
 
 endmenu
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 32d852f8cbe4..d23999f9499d 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -268,6 +268,7 @@ extern u64 tick_length;
 extern void second_overflow(void);
 extern void update_ntp_one_tick(void);
 extern int do_adjtimex(struct timex *);
+extern void hardpps(const struct timespec *, const struct timespec *);
 
 int read_current_timer(unsigned long *timer_val);
 
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index d2321891538f..5c00242fa921 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -14,6 +14,7 @@
 #include <linux/timex.h>
 #include <linux/time.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 
 /*
  * NTP timekeeping variables:
@@ -74,6 +75,162 @@ static long			time_adjust;
 /* constant (boot-param configurable) NTP tick adjustment (upscaled)	*/
 static s64			ntp_tick_adj;
 
+#ifdef CONFIG_NTP_PPS
+
+/*
+ * The following variables are used when a pulse-per-second (PPS) signal
+ * is available. They establish the engineering parameters of the clock
+ * discipline loop when controlled by the PPS signal.
+ */
+#define PPS_VALID	10	/* PPS signal watchdog max (s) */
+#define PPS_POPCORN	4	/* popcorn spike threshold (shift) */
+#define PPS_INTMIN	2	/* min freq interval (s) (shift) */
+#define PPS_INTMAX	8	/* max freq interval (s) (shift) */
+#define PPS_INTCOUNT	4	/* number of consecutive good intervals to
+				   increase pps_shift or consecutive bad
+				   intervals to decrease it */
+#define PPS_MAXWANDER	100000	/* max PPS freq wander (ns/s) */
+
+static int pps_valid;		/* signal watchdog counter */
+static long pps_tf[3];		/* phase median filter */
+static long pps_jitter;		/* current jitter (ns) */
+static struct timespec pps_fbase; /* beginning of the last freq interval */
+static int pps_shift;		/* current interval duration (s) (shift) */
+static int pps_intcnt;		/* interval counter */
+static s64 pps_freq;		/* frequency offset (scaled ns/s) */
+static long pps_stabil;		/* current stability (scaled ns/s) */
+
+/*
+ * PPS signal quality monitors
+ */
+static long pps_calcnt;		/* calibration intervals */
+static long pps_jitcnt;		/* jitter limit exceeded */
+static long pps_stbcnt;		/* stability limit exceeded */
+static long pps_errcnt;		/* calibration errors */
+
+
+/* PPS kernel consumer compensates the whole phase error immediately.
+ * Otherwise, reduce the offset by a fixed factor times the time constant.
+ */
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+	if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+		return offset;
+	else
+		return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+static inline void pps_reset_freq_interval(void)
+{
+	/* the PPS calibration interval may end
+	   surprisingly early */
+	pps_shift = PPS_INTMIN;
+	pps_intcnt = 0;
+}
+
+/**
+ * pps_clear - Clears the PPS state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void pps_clear(void)
+{
+	pps_reset_freq_interval();
+	pps_tf[0] = 0;
+	pps_tf[1] = 0;
+	pps_tf[2] = 0;
+	pps_fbase.tv_sec = pps_fbase.tv_nsec = 0;
+	pps_freq = 0;
+}
+
+/* Decrease pps_valid to indicate that another second has passed since
+ * the last PPS signal. When it reaches 0, indicate that PPS signal is
+ * missing.
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void pps_dec_valid(void)
+{
+	if (pps_valid > 0)
+		pps_valid--;
+	else {
+		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+				 STA_PPSWANDER | STA_PPSERROR);
+		pps_clear();
+	}
+}
+
+static inline void pps_set_freq(s64 freq)
+{
+	pps_freq = freq;
+}
+
+static inline int is_error_status(int status)
+{
+	return (time_status & (STA_UNSYNC|STA_CLOCKERR))
+		/* PPS signal lost when either PPS time or
+		 * PPS frequency synchronization requested
+		 */
+		|| ((time_status & (STA_PPSFREQ|STA_PPSTIME))
+			&& !(time_status & STA_PPSSIGNAL))
+		/* PPS jitter exceeded when
+		 * PPS time synchronization requested */
+		|| ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+			== (STA_PPSTIME|STA_PPSJITTER))
+		/* PPS wander exceeded or calibration error when
+		 * PPS frequency synchronization requested
+		 */
+		|| ((time_status & STA_PPSFREQ)
+			&& (time_status & (STA_PPSWANDER|STA_PPSERROR)));
+}
+
+static inline void pps_fill_timex(struct timex *txc)
+{
+	txc->ppsfreq	   = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) *
+					 PPM_SCALE_INV, NTP_SCALE_SHIFT);
+	txc->jitter	   = pps_jitter;
+	if (!(time_status & STA_NANO))
+		txc->jitter /= NSEC_PER_USEC;
+	txc->shift	   = pps_shift;
+	txc->stabil	   = pps_stabil;
+	txc->jitcnt	   = pps_jitcnt;
+	txc->calcnt	   = pps_calcnt;
+	txc->errcnt	   = pps_errcnt;
+	txc->stbcnt	   = pps_stbcnt;
+}
+
+#else /* !CONFIG_NTP_PPS */
+
+static inline s64 ntp_offset_chunk(s64 offset)
+{
+	return shift_right(offset, SHIFT_PLL + time_constant);
+}
+
+static inline void pps_reset_freq_interval(void) {}
+static inline void pps_clear(void) {}
+static inline void pps_dec_valid(void) {}
+static inline void pps_set_freq(s64 freq) {}
+
+static inline int is_error_status(int status)
+{
+	return status & (STA_UNSYNC|STA_CLOCKERR);
+}
+
+static inline void pps_fill_timex(struct timex *txc)
+{
+	/* PPS is not implemented, so these are zero */
+	txc->ppsfreq	   = 0;
+	txc->jitter	   = 0;
+	txc->shift	   = 0;
+	txc->stabil	   = 0;
+	txc->jitcnt	   = 0;
+	txc->calcnt	   = 0;
+	txc->errcnt	   = 0;
+	txc->stbcnt	   = 0;
+}
+
+#endif /* CONFIG_NTP_PPS */
+
 /*
  * NTP methods:
  */
@@ -185,6 +342,9 @@ void ntp_clear(void)
 
 	tick_length	= tick_length_base;
 	time_offset	= 0;
+
+	/* Clear PPS state variables */
+	pps_clear();
 }
 
 /*
@@ -250,16 +410,16 @@ void second_overflow(void)
 		time_status |= STA_UNSYNC;
 	}
 
-	/*
-	 * Compute the phase adjustment for the next second. The offset is
-	 * reduced by a fixed factor times the time constant.
-	 */
+	/* Compute the phase adjustment for the next second */
 	tick_length	 = tick_length_base;
 
-	delta		 = shift_right(time_offset, SHIFT_PLL + time_constant);
+	delta		 = ntp_offset_chunk(time_offset);
 	time_offset	-= delta;
 	tick_length	+= delta;
 
+	/* Check PPS signal */
+	pps_dec_valid();
+
 	if (!time_adjust)
 		return;
 
@@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
 	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
 		time_state = TIME_OK;
 		time_status = STA_UNSYNC;
+		/* restart PPS frequency calibration */
+		pps_reset_freq_interval();
 	}
 
 	/*
@@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
 		time_freq = txc->freq * PPM_SCALE;
 		time_freq = min(time_freq, MAXFREQ_SCALED);
 		time_freq = max(time_freq, -MAXFREQ_SCALED);
+		/* update pps_freq */
+		pps_set_freq(time_freq);
 	}
 
 	if (txc->modes & ADJ_MAXERROR)
@@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc)
 	}
 
 	result = time_state;	/* mostly `TIME_OK' */
-	if (time_status & (STA_UNSYNC|STA_CLOCKERR))
+	/* check for errors */
+	if (is_error_status(time_status))
 		result = TIME_ERROR;
 
 	txc->freq	   = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
@@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc)
 	txc->tick	   = tick_usec;
 	txc->tai	   = time_tai;
 
-	/* PPS is not implemented, so these are zero */
-	txc->ppsfreq	   = 0;
-	txc->jitter	   = 0;
-	txc->shift	   = 0;
-	txc->stabil	   = 0;
-	txc->jitcnt	   = 0;
-	txc->calcnt	   = 0;
-	txc->errcnt	   = 0;
-	txc->stbcnt	   = 0;
+	/* fill PPS status fields */
+	pps_fill_timex(txc);
 
 	write_sequnlock_irq(&xtime_lock);
 
@@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc)
 	return result;
 }
 
+#ifdef	CONFIG_NTP_PPS
+
+/* actually struct pps_normtime is good old struct timespec, but it is
+ * semantically different (and it is the reason why it was invented):
+ * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ]
+ * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */
+struct pps_normtime {
+	__kernel_time_t	sec;	/* seconds */
+	long		nsec;	/* nanoseconds */
+};
+
+/* normalize the timestamp so that nsec is in the
+   ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */
+static inline struct pps_normtime pps_normalize_ts(struct timespec ts)
+{
+	struct pps_normtime norm = {
+		.sec = ts.tv_sec,
+		.nsec = ts.tv_nsec
+	};
+
+	if (norm.nsec > (NSEC_PER_SEC >> 1)) {
+		norm.nsec -= NSEC_PER_SEC;
+		norm.sec++;
+	}
+
+	return norm;
+}
+
+/* get current phase correction and jitter */
+static inline long pps_phase_filter_get(long *jitter)
+{
+	*jitter = pps_tf[0] - pps_tf[1];
+	if (*jitter < 0)
+		*jitter = -*jitter;
+
+	/* TODO: test various filters */
+	return pps_tf[0];
+}
+
+/* add the sample to the phase filter */
+static inline void pps_phase_filter_add(long err)
+{
+	pps_tf[2] = pps_tf[1];
+	pps_tf[1] = pps_tf[0];
+	pps_tf[0] = err;
+}
+
+/* decrease frequency calibration interval length.
+ * It is halved after four consecutive unstable intervals.
+ */
+static inline void pps_dec_freq_interval(void)
+{
+	if (--pps_intcnt <= -PPS_INTCOUNT) {
+		pps_intcnt = -PPS_INTCOUNT;
+		if (pps_shift > PPS_INTMIN) {
+			pps_shift--;
+			pps_intcnt = 0;
+		}
+	}
+}
+
+/* increase frequency calibration interval length.
+ * It is doubled after four consecutive stable intervals.
+ */
+static inline void pps_inc_freq_interval(void)
+{
+	if (++pps_intcnt >= PPS_INTCOUNT) {
+		pps_intcnt = PPS_INTCOUNT;
+		if (pps_shift < PPS_INTMAX) {
+			pps_shift++;
+			pps_intcnt = 0;
+		}
+	}
+}
+
+/* update clock frequency based on MONOTONIC_RAW clock PPS signal
+ * timestamps
+ *
+ * At the end of the calibration interval the difference between the
+ * first and last MONOTONIC_RAW clock timestamps divided by the length
+ * of the interval becomes the frequency update. If the interval was
+ * too long, the data are discarded.
+ * Returns the difference between old and new frequency values.
+ */
+static long hardpps_update_freq(struct pps_normtime freq_norm)
+{
+	long delta, delta_mod;
+	s64 ftemp;
+
+	/* check if the frequency interval was too long */
+	if (freq_norm.sec > (2 << pps_shift)) {
+		time_status |= STA_PPSERROR;
+		pps_errcnt++;
+		pps_dec_freq_interval();
+		pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
+				freq_norm.sec);
+		return 0;
+	}
+
+	/* here the raw frequency offset and wander (stability) is
+	 * calculated. If the wander is less than the wander threshold
+	 * the interval is increased; otherwise it is decreased.
+	 */
+	ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
+			freq_norm.sec);
+	delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
+	pps_freq = ftemp;
+	if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
+		pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+		time_status |= STA_PPSWANDER;
+		pps_stbcnt++;
+		pps_dec_freq_interval();
+	} else {	/* good sample */
+		pps_inc_freq_interval();
+	}
+
+	/* the stability metric is calculated as the average of recent
+	 * frequency changes, but is used only for performance
+	 * monitoring
+	 */
+	delta_mod = delta;
+	if (delta_mod < 0)
+		delta_mod = -delta_mod;
+	pps_stabil += (div_s64(((s64)delta_mod) <<
+				(NTP_SCALE_SHIFT - SHIFT_USEC),
+				NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
+
+	/* if enabled, the system clock frequency is updated */
+	if ((time_status & STA_PPSFREQ) != 0 &&
+	    (time_status & STA_FREQHOLD) == 0) {
+		time_freq = pps_freq;
+		ntp_update_frequency();
+	}
+
+	return delta;
+}
+
+/* correct REALTIME clock phase error against PPS signal */
+static void hardpps_update_phase(long error)
+{
+	long correction = -error;
+	long jitter;
+
+	/* add the sample to the median filter */
+	pps_phase_filter_add(correction);
+	correction = pps_phase_filter_get(&jitter);
+
+	/* Nominal jitter is due to PPS signal noise. If it exceeds the
+	 * threshold, the sample is discarded; otherwise, if so enabled,
+	 * the time offset is updated.
+	 */
+	if (jitter > (pps_jitter << PPS_POPCORN)) {
+		pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+		       jitter, (pps_jitter << PPS_POPCORN));
+		time_status |= STA_PPSJITTER;
+		pps_jitcnt++;
+	} else if (time_status & STA_PPSTIME) {
+		/* correct the time using the phase offset */
+		time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
+				NTP_INTERVAL_FREQ);
+		/* cancel running adjtime() */
+		time_adjust = 0;
+	}
+	/* update jitter */
+	pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
+}
+
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS signal arrival in order to
+ * discipline the CPU clock oscillator to the PPS signal. It takes two
+ * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
+ * is used to correct clock phase error and the latter is used to
+ * correct the frequency.
+ *
+ * This code is based on David Mills's reference nanokernel
+ * implementation. It was mostly rewritten but keeps the same idea.
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+	struct pps_normtime pts_norm, freq_norm;
+	unsigned long flags;
+
+	pts_norm = pps_normalize_ts(*phase_ts);
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	/* clear the error bits, they will be set again if needed */
+	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+
+	/* indicate signal presence */
+	time_status |= STA_PPSSIGNAL;
+	pps_valid = PPS_VALID;
+
+	/* when called for the first time,
+	 * just start the frequency interval */
+	if (unlikely(pps_fbase.tv_sec == 0)) {
+		pps_fbase = *raw_ts;
+		write_sequnlock_irqrestore(&xtime_lock, flags);
+		return;
+	}
+
+	/* ok, now we have a base for frequency calculation */
+	freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+
+	/* check that the signal is in the range
+	 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
+	if ((freq_norm.sec == 0) ||
+			(freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
+			(freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
+		time_status |= STA_PPSJITTER;
+		/* restart the frequency calibration interval */
+		pps_fbase = *raw_ts;
+		write_sequnlock_irqrestore(&xtime_lock, flags);
+		pr_err("hardpps: PPSJITTER: bad pulse\n");
+		return;
+	}
+
+	/* signal is ok */
+
+	/* check if the current frequency interval is finished */
+	if (freq_norm.sec >= (1 << pps_shift)) {
+		pps_calcnt++;
+		/* restart the frequency calibration interval */
+		pps_fbase = *raw_ts;
+		hardpps_update_freq(freq_norm);
+	}
+
+	hardpps_update_phase(pts_norm.nsec);
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+
+#endif	/* CONFIG_NTP_PPS */
+
 static int __init ntp_tick_adj_setup(char *str)
 {
 	ntp_tick_adj = simple_strtol(str, NULL, 0);
-- 
cgit v1.3-8-gc7d7


From e2c18e49a0d4f822ffc29fb4958943beb1ff08b7 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <lasaine@lvk.cs.msu.su>
Date: Wed, 12 Jan 2011 17:00:57 -0800
Subject: pps: capture MONOTONIC_RAW timestamps as well

MONOTONIC_RAW clock timestamps are ideally suited for frequency
calculation and also fit well into the original NTP hardpps design.  Now
phase and frequency can be adjusted separately: the former based on
REALTIME clock and the latter based on MONOTONIC_RAW clock.

A new function getnstime_raw_and_real is added to timekeeping subsystem to
capture both timestamps at the same time and atomically.

Signed-off-by: Alexander Gordeev <lasaine@lvk.cs.msu.su>
Acked-by: John Stultz <johnstul@us.ibm.com>
Cc: Rodolfo Giometti <giometti@enneenne.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/pps_kernel.h | 14 ++++++++++++++
 include/linux/time.h       |  2 ++
 kernel/time/timekeeping.c  | 43 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h
index 1aedf50088cf..94048547f29a 100644
--- a/include/linux/pps_kernel.h
+++ b/include/linux/pps_kernel.h
@@ -47,6 +47,9 @@ struct pps_source_info {
 };
 
 struct pps_event_time {
+#ifdef CONFIG_NTP_PPS
+	struct timespec ts_raw;
+#endif /* CONFIG_NTP_PPS */
 	struct timespec ts_real;
 };
 
@@ -97,10 +100,21 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt,
 	kt->nsec = ts.tv_nsec;
 }
 
+#ifdef CONFIG_NTP_PPS
+
+static inline void pps_get_ts(struct pps_event_time *ts)
+{
+	getnstime_raw_and_real(&ts->ts_raw, &ts->ts_real);
+}
+
+#else /* CONFIG_NTP_PPS */
+
 static inline void pps_get_ts(struct pps_event_time *ts)
 {
 	getnstimeofday(&ts->ts_real);
 }
 
+#endif /* CONFIG_NTP_PPS */
+
 #endif /* LINUX_PPS_KERNEL_H */
 
diff --git a/include/linux/time.h b/include/linux/time.h
index 9f15ac7ab92a..1e6d3b59238d 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -158,6 +158,8 @@ extern unsigned int alarm_setitimer(unsigned int seconds);
 extern int do_getitimer(int which, struct itimerval *value);
 extern void getnstimeofday(struct timespec *tv);
 extern void getrawmonotonic(struct timespec *ts);
+extern void getnstime_raw_and_real(struct timespec *ts_raw,
+		struct timespec *ts_real);
 extern void getboottime(struct timespec *ts);
 extern void monotonic_to_bootbased(struct timespec *ts);
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5bb86da82003..5536aaf3ba36 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts)
 }
 EXPORT_SYMBOL_GPL(ktime_get_ts);
 
+#ifdef CONFIG_NTP_PPS
+
+/**
+ * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * @ts_raw:	pointer to the timespec to be set to raw monotonic time
+ * @ts_real:	pointer to the timespec to be set to the time of day
+ *
+ * This function reads both the time of day and raw monotonic time at the
+ * same time atomically and stores the resulting timestamps in timespec
+ * format.
+ */
+void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+{
+	unsigned long seq;
+	s64 nsecs_raw, nsecs_real;
+
+	WARN_ON_ONCE(timekeeping_suspended);
+
+	do {
+		u32 arch_offset;
+
+		seq = read_seqbegin(&xtime_lock);
+
+		*ts_raw = raw_time;
+		*ts_real = xtime;
+
+		nsecs_raw = timekeeping_get_ns_raw();
+		nsecs_real = timekeeping_get_ns();
+
+		/* If arch requires, add in gettimeoffset() */
+		arch_offset = arch_gettimeoffset();
+		nsecs_raw += arch_offset;
+		nsecs_real += arch_offset;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	timespec_add_ns(ts_raw, nsecs_raw);
+	timespec_add_ns(ts_real, nsecs_real);
+}
+EXPORT_SYMBOL(getnstime_raw_and_real);
+
+#endif /* CONFIG_NTP_PPS */
+
 /**
  * do_gettimeofday - Returns the time of day in a timeval
  * @tv:		pointer to the timeval to be set
-- 
cgit v1.3-8-gc7d7


From 6c9ae009b298753a3baf71298d676a68b5a10c8f Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 13 Jan 2011 15:45:38 -0800
Subject: irq: use per_cpu kstat_irqs

Use modern per_cpu API to increment {soft|hard}irq counters, and use
per_cpu allocation for (struct irq_desc)->kstats_irq instead of an array.

This gives better SMP/NUMA locality and saves few instructions per irq.

With small nr_cpuids values (8 for example), kstats_irq was a small array
(less than L1_CACHE_BYTES), potentially source of false sharing.

In the !CONFIG_SPARSE_IRQ case, remove the huge, NUMA/cache unfriendly
kstat_irqs_all[NR_IRQS][NR_CPUS] array.

Note: we still populate kstats_irq for all possible irqs in
early_irq_init().  We probably could use on-demand allocations.  (Code
included in alloc_descs()).  Problem is not all IRQS are used with a prior
alloc_descs() call.

kstat_irqs_this_cpu() is not used anymore, remove it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/irqdesc.h     |  2 +-
 include/linux/kernel_stat.h | 19 +++++++++----------
 kernel/irq/irqdesc.c        | 40 ++++++++++++++++++++++++++++++----------
 3 files changed, 40 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 979c68cc7458..6a64c6fa81af 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -57,7 +57,7 @@ struct irq_desc {
 #endif
 
 	struct timer_rand_state *timer_rand_state;
-	unsigned int		*kstat_irqs;
+	unsigned int __percpu	*kstat_irqs;
 	irq_flow_handler_t	handle_irq;
 	struct irqaction	*action;	/* IRQ action list */
 	unsigned int		status;		/* IRQ status */
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44e83ba12b5b..0cce2db580c3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 extern unsigned long long nr_context_switches(void);
 
 #ifndef CONFIG_GENERIC_HARDIRQS
-#define kstat_irqs_this_cpu(irq) \
-	(this_cpu_read(kstat.irqs[irq])
 
 struct irq_desc;
 
 static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
 					    struct irq_desc *desc)
 {
-	kstat_this_cpu.irqs[irq]++;
-	kstat_this_cpu.irqs_sum++;
+	__this_cpu_inc(kstat.irqs[irq]);
+	__this_cpu_inc(kstat.irqs_sum);
 }
 
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
@@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 #else
 #include <linux/irq.h>
 extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
-#define kstat_irqs_this_cpu(DESC) \
-	((DESC)->kstat_irqs[smp_processor_id()])
-#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\
-	((DESC)->kstat_irqs[smp_processor_id()]++);\
-	kstat_this_cpu.irqs_sum++; } while (0)
+
+#define kstat_incr_irqs_this_cpu(irqno, DESC)		\
+do {							\
+	__this_cpu_inc(*(DESC)->kstat_irqs);		\
+	__this_cpu_inc(kstat.irqs_sum);			\
+} while (0)
 
 #endif
 
 static inline void kstat_incr_softirqs_this_cpu(unsigned int irq)
 {
-	kstat_this_cpu.softirqs[irq]++;
+	__this_cpu_inc(kstat.softirqs[irq]);
 }
 
 static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9988d03797f5..282f20230e67 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; }
 
 static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
 {
+	int cpu;
+
 	desc->irq_data.irq = irq;
 	desc->irq_data.chip = &no_irq_chip;
 	desc->irq_data.chip_data = NULL;
@@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
 	desc->irq_count = 0;
 	desc->irqs_unhandled = 0;
 	desc->name = NULL;
-	memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
 	desc_smp_init(desc, node);
 }
 
@@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
 	if (!desc)
 		return NULL;
 	/* allocate based on nr_cpu_ids */
-	desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
-					 gfp, node);
+	desc->kstat_irqs = alloc_percpu(unsigned int);
 	if (!desc->kstat_irqs)
 		goto err_desc;
 
@@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
 	return desc;
 
 err_kstat:
-	kfree(desc->kstat_irqs);
+	free_percpu(desc->kstat_irqs);
 err_desc:
 	kfree(desc);
 	return NULL;
@@ -166,7 +168,7 @@ static void free_desc(unsigned int irq)
 	mutex_unlock(&sparse_irq_lock);
 
 	free_masks(desc);
-	kfree(desc->kstat_irqs);
+	free_percpu(desc->kstat_irqs);
 	kfree(desc);
 }
 
@@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	}
 };
 
-static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
 int __init early_irq_init(void)
 {
 	int count, i, node = first_online_node;
@@ -250,7 +251,8 @@ int __init early_irq_init(void)
 	for (i = 0; i < count; i++) {
 		desc[i].irq_data.irq = i;
 		desc[i].irq_data.chip = &no_irq_chip;
-		desc[i].kstat_irqs = kstat_irqs_all[i];
+		/* TODO : do this allocation on-demand ... */
+		desc[i].kstat_irqs = alloc_percpu(unsigned int);
 		alloc_masks(desc + i, GFP_KERNEL, node);
 		desc_smp_init(desc + i, node);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -275,6 +277,22 @@ static void free_desc(unsigned int irq)
 
 static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
 {
+#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
+	struct irq_desc *desc;
+	unsigned int i;
+
+	for (i = 0; i < cnt; i++) {
+		desc = irq_to_desc(start + i);
+		if (desc && !desc->kstat_irqs) {
+			unsigned int __percpu *stats = alloc_percpu(unsigned int);
+
+			if (!stats)
+				return -1;
+			if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
+				free_percpu(stats);
+		}
+	}
+#endif
 	return start;
 }
 #endif /* !CONFIG_SPARSE_IRQ */
@@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq)
 unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	return desc ? desc->kstat_irqs[cpu] : 0;
+
+	return desc && desc->kstat_irqs ?
+			*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
 }
 
 #ifdef CONFIG_GENERIC_HARDIRQS
@@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq)
 	int cpu;
 	int sum = 0;
 
-	if (!desc)
+	if (!desc || !desc->kstat_irqs)
 		return 0;
 	for_each_possible_cpu(cpu)
-		sum += desc->kstat_irqs[cpu];
+		sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
 	return sum;
 }
 #endif /* CONFIG_GENERIC_HARDIRQS */
-- 
cgit v1.3-8-gc7d7


From 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 13 Jan 2011 15:45:40 -0800
Subject: sched: remove long deprecated CLONE_STOPPED flag

This warning was added in commit bdff746a3915 ("clone: prepare to recycle
CLONE_STOPPED") three years ago.  2.6.26 came and went.  As far as I know,
no-one is actually using CLONE_STOPPED.

Signed-off-by: Dave Jones <davej@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  3 ++-
 kernel/fork.c         | 28 +---------------------------
 2 files changed, 3 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 96e23215e276..07402530fc70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -21,7 +21,8 @@
 #define CLONE_DETACHED		0x00400000	/* Unused, ignored */
 #define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
 #define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
-#define CLONE_STOPPED		0x02000000	/* Start in stopped state */
+/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
+   and is now available for re-use. */
 #define CLONE_NEWUTS		0x04000000	/* New utsname group? */
 #define CLONE_NEWIPC		0x08000000	/* New ipcs */
 #define CLONE_NEWUSER		0x10000000	/* New user namespace */
diff --git a/kernel/fork.c b/kernel/fork.c
index d9b44f20b6b0..76a1fdd80bdf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1409,23 +1409,6 @@ long do_fork(unsigned long clone_flags,
 			return -EPERM;
 	}
 
-	/*
-	 * We hope to recycle these flags after 2.6.26
-	 */
-	if (unlikely(clone_flags & CLONE_STOPPED)) {
-		static int __read_mostly count = 100;
-
-		if (count > 0 && printk_ratelimit()) {
-			char comm[TASK_COMM_LEN];
-
-			count--;
-			printk(KERN_INFO "fork(): process `%s' used deprecated "
-					"clone flags 0x%lx\n",
-				get_task_comm(comm, current),
-				clone_flags & CLONE_STOPPED);
-		}
-	}
-
 	/*
 	 * When called from kernel_thread, don't do user tracing stuff.
 	 */
@@ -1464,16 +1447,7 @@ long do_fork(unsigned long clone_flags,
 		 */
 		p->flags &= ~PF_STARTING;
 
-		if (unlikely(clone_flags & CLONE_STOPPED)) {
-			/*
-			 * We'll start up with an immediate SIGSTOP.
-			 */
-			sigaddset(&p->pending.signal, SIGSTOP);
-			set_tsk_thread_flag(p, TIF_SIGPENDING);
-			__set_task_state(p, TASK_STOPPED);
-		} else {
-			wake_up_new_task(p, clone_flags);
-		}
+		wake_up_new_task(p, clone_flags);
 
 		tracehook_report_clone_complete(trace, regs,
 						clone_flags, nr, p);
-- 
cgit v1.3-8-gc7d7


From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@chromium.org>
Date: Thu, 13 Jan 2011 15:46:05 -0800
Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down

We'd like to be able to oom_score_adj a process up/down as it
enters/leaves the foreground.  Currently, it is not possible to oom_adj
down without CAP_SYS_RESOURCE.  This patch allows a task to decrease its
oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to
or its inherited value at fork.  Assuming the thread that has forked it
has oom_score_adj of 0, each process could decrease it back from 0 upon
activation unless a CAP_SYS_RESOURCE thread elevated it to something
higher.

Alternative considered:

* a setuid binary
* a daemon with CAP_SYS_RESOURCE

Since you don't wan't all processes to be able to reduce their oom_adj, a
setuid or daemon implementation would be complex.  The alternatives also
have much higher overhead.

This patch updated from original patch based on feedback from David
Rientjes.

Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Ying Han <yinghan@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/filesystems/proc.txt | 4 ++++
 fs/proc/base.c                     | 4 +++-
 include/linux/sched.h              | 2 ++
 kernel/fork.c                      | 1 +
 4 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index ef757fca470b..23cae6548d3a 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1323,6 +1323,10 @@ scaled linearly with /proc/<pid>/oom_score_adj.
 Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
 other with its scaled value.
 
+The value of /proc/<pid>/oom_score_adj may be reduced no lower than the last
+value set by a CAP_SYS_RESOURCE process. To reduce the value any lower
+requires CAP_SYS_RESOURCE.
+
 NOTICE: /proc/<pid>/oom_adj is deprecated and will be removed, please see
 Documentation/feature-removal-schedule.txt.
 
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93f1cdd5d3d7..9d096e82b201 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 		goto err_task_lock;
 	}
 
-	if (oom_score_adj < task->signal->oom_score_adj &&
+	if (oom_score_adj < task->signal->oom_score_adj_min &&
 			!capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
@@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 			atomic_dec(&task->mm->oom_disable_count);
 	}
 	task->signal->oom_score_adj = oom_score_adj;
+	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
+		task->signal->oom_score_adj_min = oom_score_adj;
 	/*
 	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
 	 * always attainable.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 07402530fc70..f23b5bb6f52e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -634,6 +634,8 @@ struct signal_struct {
 
 	int oom_adj;		/* OOM kill score adjustment (bit shift) */
 	int oom_score_adj;	/* OOM kill score adjustment */
+	int oom_score_adj_min;	/* OOM kill score adjustment minimum value.
+				 * Only settable by CAP_SYS_RESOURCE. */
 
 	struct mutex cred_guard_mutex;	/* guard against foreign influences on
 					 * credential calculations
diff --git a/kernel/fork.c b/kernel/fork.c
index 76a1fdd80bdf..1499607e4da2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -910,6 +910,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
+	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
 	mutex_init(&sig->cred_guard_mutex);
 
-- 
cgit v1.3-8-gc7d7


From a5b338f2b0b1ff73ae20c66ab831201549eaec01 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:34 -0800
Subject: thp: update futex compound knowledge

Futex code is smarter than most other gup_fast O_DIRECT code and knows
about the compound internals.  However now doing a put_page(head_page)
will not release the pin on the tail page taken by gup-fast, leading to
all sort of refcounting bugchecks.  Getting a stable head_page is a little
tricky.

page_head = page is there because if this is not a tail page it's also the
page_head.  Only in case this is a tail page, compound_head is called,
otherwise it's guaranteed unnecessary.  And if it's a tail page
compound_head has to run atomically inside irq disabled section
__get_user_pages_fast before returning.  Otherwise ->first_page won't be a
stable pointer.

Disableing irq before __get_user_page_fast and releasing irq after running
compound_head is needed because if __get_user_page_fast returns == 1, it
means the huge pmd is established and cannot go away from under us.
pmdp_splitting_flush_notify in __split_huge_page_splitting will have to
wait for local_irq_enable before the IPI delivery can return.  This means
__split_huge_page_refcount can't be running from under us, and in turn
when we run compound_head(page) we're not reading a dangling pointer from
tailpage->first_page.  Then after we get to stable head page, we are
always safe to call compound_lock and after taking the compound lock on
head page we can finally re-check if the page returned by gup-fast is
still a tail page.  in which case we're set and we didn't need to split
the hugepage in order to take a futex on it.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/futex.c | 55 +++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 3019b92e6917..52075633373f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
-	struct page *page;
+	struct page *page, *page_head;
 	int err;
 
 	/*
@@ -265,11 +265,46 @@ again:
 	if (err < 0)
 		return err;
 
-	page = compound_head(page);
-	lock_page(page);
-	if (!page->mapping) {
-		unlock_page(page);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	page_head = page;
+	if (unlikely(PageTail(page))) {
 		put_page(page);
+		/* serialize against __split_huge_page_splitting() */
+		local_irq_disable();
+		if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+			page_head = compound_head(page);
+			/*
+			 * page_head is valid pointer but we must pin
+			 * it before taking the PG_lock and/or
+			 * PG_compound_lock. The moment we re-enable
+			 * irqs __split_huge_page_splitting() can
+			 * return and the head page can be freed from
+			 * under us. We can't take the PG_lock and/or
+			 * PG_compound_lock on a page that could be
+			 * freed from under us.
+			 */
+			if (page != page_head) {
+				get_page(page_head);
+				put_page(page);
+			}
+			local_irq_enable();
+		} else {
+			local_irq_enable();
+			goto again;
+		}
+	}
+#else
+	page_head = compound_head(page);
+	if (page != page_head) {
+		get_page(page_head);
+		put_page(page);
+	}
+#endif
+
+	lock_page(page_head);
+	if (!page_head->mapping) {
+		unlock_page(page_head);
+		put_page(page_head);
 		goto again;
 	}
 
@@ -280,20 +315,20 @@ again:
 	 * it's a read-only handle, it's expected that futexes attach to
 	 * the object not the particular process.
 	 */
-	if (PageAnon(page)) {
+	if (PageAnon(page_head)) {
 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
 	} else {
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-		key->shared.inode = page->mapping->host;
-		key->shared.pgoff = page->index;
+		key->shared.inode = page_head->mapping->host;
+		key->shared.pgoff = page_head->index;
 	}
 
 	get_futex_key_refs(key);
 
-	unlock_page(page);
-	put_page(page);
+	unlock_page(page_head);
+	put_page(page_head);
 	return 0;
 }
 
-- 
cgit v1.3-8-gc7d7


From e7a00c45f29c0155007aa150bf231a70fa470365 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:45 -0800
Subject: thp: add pmd_huge_pte to mm_struct

This increase the size of the mm struct a bit but it is needed to
preallocate one pte for each hugepage so that split_huge_page will not
require a fail path.  Guarantee of success is a fundamental property of
split_huge_page to avoid decrasing swapping reliability and to avoid
adding -ENOMEM fail paths that would otherwise force the hugepage-unaware
VM code to learn rolling back in the middle of its pte mangling operations
(if something we need it to learn handling pmd_trans_huge natively rather
being capable of rollback).  When split_huge_page runs a pte is needed to
succeed the split, to map the newly splitted regular pages with a regular
pte.  This way all existing VM code remains backwards compatible by just
adding a split_huge_page* one liner.  The memory waste of those
preallocated ptes is negligible and so it is worth it.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h | 3 +++
 kernel/fork.c            | 7 +++++++
 2 files changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bb7288a782fd..26bc4e2cd275 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -309,6 +309,9 @@ struct mm_struct {
 #endif
 #ifdef CONFIG_MMU_NOTIFIER
 	struct mmu_notifier_mm *mmu_notifier_mm;
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	pgtable_t pmd_huge_pte; /* protected by page_table_lock */
 #endif
 	/* How many tasks sharing this mm are OOM_DISABLE */
 	atomic_t oom_disable_count;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1499607e4da2..f78f50ba6cb2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -529,6 +529,9 @@ void __mmdrop(struct mm_struct *mm)
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(mm->pmd_huge_pte);
+#endif
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -669,6 +672,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	mm->token_priority = 0;
 	mm->last_interval = 0;
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	mm->pmd_huge_pte = NULL;
+#endif
+
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
-- 
cgit v1.3-8-gc7d7


From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 13 Jan 2011 15:46:58 -0800
Subject: thp: khugepaged

Add khugepaged to relocate fragmented pages into hugepages if new
hugepages become available.  (this is indipendent of the defrag logic that
will have to make new hugepages available)

The fundamental reason why khugepaged is unavoidable, is that some memory
can be fragmented and not everything can be relocated.  So when a virtual
machine quits and releases gigabytes of hugepages, we want to use those
freely available hugepages to create huge-pmd in the other virtual
machines that may be running on fragmented memory, to maximize the CPU
efficiency at all times.  The scan is slow, it takes nearly zero cpu time,
except when it copies data (in which case it means we definitely want to
pay for that cpu time) so it seems a good tradeoff.

In addition to the hugepages being released by other process releasing
memory, we have the strong suspicion that the performance impact of
potentially defragmenting hugepages during or before each page fault could
lead to more performance inconsistency than allocating small pages at
first and having them collapsed into large pages later...  if they prove
themselfs to be long lived mappings (khugepaged scan is slow so short
lived mappings have low probability to run into khugepaged if compared to
long lived mappings).

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h    |    1 +
 include/linux/khugepaged.h |   66 +++
 include/linux/sched.h      |    1 +
 kernel/fork.c              |    5 +
 mm/huge_memory.c           | 1073 +++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 1136 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/khugepaged.h

(limited to 'kernel')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d9ab70d776e2..43a694ef8904 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -25,6 +25,7 @@ enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
 	TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+	TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
 #ifdef CONFIG_DEBUG_VM
 	TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
 #endif
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
new file mode 100644
index 000000000000..552f3184756c
--- /dev/null
+++ b/include/linux/khugepaged.h
@@ -0,0 +1,66 @@
+#ifndef _LINUX_KHUGEPAGED_H
+#define _LINUX_KHUGEPAGED_H
+
+#include <linux/sched.h> /* MMF_VM_HUGEPAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+extern int __khugepaged_enter(struct mm_struct *mm);
+extern void __khugepaged_exit(struct mm_struct *mm);
+extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma);
+
+#define khugepaged_enabled()					       \
+	(transparent_hugepage_flags &				       \
+	 ((1<<TRANSPARENT_HUGEPAGE_FLAG) |		       \
+	  (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)))
+#define khugepaged_always()				\
+	(transparent_hugepage_flags &			\
+	 (1<<TRANSPARENT_HUGEPAGE_FLAG))
+#define khugepaged_req_madv()					\
+	(transparent_hugepage_flags &				\
+	 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))
+#define khugepaged_defrag()					\
+	(transparent_hugepage_flags &				\
+	 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
+
+static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags))
+		return __khugepaged_enter(mm);
+	return 0;
+}
+
+static inline void khugepaged_exit(struct mm_struct *mm)
+{
+	if (test_bit(MMF_VM_HUGEPAGE, &mm->flags))
+		__khugepaged_exit(mm);
+}
+
+static inline int khugepaged_enter(struct vm_area_struct *vma)
+{
+	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
+		if (khugepaged_always() ||
+		    (khugepaged_req_madv() &&
+		     vma->vm_flags & VM_HUGEPAGE))
+			if (__khugepaged_enter(vma->vm_mm))
+				return -ENOMEM;
+	return 0;
+}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	return 0;
+}
+static inline void khugepaged_exit(struct mm_struct *mm)
+{
+}
+static inline int khugepaged_enter(struct vm_area_struct *vma)
+{
+	return 0;
+}
+static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+	return 0;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#endif /* _LINUX_KHUGEPAGED_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f23b5bb6f52e..d747f948b34e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -434,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm);
 #endif
 					/* leave room for more dump flags */
 #define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
+#define MMF_VM_HUGEPAGE		17	/* set when VM_HUGEPAGE is set on vma */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index f78f50ba6cb2..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
 #include <linux/posix-timers.h>
 #include <linux/user-return-notifier.h>
 #include <linux/oom.h>
+#include <linux/khugepaged.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -328,6 +329,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	rb_parent = NULL;
 	pprev = &mm->mmap;
 	retval = ksm_fork(mm, oldmm);
+	if (retval)
+		goto out;
+	retval = khugepaged_fork(mm, oldmm);
 	if (retval)
 		goto out;
 
@@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm)
 	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		ksm_exit(mm);
+		khugepaged_exit(mm); /* must run before exit_mmap */
 		exit_mmap(mm);
 		set_mm_exe_file(mm, NULL);
 		if (!list_empty(&mm->mmlist)) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7101112a5429..ae2bf08b1099 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,14 +12,111 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
 
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ */
 unsigned long transparent_hugepage_flags __read_mostly =
-	(1<<TRANSPARENT_HUGEPAGE_FLAG);
+	(1<<TRANSPARENT_HUGEPAGE_FLAG)|
+	(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+
+/* default scan 8*512 pte (or vmas) every 30 second */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
+static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+	struct hlist_node hash;
+	struct list_head mm_node;
+	struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+	struct list_head mm_head;
+	struct mm_slot *mm_slot;
+	unsigned long address;
+} khugepaged_scan = {
+	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+static int start_khugepaged(void)
+{
+	int err = 0;
+	if (khugepaged_enabled()) {
+		int wakeup;
+		if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
+			err = -ENOMEM;
+			goto out;
+		}
+		mutex_lock(&khugepaged_mutex);
+		if (!khugepaged_thread)
+			khugepaged_thread = kthread_run(khugepaged, NULL,
+							"khugepaged");
+		if (unlikely(IS_ERR(khugepaged_thread))) {
+			printk(KERN_ERR
+			       "khugepaged: kthread_run(khugepaged) failed\n");
+			err = PTR_ERR(khugepaged_thread);
+			khugepaged_thread = NULL;
+		}
+		wakeup = !list_empty(&khugepaged_scan.mm_head);
+		mutex_unlock(&khugepaged_mutex);
+		if (wakeup)
+			wake_up_interruptible(&khugepaged_wait);
+	} else
+		/* wakeup to exit */
+		wake_up_interruptible(&khugepaged_wait);
+out:
+	return err;
+}
 
 #ifdef CONFIG_SYSFS
+
 static ssize_t double_flag_show(struct kobject *kobj,
 				struct kobj_attribute *attr, char *buf,
 				enum transparent_hugepage_flag enabled,
@@ -68,9 +165,19 @@ static ssize_t enabled_store(struct kobject *kobj,
 			     struct kobj_attribute *attr,
 			     const char *buf, size_t count)
 {
-	return double_flag_store(kobj, attr, buf, count,
-				 TRANSPARENT_HUGEPAGE_FLAG,
-				 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+	ssize_t ret;
+
+	ret = double_flag_store(kobj, attr, buf, count,
+				TRANSPARENT_HUGEPAGE_FLAG,
+				TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+
+	if (ret > 0) {
+		int err = start_khugepaged();
+		if (err)
+			ret = err;
+	}
+
+	return ret;
 }
 static struct kobj_attribute enabled_attr =
 	__ATTR(enabled, 0644, enabled_show, enabled_store);
@@ -153,20 +260,212 @@ static struct attribute *hugepage_attr[] = {
 
 static struct attribute_group hugepage_attr_group = {
 	.attrs = hugepage_attr,
-	.name = "transparent_hugepage",
+};
+
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 char *buf)
+{
+	return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	unsigned long msecs;
+	int err;
+
+	err = strict_strtoul(buf, 10, &msecs);
+	if (err || msecs > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_scan_sleep_millisecs = msecs;
+	wake_up_interruptible(&khugepaged_wait);
+
+	return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+	__ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+	       scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  char *buf)
+{
+	return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+					   struct kobj_attribute *attr,
+					   const char *buf, size_t count)
+{
+	unsigned long msecs;
+	int err;
+
+	err = strict_strtoul(buf, 10, &msecs);
+	if (err || msecs > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_alloc_sleep_millisecs = msecs;
+	wake_up_interruptible(&khugepaged_wait);
+
+	return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+	__ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+	       alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  char *buf)
+{
+	return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int err;
+	unsigned long pages;
+
+	err = strict_strtoul(buf, 10, &pages);
+	if (err || !pages || pages > UINT_MAX)
+		return -EINVAL;
+
+	khugepaged_pages_to_scan = pages;
+
+	return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+	__ATTR(pages_to_scan, 0644, pages_to_scan_show,
+	       pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+				    struct kobj_attribute *attr,
+				    char *buf)
+{
+	return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+	__ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       char *buf)
+{
+	return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+	__ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *buf)
+{
+	return single_flag_show(kobj, attr, buf,
+				TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+				       struct kobj_attribute *attr,
+				       const char *buf, size_t count)
+{
+	return single_flag_store(kobj, attr, buf, count,
+				 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+	__ATTR(defrag, 0644, khugepaged_defrag_show,
+	       khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+					     struct kobj_attribute *attr,
+					     char *buf)
+{
+	return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+					      struct kobj_attribute *attr,
+					      const char *buf, size_t count)
+{
+	int err;
+	unsigned long max_ptes_none;
+
+	err = strict_strtoul(buf, 10, &max_ptes_none);
+	if (err || max_ptes_none > HPAGE_PMD_NR-1)
+		return -EINVAL;
+
+	khugepaged_max_ptes_none = max_ptes_none;
+
+	return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+	__ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+	       khugepaged_max_ptes_none_store);
+
+static struct attribute *khugepaged_attr[] = {
+	&khugepaged_defrag_attr.attr,
+	&khugepaged_max_ptes_none_attr.attr,
+	&pages_to_scan_attr.attr,
+	&pages_collapsed_attr.attr,
+	&full_scans_attr.attr,
+	&scan_sleep_millisecs_attr.attr,
+	&alloc_sleep_millisecs_attr.attr,
+	NULL,
+};
+
+static struct attribute_group khugepaged_attr_group = {
+	.attrs = khugepaged_attr,
+	.name = "khugepaged",
 };
 #endif /* CONFIG_SYSFS */
 
 static int __init hugepage_init(void)
 {
-#ifdef CONFIG_SYSFS
 	int err;
+#ifdef CONFIG_SYSFS
+	static struct kobject *hugepage_kobj;
 
-	err = sysfs_create_group(mm_kobj, &hugepage_attr_group);
-	if (err)
-		printk(KERN_ERR "hugepage: register sysfs failed\n");
+	err = -ENOMEM;
+	hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+	if (unlikely(!hugepage_kobj)) {
+		printk(KERN_ERR "hugepage: failed kobject create\n");
+		goto out;
+	}
+
+	err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+	if (err) {
+		printk(KERN_ERR "hugepage: failed register hugeage group\n");
+		goto out;
+	}
+
+	err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+	if (err) {
+		printk(KERN_ERR "hugepage: failed register hugeage group\n");
+		goto out;
+	}
 #endif
-	return 0;
+
+	err = khugepaged_slab_init();
+	if (err)
+		goto out;
+
+	err = mm_slots_hash_init();
+	if (err) {
+		khugepaged_slab_free();
+		goto out;
+	}
+
+	start_khugepaged();
+
+out:
+	return err;
 }
 module_init(hugepage_init)
 
@@ -285,6 +584,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
 		if (unlikely(anon_vma_prepare(vma)))
 			return VM_FAULT_OOM;
+		if (unlikely(khugepaged_enter(vma)))
+			return VM_FAULT_OOM;
 		page = alloc_hugepage(transparent_hugepage_defrag(vma));
 		if (unlikely(!page))
 			goto out;
@@ -941,6 +1242,758 @@ int hugepage_madvise(unsigned long *vm_flags)
 	return 0;
 }
 
+static int __init khugepaged_slab_init(void)
+{
+	mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+					  sizeof(struct mm_slot),
+					  __alignof__(struct mm_slot), 0, NULL);
+	if (!mm_slot_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void __init khugepaged_slab_free(void)
+{
+	kmem_cache_destroy(mm_slot_cache);
+	mm_slot_cache = NULL;
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+	if (!mm_slot_cache)	/* initialization failed */
+		return NULL;
+	return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+	kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static int __init mm_slots_hash_init(void)
+{
+	mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
+				GFP_KERNEL);
+	if (!mm_slots_hash)
+		return -ENOMEM;
+	return 0;
+}
+
+#if 0
+static void __init mm_slots_hash_free(void)
+{
+	kfree(mm_slots_hash);
+	mm_slots_hash = NULL;
+}
+#endif
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+	struct mm_slot *mm_slot;
+	struct hlist_head *bucket;
+	struct hlist_node *node;
+
+	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+				% MM_SLOTS_HASH_HEADS];
+	hlist_for_each_entry(mm_slot, node, bucket, hash) {
+		if (mm == mm_slot->mm)
+			return mm_slot;
+	}
+	return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+				    struct mm_slot *mm_slot)
+{
+	struct hlist_head *bucket;
+
+	bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+				% MM_SLOTS_HASH_HEADS];
+	mm_slot->mm = mm;
+	hlist_add_head(&mm_slot->hash, bucket);
+}
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+	return atomic_read(&mm->mm_users) == 0;
+}
+
+int __khugepaged_enter(struct mm_struct *mm)
+{
+	struct mm_slot *mm_slot;
+	int wakeup;
+
+	mm_slot = alloc_mm_slot();
+	if (!mm_slot)
+		return -ENOMEM;
+
+	/* __khugepaged_exit() must not run from under us */
+	VM_BUG_ON(khugepaged_test_exit(mm));
+	if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+		free_mm_slot(mm_slot);
+		return 0;
+	}
+
+	spin_lock(&khugepaged_mm_lock);
+	insert_to_mm_slots_hash(mm, mm_slot);
+	/*
+	 * Insert just behind the scanning cursor, to let the area settle
+	 * down a little.
+	 */
+	wakeup = list_empty(&khugepaged_scan.mm_head);
+	list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+	spin_unlock(&khugepaged_mm_lock);
+
+	atomic_inc(&mm->mm_count);
+	if (wakeup)
+		wake_up_interruptible(&khugepaged_wait);
+
+	return 0;
+}
+
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+	unsigned long hstart, hend;
+	if (!vma->anon_vma)
+		/*
+		 * Not yet faulted in so we will register later in the
+		 * page fault if needed.
+		 */
+		return 0;
+	if (vma->vm_file || vma->vm_ops)
+		/* khugepaged not yet working on file or special mappings */
+		return 0;
+	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+	hend = vma->vm_end & HPAGE_PMD_MASK;
+	if (hstart < hend)
+		return khugepaged_enter(vma);
+	return 0;
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+	struct mm_slot *mm_slot;
+	int free = 0;
+
+	spin_lock(&khugepaged_mm_lock);
+	mm_slot = get_mm_slot(mm);
+	if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+		hlist_del(&mm_slot->hash);
+		list_del(&mm_slot->mm_node);
+		free = 1;
+	}
+
+	if (free) {
+		spin_unlock(&khugepaged_mm_lock);
+		clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+		free_mm_slot(mm_slot);
+		mmdrop(mm);
+	} else if (mm_slot) {
+		spin_unlock(&khugepaged_mm_lock);
+		/*
+		 * This is required to serialize against
+		 * khugepaged_test_exit() (which is guaranteed to run
+		 * under mmap sem read mode). Stop here (after we
+		 * return all pagetables will be destroyed) until
+		 * khugepaged has finished working on the pagetables
+		 * under the mmap_sem.
+		 */
+		down_write(&mm->mmap_sem);
+		up_write(&mm->mmap_sem);
+	} else
+		spin_unlock(&khugepaged_mm_lock);
+}
+
+static void release_pte_page(struct page *page)
+{
+	/* 0 stands for page_is_file_cache(page) == false */
+	dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+	unlock_page(page);
+	putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+	while (--_pte >= pte) {
+		pte_t pteval = *_pte;
+		if (!pte_none(pteval))
+			release_pte_page(pte_page(pteval));
+	}
+}
+
+static void release_all_pte_pages(pte_t *pte)
+{
+	release_pte_pages(pte, pte + HPAGE_PMD_NR);
+}
+
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+					unsigned long address,
+					pte_t *pte)
+{
+	struct page *page;
+	pte_t *_pte;
+	int referenced = 0, isolated = 0, none = 0;
+	for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+	     _pte++, address += PAGE_SIZE) {
+		pte_t pteval = *_pte;
+		if (pte_none(pteval)) {
+			if (++none <= khugepaged_max_ptes_none)
+				continue;
+			else {
+				release_pte_pages(pte, _pte);
+				goto out;
+			}
+		}
+		if (!pte_present(pteval) || !pte_write(pteval)) {
+			release_pte_pages(pte, _pte);
+			goto out;
+		}
+		page = vm_normal_page(vma, address, pteval);
+		if (unlikely(!page)) {
+			release_pte_pages(pte, _pte);
+			goto out;
+		}
+		VM_BUG_ON(PageCompound(page));
+		BUG_ON(!PageAnon(page));
+		VM_BUG_ON(!PageSwapBacked(page));
+
+		/* cannot use mapcount: can't collapse if there's a gup pin */
+		if (page_count(page) != 1) {
+			release_pte_pages(pte, _pte);
+			goto out;
+		}
+		/*
+		 * We can do it before isolate_lru_page because the
+		 * page can't be freed from under us. NOTE: PG_lock
+		 * is needed to serialize against split_huge_page
+		 * when invoked from the VM.
+		 */
+		if (!trylock_page(page)) {
+			release_pte_pages(pte, _pte);
+			goto out;
+		}
+		/*
+		 * Isolate the page to avoid collapsing an hugepage
+		 * currently in use by the VM.
+		 */
+		if (isolate_lru_page(page)) {
+			unlock_page(page);
+			release_pte_pages(pte, _pte);
+			goto out;
+		}
+		/* 0 stands for page_is_file_cache(page) == false */
+		inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+		VM_BUG_ON(!PageLocked(page));
+		VM_BUG_ON(PageLRU(page));
+
+		/* If there is no mapped pte young don't collapse the page */
+		if (pte_young(pteval))
+			referenced = 1;
+	}
+	if (unlikely(!referenced))
+		release_all_pte_pages(pte);
+	else
+		isolated = 1;
+out:
+	return isolated;
+}
+
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+				      struct vm_area_struct *vma,
+				      unsigned long address,
+				      spinlock_t *ptl)
+{
+	pte_t *_pte;
+	for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+		pte_t pteval = *_pte;
+		struct page *src_page;
+
+		if (pte_none(pteval)) {
+			clear_user_highpage(page, address);
+			add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+		} else {
+			src_page = pte_page(pteval);
+			copy_user_highpage(page, src_page, address, vma);
+			VM_BUG_ON(page_mapcount(src_page) != 1);
+			VM_BUG_ON(page_count(src_page) != 2);
+			release_pte_page(src_page);
+			/*
+			 * ptl mostly unnecessary, but preempt has to
+			 * be disabled to update the per-cpu stats
+			 * inside page_remove_rmap().
+			 */
+			spin_lock(ptl);
+			/*
+			 * paravirt calls inside pte_clear here are
+			 * superfluous.
+			 */
+			pte_clear(vma->vm_mm, address, _pte);
+			page_remove_rmap(src_page);
+			spin_unlock(ptl);
+			free_page_and_swap_cache(src_page);
+		}
+
+		address += PAGE_SIZE;
+		page++;
+	}
+}
+
+static void collapse_huge_page(struct mm_struct *mm,
+			       unsigned long address,
+			       struct page **hpage)
+{
+	struct vm_area_struct *vma;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd, _pmd;
+	pte_t *pte;
+	pgtable_t pgtable;
+	struct page *new_page;
+	spinlock_t *ptl;
+	int isolated;
+	unsigned long hstart, hend;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	VM_BUG_ON(!*hpage);
+
+	/*
+	 * Prevent all access to pagetables with the exception of
+	 * gup_fast later hanlded by the ptep_clear_flush and the VM
+	 * handled by the anon_vma lock + PG_lock.
+	 */
+	down_write(&mm->mmap_sem);
+	if (unlikely(khugepaged_test_exit(mm)))
+		goto out;
+
+	vma = find_vma(mm, address);
+	hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+	hend = vma->vm_end & HPAGE_PMD_MASK;
+	if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+		goto out;
+
+	if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+		goto out;
+
+	/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+	if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+		goto out;
+	VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	/* pmd can't go away or become huge under us */
+	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+		goto out;
+
+	new_page = *hpage;
+	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+		goto out;
+
+	anon_vma_lock(vma->anon_vma);
+
+	pte = pte_offset_map(pmd, address);
+	ptl = pte_lockptr(mm, pmd);
+
+	spin_lock(&mm->page_table_lock); /* probably unnecessary */
+	/*
+	 * After this gup_fast can't run anymore. This also removes
+	 * any huge TLB entry from the CPU so we won't allow
+	 * huge and small TLB entries for the same virtual address
+	 * to avoid the risk of CPU bugs in that area.
+	 */
+	_pmd = pmdp_clear_flush_notify(vma, address, pmd);
+	spin_unlock(&mm->page_table_lock);
+
+	spin_lock(ptl);
+	isolated = __collapse_huge_page_isolate(vma, address, pte);
+	spin_unlock(ptl);
+	pte_unmap(pte);
+
+	if (unlikely(!isolated)) {
+		spin_lock(&mm->page_table_lock);
+		BUG_ON(!pmd_none(*pmd));
+		set_pmd_at(mm, address, pmd, _pmd);
+		spin_unlock(&mm->page_table_lock);
+		anon_vma_unlock(vma->anon_vma);
+		mem_cgroup_uncharge_page(new_page);
+		goto out;
+	}
+
+	/*
+	 * All pages are isolated and locked so anon_vma rmap
+	 * can't run anymore.
+	 */
+	anon_vma_unlock(vma->anon_vma);
+
+	__collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+	__SetPageUptodate(new_page);
+	pgtable = pmd_pgtable(_pmd);
+	VM_BUG_ON(page_count(pgtable) != 1);
+	VM_BUG_ON(page_mapcount(pgtable) != 0);
+
+	_pmd = mk_pmd(new_page, vma->vm_page_prot);
+	_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+	_pmd = pmd_mkhuge(_pmd);
+
+	/*
+	 * spin_lock() below is not the equivalent of smp_wmb(), so
+	 * this is needed to avoid the copy_huge_page writes to become
+	 * visible after the set_pmd_at() write.
+	 */
+	smp_wmb();
+
+	spin_lock(&mm->page_table_lock);
+	BUG_ON(!pmd_none(*pmd));
+	page_add_new_anon_rmap(new_page, vma, address);
+	set_pmd_at(mm, address, pmd, _pmd);
+	update_mmu_cache(vma, address, entry);
+	prepare_pmd_huge_pte(pgtable, mm);
+	mm->nr_ptes--;
+	spin_unlock(&mm->page_table_lock);
+
+	*hpage = NULL;
+	khugepaged_pages_collapsed++;
+out:
+	up_write(&mm->mmap_sem);
+}
+
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+			       struct vm_area_struct *vma,
+			       unsigned long address,
+			       struct page **hpage)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte, *_pte;
+	int ret = 0, referenced = 0, none = 0;
+	struct page *page;
+	unsigned long _address;
+	spinlock_t *ptl;
+
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+		goto out;
+
+	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+	for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+	     _pte++, _address += PAGE_SIZE) {
+		pte_t pteval = *_pte;
+		if (pte_none(pteval)) {
+			if (++none <= khugepaged_max_ptes_none)
+				continue;
+			else
+				goto out_unmap;
+		}
+		if (!pte_present(pteval) || !pte_write(pteval))
+			goto out_unmap;
+		page = vm_normal_page(vma, _address, pteval);
+		if (unlikely(!page))
+			goto out_unmap;
+		VM_BUG_ON(PageCompound(page));
+		if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+			goto out_unmap;
+		/* cannot use mapcount: can't collapse if there's a gup pin */
+		if (page_count(page) != 1)
+			goto out_unmap;
+		if (pte_young(pteval))
+			referenced = 1;
+	}
+	if (referenced)
+		ret = 1;
+out_unmap:
+	pte_unmap_unlock(pte, ptl);
+	if (ret) {
+		up_read(&mm->mmap_sem);
+		collapse_huge_page(mm, address, hpage);
+	}
+out:
+	return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+	struct mm_struct *mm = mm_slot->mm;
+
+	VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+	if (khugepaged_test_exit(mm)) {
+		/* free mm_slot */
+		hlist_del(&mm_slot->hash);
+		list_del(&mm_slot->mm_node);
+
+		/*
+		 * Not strictly needed because the mm exited already.
+		 *
+		 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+		 */
+
+		/* khugepaged_mm_lock actually not necessary for the below */
+		free_mm_slot(mm_slot);
+		mmdrop(mm);
+	}
+}
+
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+					    struct page **hpage)
+{
+	struct mm_slot *mm_slot;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	int progress = 0;
+
+	VM_BUG_ON(!pages);
+	VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+	if (khugepaged_scan.mm_slot)
+		mm_slot = khugepaged_scan.mm_slot;
+	else {
+		mm_slot = list_entry(khugepaged_scan.mm_head.next,
+				     struct mm_slot, mm_node);
+		khugepaged_scan.address = 0;
+		khugepaged_scan.mm_slot = mm_slot;
+	}
+	spin_unlock(&khugepaged_mm_lock);
+
+	mm = mm_slot->mm;
+	down_read(&mm->mmap_sem);
+	if (unlikely(khugepaged_test_exit(mm)))
+		vma = NULL;
+	else
+		vma = find_vma(mm, khugepaged_scan.address);
+
+	progress++;
+	for (; vma; vma = vma->vm_next) {
+		unsigned long hstart, hend;
+
+		cond_resched();
+		if (unlikely(khugepaged_test_exit(mm))) {
+			progress++;
+			break;
+		}
+
+		if (!(vma->vm_flags & VM_HUGEPAGE) &&
+		    !khugepaged_always()) {
+			progress++;
+			continue;
+		}
+
+		/* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+		if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
+			khugepaged_scan.address = vma->vm_end;
+			progress++;
+			continue;
+		}
+		VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+		hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+		hend = vma->vm_end & HPAGE_PMD_MASK;
+		if (hstart >= hend) {
+			progress++;
+			continue;
+		}
+		if (khugepaged_scan.address < hstart)
+			khugepaged_scan.address = hstart;
+		if (khugepaged_scan.address > hend) {
+			khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
+			progress++;
+			continue;
+		}
+		BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+		while (khugepaged_scan.address < hend) {
+			int ret;
+			cond_resched();
+			if (unlikely(khugepaged_test_exit(mm)))
+				goto breakouterloop;
+
+			VM_BUG_ON(khugepaged_scan.address < hstart ||
+				  khugepaged_scan.address + HPAGE_PMD_SIZE >
+				  hend);
+			ret = khugepaged_scan_pmd(mm, vma,
+						  khugepaged_scan.address,
+						  hpage);
+			/* move to next address */
+			khugepaged_scan.address += HPAGE_PMD_SIZE;
+			progress += HPAGE_PMD_NR;
+			if (ret)
+				/* we released mmap_sem so break loop */
+				goto breakouterloop_mmap_sem;
+			if (progress >= pages)
+				goto breakouterloop;
+		}
+	}
+breakouterloop:
+	up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+	spin_lock(&khugepaged_mm_lock);
+	BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+	/*
+	 * Release the current mm_slot if this mm is about to die, or
+	 * if we scanned all vmas of this mm.
+	 */
+	if (khugepaged_test_exit(mm) || !vma) {
+		/*
+		 * Make sure that if mm_users is reaching zero while
+		 * khugepaged runs here, khugepaged_exit will find
+		 * mm_slot not pointing to the exiting mm.
+		 */
+		if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+			khugepaged_scan.mm_slot = list_entry(
+				mm_slot->mm_node.next,
+				struct mm_slot, mm_node);
+			khugepaged_scan.address = 0;
+		} else {
+			khugepaged_scan.mm_slot = NULL;
+			khugepaged_full_scans++;
+		}
+
+		collect_mm_slot(mm_slot);
+	}
+
+	return progress;
+}
+
+static int khugepaged_has_work(void)
+{
+	return !list_empty(&khugepaged_scan.mm_head) &&
+		khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void)
+{
+	return !list_empty(&khugepaged_scan.mm_head) ||
+		!khugepaged_enabled();
+}
+
+static void khugepaged_do_scan(struct page **hpage)
+{
+	unsigned int progress = 0, pass_through_head = 0;
+	unsigned int pages = khugepaged_pages_to_scan;
+
+	barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+	while (progress < pages) {
+		cond_resched();
+
+		if (!*hpage) {
+			*hpage = alloc_hugepage(khugepaged_defrag());
+			if (unlikely(!*hpage))
+				break;
+		}
+
+		spin_lock(&khugepaged_mm_lock);
+		if (!khugepaged_scan.mm_slot)
+			pass_through_head++;
+		if (khugepaged_has_work() &&
+		    pass_through_head < 2)
+			progress += khugepaged_scan_mm_slot(pages - progress,
+							    hpage);
+		else
+			progress = pages;
+		spin_unlock(&khugepaged_mm_lock);
+	}
+}
+
+static struct page *khugepaged_alloc_hugepage(void)
+{
+	struct page *hpage;
+
+	do {
+		hpage = alloc_hugepage(khugepaged_defrag());
+		if (!hpage) {
+			DEFINE_WAIT(wait);
+			add_wait_queue(&khugepaged_wait, &wait);
+			schedule_timeout_interruptible(
+				msecs_to_jiffies(
+					khugepaged_alloc_sleep_millisecs));
+			remove_wait_queue(&khugepaged_wait, &wait);
+		}
+	} while (unlikely(!hpage) &&
+		 likely(khugepaged_enabled()));
+	return hpage;
+}
+
+static void khugepaged_loop(void)
+{
+	struct page *hpage;
+
+	while (likely(khugepaged_enabled())) {
+		hpage = khugepaged_alloc_hugepage();
+		if (unlikely(!hpage))
+			break;
+
+		khugepaged_do_scan(&hpage);
+		if (hpage)
+			put_page(hpage);
+		if (khugepaged_has_work()) {
+			DEFINE_WAIT(wait);
+			if (!khugepaged_scan_sleep_millisecs)
+				continue;
+			add_wait_queue(&khugepaged_wait, &wait);
+			schedule_timeout_interruptible(
+				msecs_to_jiffies(
+					khugepaged_scan_sleep_millisecs));
+			remove_wait_queue(&khugepaged_wait, &wait);
+		} else if (khugepaged_enabled())
+			wait_event_interruptible(khugepaged_wait,
+						 khugepaged_wait_event());
+	}
+}
+
+static int khugepaged(void *none)
+{
+	struct mm_slot *mm_slot;
+
+	set_user_nice(current, 19);
+
+	/* serialize with start_khugepaged() */
+	mutex_lock(&khugepaged_mutex);
+
+	for (;;) {
+		mutex_unlock(&khugepaged_mutex);
+		BUG_ON(khugepaged_thread != current);
+		khugepaged_loop();
+		BUG_ON(khugepaged_thread != current);
+
+		mutex_lock(&khugepaged_mutex);
+		if (!khugepaged_enabled())
+			break;
+	}
+
+	spin_lock(&khugepaged_mm_lock);
+	mm_slot = khugepaged_scan.mm_slot;
+	khugepaged_scan.mm_slot = NULL;
+	if (mm_slot)
+		collect_mm_slot(mm_slot);
+	spin_unlock(&khugepaged_mm_lock);
+
+	khugepaged_thread = NULL;
+	mutex_unlock(&khugepaged_mutex);
+
+	return 0;
+}
+
 void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
 {
 	struct page *page;
-- 
cgit v1.3-8-gc7d7


From 3ec762ad8be364c2fadfe0d6b2cc6d4d3b5e1b54 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 14 Jan 2011 11:34:34 +0800
Subject: cgroups: Fix a lockdep warning at cgroup removal

Commit 2fd6b7f5 ("fs: dcache scale subdirs") forgot to annotate a dentry
lock, which caused a lockdep warning.

Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 51cddc11cd85..a7837e2d9d6b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -912,7 +912,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 
 	parent = dentry->d_parent;
 	spin_lock(&parent->d_lock);
-	spin_lock(&dentry->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 	list_del_init(&dentry->d_u.d_child);
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
-- 
cgit v1.3-8-gc7d7


From c072a388d59a1d48e36864d0e66f42d71745be1c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 7 Jan 2011 02:33:47 -0800
Subject: rcu: demote SRCU_SYNCHRONIZE_DELAY from kernel-parameter status

Because the adaptive synchronize_srcu_expedited() approach has
worked very well in testing, remove the kernel parameter and
replace it by a C-preprocessor macro.  If someone finds problems
with this approach, a more complex and aggressively adaptive
approach might be required.

Longer term, SRCU will be merged with the other RCU implementations,
at which point synchronize_srcu_expedited() will be event driven,
just as synchronize_sched_expedited() currently is.  At that point,
there will be no need for this adaptive approach.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 init/Kconfig  | 15 ---------------
 kernel/srcu.c | 15 +++++++++++++--
 2 files changed, 13 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index 526ec1c7456a..e11bc793a91d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -497,21 +497,6 @@ config RCU_BOOST_DELAY
 
 	  Accept the default if unsure.
 
-config SRCU_SYNCHRONIZE_DELAY
-	int "Microseconds to delay before waiting for readers"
-	range 0 20
-	default 10
-	help
-	  This option controls how long SRCU delays before entering its
-	  loop waiting on SRCU readers.  The purpose of this loop is
-	  to avoid the unconditional context-switch penalty that would
-	  otherwise be incurred if there was an active SRCU reader,
-	  in a manner similar to adaptive locking schemes.  This should
-	  be set to be a bit longer than the common-case SRCU read-side
-	  critical-section overhead.
-
-	  Accept the default if unsure.
-
 endmenu # "RCU Subsystem"
 
 config IKCONFIG
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 98d8c1e80edb..73ce23feaea9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -155,6 +155,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
 
+/*
+ * We use an adaptive strategy for synchronize_srcu() and especially for
+ * synchronize_srcu_expedited().  We spin for a fixed time period
+ * (defined below) to allow SRCU readers to exit their read-side critical
+ * sections.  If there are still some readers after 10 microseconds,
+ * we repeatedly block for 1-millisecond time periods.  This approach
+ * has done well in testing, so there is no need for a config parameter.
+ */
+#define SYNCHRONIZE_SRCU_READER_DELAY 10
+
 /*
  * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
  */
@@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
 	 * will have finished executing.  We initially give readers
 	 * an arbitrarily chosen 10 microseconds to get out of their
 	 * SRCU read-side critical sections, then loop waiting 1/HZ
-	 * seconds per iteration.
+	 * seconds per iteration.  The 10-microsecond value has done
+	 * very well in testing.
 	 */
 
 	if (srcu_readers_active_idx(sp, idx))
-		udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY);
+		udelay(SYNCHRONIZE_SRCU_READER_DELAY);
 	while (srcu_readers_active_idx(sp, idx))
 		schedule_timeout_interruptible(1);
 
-- 
cgit v1.3-8-gc7d7


From b24efdfdf679cf9b05947c531971905fc727dd40 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Wed, 12 Jan 2011 14:18:11 -0800
Subject: rcu: avoid pointless blocked-task warnings

If the RCU callback-processing kthread has nothing to do, it parks in
a wait_event().  If RCU remains idle for more than two minutes, the
kernel complains about this.  This commit changes from wait_event()
to wait_event_interruptible() to prevent the kernel from complaining
just because RCU is idle.

Reported-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Thomas Weber <weber@corscience.de>
Tested-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 kernel/rcutiny.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 034493724749..0c343b9a46d5 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -189,7 +189,8 @@ static int rcu_kthread(void *arg)
 	unsigned long flags;
 
 	for (;;) {
-		wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0);
+		wait_event_interruptible(rcu_kthread_wq,
+					 have_rcu_kthread_work != 0);
 		morework = rcu_boost();
 		local_irq_save(flags);
 		work = have_rcu_kthread_work;
-- 
cgit v1.3-8-gc7d7


From c72a04e34735ec3f19f4788b7f95017310b5e1eb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Fri, 14 Jan 2011 05:31:45 +0000
Subject: cgroup_fs: fix cgroup use of simple_lookup()

cgroup can't use simple_lookup(), since that'd override its desired ->d_op.

Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5c5f4cc2e99a..ffb7bbad0638 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
  */
 
 static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
@@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	iput(inode);
 }
 
+static int cgroup_delete(const struct dentry *d)
+{
+	return 1;
+}
+
 static void remove_dir(struct dentry *d)
 {
 	struct dentry *parent = dget(d->d_parent);
@@ -1451,6 +1457,7 @@ static int cgroup_get_rootdir(struct super_block *sb)
 {
 	static const struct dentry_operations cgroup_dops = {
 		.d_iput = cgroup_diput,
+		.d_delete = cgroup_delete,
 	};
 
 	struct inode *inode =
@@ -2195,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = {
 };
 
 static const struct inode_operations cgroup_dir_inode_operations = {
-	.lookup = simple_lookup,
+	.lookup = cgroup_lookup,
 	.mkdir = cgroup_mkdir,
 	.rmdir = cgroup_rmdir,
 	.rename = cgroup_rename,
 };
 
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+
 /*
  * Check if a file is a control file
  */
-- 
cgit v1.3-8-gc7d7


From 7f85803a26f304e698c673838aab06cc6d4d6e59 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 14 Jan 2011 17:49:02 +0800
Subject: tracing: Remove syscall_exit_fields

There is no need for syscall_exit_fields as the syscall
exit event class can already host the fields in its structure,
like most other trace events do by default. Use that
default behavior instead.

Following this scheme, we don't need anymore to override the
get_fields() callback of the syscall exit event class either.

Hence both syscall_exit_fields and syscall_get_exit_fields() can
be removed.

Also changed some indentation to keep the following under 80
characters:

".fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),"

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4D301C0E.8090408@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_syscalls.c | 33 ++++++++++++---------------------
 1 file changed, 12 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index bac752f0cfb5..b706529b4fc7 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event,
 static int syscall_enter_define_fields(struct ftrace_event_call *call);
 static int syscall_exit_define_fields(struct ftrace_event_call *call);
 
-/* All syscall exit events have the same fields */
-static LIST_HEAD(syscall_exit_fields);
-
 static struct list_head *
 syscall_get_enter_fields(struct ftrace_event_call *call)
 {
@@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call)
 	return &entry->enter_fields;
 }
 
-static struct list_head *
-syscall_get_exit_fields(struct ftrace_event_call *call)
-{
-	return &syscall_exit_fields;
-}
-
 struct trace_event_functions enter_syscall_print_funcs = {
-	.trace                  = print_syscall_enter,
+	.trace		= print_syscall_enter,
 };
 
 struct trace_event_functions exit_syscall_print_funcs = {
-	.trace                  = print_syscall_exit,
+	.trace		= print_syscall_exit,
 };
 
 struct ftrace_event_class event_class_syscall_enter = {
-	.system			= "syscalls",
-	.reg			= syscall_enter_register,
-	.define_fields		= syscall_enter_define_fields,
-	.get_fields		= syscall_get_enter_fields,
-	.raw_init		= init_syscall_trace,
+	.system		= "syscalls",
+	.reg		= syscall_enter_register,
+	.define_fields	= syscall_enter_define_fields,
+	.get_fields	= syscall_get_enter_fields,
+	.raw_init	= init_syscall_trace,
 };
 
 struct ftrace_event_class event_class_syscall_exit = {
-	.system			= "syscalls",
-	.reg			= syscall_exit_register,
-	.define_fields		= syscall_exit_define_fields,
-	.get_fields		= syscall_get_exit_fields,
-	.raw_init		= init_syscall_trace,
+	.system		= "syscalls",
+	.reg		= syscall_exit_register,
+	.define_fields	= syscall_exit_define_fields,
+	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
+	.raw_init	= init_syscall_trace,
 };
 
 extern unsigned long __start_syscalls_metadata[];
-- 
cgit v1.3-8-gc7d7


From 977dda7c9b540f48b228174346d8b31542c1e99f Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 14 Jan 2011 17:57:50 -0800
Subject: sched: Update effective_load() to use global share weights

Previously effective_load would approximate the global load weight present on
a group taking advantage of:

entity_weight = tg->shares ( lw / global_lw ), where entity_weight was provided
by tg_shares_up.

This worked (approximately) for an 'empty' (at tg level) cpu since we would
place boost load representative of what a newly woken task would receive.

However, now that load is instantaneously updated this assumption is no longer
true and the load calculation is rather incorrect in this case.

Fix this (and improve the general case) by re-writing effective_load to take
advantage of the new shares distribution code.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110115015817.069769529@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c62ebae65cf0..414145cf5344 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1362,27 +1362,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 		return wl;
 
 	for_each_sched_entity(se) {
-		long S, rw, s, a, b;
+		long lw, w;
 
-		S = se->my_q->tg->shares;
-		s = se->load.weight;
-		rw = se->my_q->load.weight;
+		tg = se->my_q->tg;
+		w = se->my_q->load.weight;
 
-		a = S*(rw + wl);
-		b = S*rw + s*wg;
+		/* use this cpu's instantaneous contribution */
+		lw = atomic_read(&tg->load_weight);
+		lw -= se->my_q->load_contribution;
+		lw += w + wg;
 
-		wl = s*(a-b);
+		wl += w;
 
-		if (likely(b))
-			wl /= b;
+		if (lw > 0 && wl < lw)
+			wl = (wl * tg->shares) / lw;
+		else
+			wl = tg->shares;
 
-		/*
-		 * Assume the group is already running and will
-		 * thus already be accounted for in the weight.
-		 *
-		 * That is, moving shares between CPUs, does not
-		 * alter the group weight.
-		 */
+		/* zero point is MIN_SHARES */
+		if (wl < MIN_SHARES)
+			wl = MIN_SHARES;
+		wl -= se->load.weight;
 		wg = 0;
 	}
 
-- 
cgit v1.3-8-gc7d7


From efe25c2c7b3a5d17b0c70987a758d8fe7af8e3d1 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Tue, 11 Jan 2011 15:41:54 +0530
Subject: sched: Reinstate group names in /proc/sched_debug

Displaying of group names in /proc/sched_debug was dropped in autogroup
patches. Add group names while displaying cfs_rq and tasks information.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110111101153.GE4772@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 1dfae3d014b5..4d36f3726da7 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -16,6 +16,8 @@
 #include <linux/kallsyms.h>
 #include <linux/utsname.h>
 
+static DEFINE_SPINLOCK(sched_debug_lock);
+
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -86,6 +88,23 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 }
 #endif
 
+#ifdef CONFIG_CGROUP_SCHED
+static char group_path[PATH_MAX];
+
+static char *task_group_path(struct task_group *tg)
+{
+	/*
+	 * May be NULL if the underlying cgroup isn't fully-created yet
+	 */
+	if (!tg->css.cgroup) {
+		group_path[0] = '\0';
+		return group_path;
+	}
+	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+	return group_path;
+}
+#endif
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -108,6 +127,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 	SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
 		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
+#ifdef CONFIG_CGROUP_SCHED
+	SEQ_printf(m, " %s", task_group_path(task_group(p)));
+#endif
 
 	SEQ_printf(m, "\n");
 }
@@ -144,7 +166,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	struct sched_entity *last;
 	unsigned long flags;
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+#else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
+#endif
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
@@ -191,7 +217,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
+#ifdef CONFIG_RT_GROUP_SCHED
+	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+#else
 	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
+#endif
 
 #define P(x) \
 	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -212,6 +242,7 @@ extern __read_mostly int sched_clock_running;
 static void print_cpu(struct seq_file *m, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
 
 #ifdef CONFIG_X86
 	{
@@ -266,10 +297,14 @@ static void print_cpu(struct seq_file *m, int cpu)
 
 #undef P
 #endif
+	spin_lock_irqsave(&sched_debug_lock, flags);
 	print_cfs_stats(m, cpu);
 	print_rt_stats(m, cpu);
 
+	rcu_read_lock();
 	print_rq(m, rq, cpu);
+	rcu_read_unlock();
+	spin_unlock_irqrestore(&sched_debug_lock, flags);
 }
 
 static const char *sched_tunable_scaling_names[] = {
-- 
cgit v1.3-8-gc7d7


From 8ecedd7a06d27a31dbb36fab88e2ba6e6edd43ca Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Tue, 11 Jan 2011 15:42:57 +0530
Subject: sched: Display autogroup names in /proc/sched_debug

Add autogroup name to cfs_rq and tasks information to /proc/sched_debug.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110111101257.GF4772@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_autogroup.c | 5 +++++
 kernel/sched_debug.c     | 3 +++
 2 files changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 32a723b8f84c..938d52f80a2d 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -231,6 +231,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
 #ifdef CONFIG_SCHED_DEBUG
 static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
+	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+	if (!enabled || !tg->autogroup)
+		return 0;
+
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
 #endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4d36f3726da7..e4d37259d490 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -93,6 +93,9 @@ static char group_path[PATH_MAX];
 
 static char *task_group_path(struct task_group *tg)
 {
+	if (autogroup_path(tg, group_path, PATH_MAX))
+		return group_path;
+
 	/*
 	 * May be NULL if the underlying cgroup isn't fully-created yet
 	 */
-- 
cgit v1.3-8-gc7d7


From f44937718ce3b8360f72f6c68c9481712517a867 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 13 Jan 2011 04:54:50 +0100
Subject: sched, autogroup: Fix CONFIG_RT_GROUP_SCHED sched_setscheduler()
 failure

If CONFIG_RT_GROUP_SCHED is set, __sched_setscheduler() fails due to autogroup
not allocating rt_runtime.  Free unused/unusable rt_se and rt_rq, redirect RT
tasks to the root task group, and tell __sched_setscheduler() that it's ok.

Reported-and-tested-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294890890.8089.39.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c           |  3 ++-
 kernel/sched_autogroup.c | 27 +++++++++++++++++++++++++++
 kernel/sched_autogroup.h |  4 ++++
 3 files changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index a0eb0941fa84..6cbff6bd1a60 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4871,7 +4871,8 @@ recheck:
 		 * assigned.
 		 */
 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
-				task_group(p)->rt_bandwidth.rt_runtime == 0) {
+				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+				!task_group_is_autogroup(task_group(p))) {
 			__task_rq_unlock(rq);
 			raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 			return -EPERM;
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 938d52f80a2d..9fb656283157 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref)
 {
 	struct autogroup *ag = container_of(kref, struct autogroup, kref);
 
+#ifdef CONFIG_RT_GROUP_SCHED
+	/* We've redirected RT tasks to the root task group... */
+	ag->tg->rt_se = NULL;
+	ag->tg->rt_rq = NULL;
+#endif
 	sched_destroy_group(ag->tg);
 }
 
@@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
 	return ag;
 }
 
+#ifdef CONFIG_RT_GROUP_SCHED
+static void free_rt_sched_group(struct task_group *tg);
+#endif
+
 static inline struct autogroup *autogroup_create(void)
 {
 	struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void)
 	init_rwsem(&ag->lock);
 	ag->id = atomic_inc_return(&autogroup_seq_nr);
 	ag->tg = tg;
+#ifdef CONFIG_RT_GROUP_SCHED
+	/*
+	 * Autogroup RT tasks are redirected to the root task group
+	 * so we don't have to move tasks around upon policy change,
+	 * or flail around trying to allocate bandwidth on the fly.
+	 * A bandwidth exception in __sched_setscheduler() allows
+	 * the policy change to proceed.  Thereafter, task_group()
+	 * returns &root_task_group, so zero bandwidth is required.
+	 */
+	free_rt_sched_group(tg);
+	tg->rt_se = root_task_group.rt_se;
+	tg->rt_rq = root_task_group.rt_rq;
+#endif
 	tg->autogroup = ag;
 
 	return ag;
@@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 	return true;
 }
 
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+	return tg != &root_task_group && tg->autogroup;
+}
+
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
 {
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 5358e241cb20..7b859ffe5dad 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg);
 
 static inline void autogroup_init(struct task_struct *init_task) {  }
 static inline void autogroup_free(struct task_group *tg) { }
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+	return 0;
+}
 
 static inline struct task_group *
 autogroup_task_group(struct task_struct *p, struct task_group *tg)
-- 
cgit v1.3-8-gc7d7


From fce2097983d914ea8c2338efc6f6e9bb737f6ae4 Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Fri, 14 Jan 2011 15:57:39 +0800
Subject: sched: Replace rq->bkl_count with rq->rq_sched_info.bkl_count

Now rq->rq_sched_info.bkl_count is not used for rq, scroll
rq->bkl_count into it. Thus we can save some space for rq.

Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294991859-13246-1-git-send-email-yong.zhang0@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 5 +----
 kernel/sched_debug.c | 4 +++-
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6cbff6bd1a60..0a169a85eb3e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -553,9 +553,6 @@ struct rq {
 	/* try_to_wake_up() stats */
 	unsigned int ttwu_count;
 	unsigned int ttwu_local;
-
-	/* BKL stats */
-	unsigned int bkl_count;
 #endif
 };
 
@@ -3887,7 +3884,7 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 #ifdef CONFIG_SCHEDSTATS
 	if (unlikely(prev->lock_depth >= 0)) {
-		schedstat_inc(this_rq(), bkl_count);
+		schedstat_inc(this_rq(), rq_sched_info.bkl_count);
 		schedstat_inc(prev, sched_info.bkl_count);
 	}
 #endif
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index e4d37259d490..eb6cb8edd075 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -296,9 +296,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	P(ttwu_count);
 	P(ttwu_local);
 
-	P(bkl_count);
+	SEQ_printf(m, "  .%-30s: %d\n", "bkl_count",
+				rq->rq_sched_info.bkl_count);
 
 #undef P
+#undef P64
 #endif
 	spin_lock_irqsave(&sched_debug_lock, flags);
 	print_cfs_stats(m, cpu);
-- 
cgit v1.3-8-gc7d7


From d7d8294415f0ce4254827d4a2a5ee88b00be52a8 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 5 Jan 2011 05:41:17 +0100
Subject: sched: Fix signed unsigned comparison in check_preempt_tick()

Signed unsigned comparison may lead to superfluous resched if leftmost
is right of the current task, wasting a few cycles, and inadvertently
_lengthening_ the current task's slice.

Reported-by: Venkatesh Pallipadi <venki@google.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1294202477.9384.5.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 414145cf5344..77e9166d7bbf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		struct sched_entity *se = __pick_next_entity(cfs_rq);
 		s64 delta = curr->vruntime - se->vruntime;
 
+		if (delta < 0)
+			return;
+
 		if (delta > ideal_runtime)
 			resched_task(rq_of(cfs_rq)->curr);
 	}
-- 
cgit v1.3-8-gc7d7


From c5ed5145591774bd9a2960ba4ca45a02fc70aad1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 17 Jan 2011 13:45:37 +0100
Subject: perf: Fix contexted inheritance

Linus reported that the RCU lockdep annotation bits triggered for this
rcu_dereference() because we're not holding rcu_read_lock().

Going over the code I cannot convince myself its correct:

 - holding a ref on the parent_ctx, doesn't avoid it being uncloned
   concurrently (as the comment says), so we can race with a free.

 - holding parent_ctx->mutex doesn't avoid the above free from taking
   place either, it would at best avoid parent_ctx from being freed.

I.e. the warning is correct. To fix the bug, serialize against the
unclone_ctx() call by extending the reach of the parent_ctx->lock.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b782b7a79f00..76be4c7bf08e 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -6494,7 +6494,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 
 	raw_spin_lock_irqsave(&parent_ctx->lock, flags);
 	parent_ctx->rotate_disable = 0;
-	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 
 	child_ctx = child->perf_event_ctxp[ctxn];
 
@@ -6502,12 +6501,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 		/*
 		 * Mark the child context as a clone of the parent
 		 * context, or of whatever the parent is a clone of.
-		 * Note that if the parent is a clone, it could get
-		 * uncloned at any point, but that doesn't matter
-		 * because the list of events and the generation
-		 * count can't have changed since we took the mutex.
+		 *
+		 * Note that if the parent is a clone, the holding of
+		 * parent_ctx->lock avoids it from being uncloned.
 		 */
-		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+		cloned_ctx = parent_ctx->parent_ctx;
 		if (cloned_ctx) {
 			child_ctx->parent_ctx = cloned_ctx;
 			child_ctx->parent_gen = parent_ctx->parent_gen;
@@ -6518,6 +6516,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 		get_ctx(child_ctx->parent_ctx);
 	}
 
+	raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
 	mutex_unlock(&parent_ctx->mutex);
 
 	perf_unpin_context(parent_ctx);
-- 
cgit v1.3-8-gc7d7


From 22a4ec729017ba613337a28f306f94ba5023fe2e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 18 Jan 2011 17:10:08 +0100
Subject: perf: Find_get_context: fix the per-cpu-counter check

If task == NULL, find_get_context() should always check that cpu
is correct.

Afaics, the bug was introduced by 38a81da2 "perf events: Clean
up pid passing", but even before that commit "&& cpu != -1" was
not exactly right, -ESRCH from find_task_by_vpid() is not
accurate.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: gregkh@suse.de
Cc: stable@kernel.org
LKML-Reference: <20110118161008.GB693@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 76be4c7bf08e..a962b1962eee 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2228,7 +2228,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 	unsigned long flags;
 	int ctxn, err;
 
-	if (!task && cpu != -1) {
+	if (!task) {
 		/* Must be root to operate on a CPU event: */
 		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
-- 
cgit v1.3-8-gc7d7


From 66832eb4baaaa9abe4c993ddf9113a79e39b9915 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Tue, 18 Jan 2011 17:10:32 +0100
Subject: perf: Validate cpu early in perf_event_alloc()

Starting from perf_event_alloc()->perf_init_event(), the kernel
assumes that event->cpu is either -1 or the valid CPU number.

Change perf_event_alloc() to validate this argument early. This
also means we can remove the similar check in
find_get_context().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: gregkh@suse.de
Cc: stable@kernel.org
LKML-Reference: <20110118161032.GC693@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index a962b1962eee..67d9bd70b746 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2233,9 +2233,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
 		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
-		if (cpu < 0 || cpu >= nr_cpumask_bits)
-			return ERR_PTR(-EINVAL);
-
 		/*
 		 * We could be clever and allow to attach a event to an
 		 * offline CPU and activate it when the CPU comes up, but
@@ -5541,6 +5538,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	struct hw_perf_event *hwc;
 	long err;
 
+	if ((unsigned)cpu >= nr_cpu_ids) {
+		if (!task || cpu != -1)
+			return ERR_PTR(-EINVAL);
+	}
+
 	event = kzalloc(sizeof(*event), GFP_KERNEL);
 	if (!event)
 		return ERR_PTR(-ENOMEM);
@@ -5589,7 +5591,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	if (!overflow_handler && parent_event)
 		overflow_handler = parent_event->overflow_handler;
-	
+
 	event->overflow_handler	= overflow_handler;
 
 	if (attr->disabled)
-- 
cgit v1.3-8-gc7d7


From 068c5cc5ac7414a8e9eb7856b4bf3cc4d4744267 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 19 Jan 2011 12:26:11 +0100
Subject: sched, cgroup: Use exit hook to avoid use-after-free crash

By not notifying the controller of the on-exit move back to
init_css_set, we fail to move the task out of the previous
cgroup's cfs_rq. This leads to an opportunity for a
cgroup-destroy to come in and free the cgroup (there are no
active tasks left in it after all) to which the not-quite dead
task is still enqueued.

Reported-by: Miklos Vajna <vmiklos@frugalware.org>
Fixed-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1293206353.29444.205.camel@laptop>
---
 kernel/sched.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 0a169a85eb3e..fa5272a0932c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -606,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
+	if (p->flags & PF_EXITING)
+		return &root_task_group;
+
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
@@ -8880,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	}
 }
 
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't ran yet, this avoids
+	 * trying to poke a half freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	sched_move_task(task);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
@@ -8952,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.destroy	= cpu_cgroup_destroy,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
+	.exit		= cpu_cgroup_exit,
 	.populate	= cpu_cgroup_populate,
 	.subsys_id	= cpu_cgroup_subsys_id,
 	.early_init	= 1,
-- 
cgit v1.3-8-gc7d7


From 490da40d82b31c0562d3f5edb37810f492ca1c34 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 19 Jan 2011 10:51:44 +0800
Subject: blktrace: Don't output messages if NOTIFY isn't set.

Now if we enable blktrace, cfq has too many messages output to the
trace buffer. It is fine if we don't specify any action mask.
But if I do like this:
blktrace /dev/sdb -a issue -a complete -o - | blkparse -i -
I only want to see 'D' and 'C', while with the following command
dd if=/mnt/ocfs2/test of=/dev/null bs=4k count=1 iflag=direct

I will get(with a 2.6.37 vanilla kernel):
  8,16   0        0     0.000000000     0  m   N cfq3805 alloced
  8,16   0        0     0.000004126     0  m   N cfq3805 insert_request
  8,16   0        0     0.000004884     0  m   N cfq3805 add_to_rr
  8,16   0        0     0.000008417     0  m   N cfq workload slice:300
  8,16   0        0     0.000009557     0  m   N cfq3805 set_active wl_prio:0 wl_type:2
  8,16   0        0     0.000010640     0  m   N cfq3805 fifo=          (null)
  8,16   0        0     0.000011193     0  m   N cfq3805 dispatch_insert
  8,16   0        0     0.000012221     0  m   N cfq3805 dispatched a request
  8,16   0        0     0.000012802     0  m   N cfq3805 activate rq, drv=1
  8,16   0        1     0.000013181  3805  D   R 114759 + 8 [dd]
  8,16   0        2     0.000164244     0  C   R 114759 + 8 [0]
  8,16   0        0     0.000167997     0  m   N cfq3805 complete rqnoidle 0
  8,16   0        0     0.000168782     0  m   N cfq3805 set_slice=100
  8,16   0        0     0.000169874     0  m   N cfq3805 arm_idle: 8 group_idle: 0
  8,16   0        0     0.000170189     0  m   N cfq schedule dispatch
  8,16   0        0     0.000397938     0  m   N cfq3805 slice expired t=0
  8,16   0        0     0.000399763     0  m   N cfq3805 sl_used=1 disp=1 charge=1 iops=0 sect=8
  8,16   0        0     0.000400227     0  m   N cfq3805 del_from_rr
  8,16   0        0     0.000400882     0  m   N cfq3805 put_queue

See, there are 19 lines while I only need 2. I don't think it is
appropriate for a user.

So this patch will disable any messages if the BLK_TC_NOTIFY isn't set.
Now the output for the same command will look like:
  8,16   0        1     0.000000000  4908  D   R 114759 + 8 [dd]
  8,16   0        2     0.000146827     0  C   R 114759 + 8 [0]

Yes, it is what I want to see.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 kernel/trace/blktrace.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 153562d0b93c..d95721f33702 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 		     !blk_tracer_enabled))
 		return;
 
+	/*
+	 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
+	 * message to the trace.
+	 */
+	if (!(bt->act_mask & BLK_TC_NOTIFY))
+		return;
+
 	local_irq_save(flags);
 	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
 	va_start(args, fmt);
-- 
cgit v1.3-8-gc7d7


From dbe08d82ce3967ccdf459f7951d02589cf967300 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 19 Jan 2011 19:22:07 +0100
Subject: perf: Fix find_get_context() vs perf_event_exit_task() race

find_get_context() must not install the new perf_event_context
if the task has already passed perf_event_exit_task().

If nothing else, this means the memory leak. Initially
ctx->refcount == 2, it is supposed that
perf_event_exit_task_context() should participate and do the
necessary put_ctx().

find_lively_task_by_vpid() checks PF_EXITING but this buys
nothing, by the time we call find_get_context() this task can be
already dead. To the point, cmpxchg() can succeed when the task
has already done the last schedule().

Change find_get_context() to populate task->perf_event_ctxp[]
under task->perf_event_mutex, this way we can trust PF_EXITING
because perf_event_exit_task() takes the same mutex.

Also, change perf_event_exit_task_context() to use
rcu_dereference(). Probably this is not strictly needed, but
with or without this change find_get_context() can race with
setup_new_exec()->perf_event_exit_task(), rcu_dereference()
looks better.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
LKML-Reference: <20110119182207.GB12183@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 84522c796987..4ec55ef5810c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2201,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid)
 	if (!task)
 		return ERR_PTR(-ESRCH);
 
-	/*
-	 * Can't attach events to a dying task.
-	 */
-	err = -ESRCH;
-	if (task->flags & PF_EXITING)
-		goto errout;
-
 	/* Reuse ptrace permission checks for now. */
 	err = -EACCES;
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
@@ -2268,14 +2261,27 @@ retry:
 
 		get_ctx(ctx);
 
-		if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
-			/*
-			 * We raced with some other task; use
-			 * the context they set.
-			 */
+		err = 0;
+		mutex_lock(&task->perf_event_mutex);
+		/*
+		 * If it has already passed perf_event_exit_task().
+		 * we must see PF_EXITING, it takes this mutex too.
+		 */
+		if (task->flags & PF_EXITING)
+			err = -ESRCH;
+		else if (task->perf_event_ctxp[ctxn])
+			err = -EAGAIN;
+		else
+			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+		mutex_unlock(&task->perf_event_mutex);
+
+		if (unlikely(err)) {
 			put_task_struct(task);
 			kfree(ctx);
-			goto retry;
+
+			if (err == -EAGAIN)
+				goto retry;
+			goto errout;
 		}
 	}
 
@@ -6127,7 +6133,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 * scheduled, so we are now safe from rescheduling changing
 	 * our context.
 	 */
-	child_ctx = child->perf_event_ctxp[ctxn];
+	child_ctx = rcu_dereference(child->perf_event_ctxp[ctxn]);
 	task_ctx_sched_out(child_ctx, EVENT_ALL);
 
 	/*
-- 
cgit v1.3-8-gc7d7


From 8550d7cb6ed6c89add49c3b6ad4c753ab8a3d7f9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 19 Jan 2011 19:22:28 +0100
Subject: perf: Fix perf_event_init_task()/perf_event_free_task() interaction

perf_event_init_task() should clear child->perf_event_ctxp[]
before anything else. Otherwise, if
perf_event_init_context(perf_hw_context) fails,
perf_event_free_task() can free perf_event_ctxp[perf_sw_context]
copied from parent->perf_event_ctxp[] by dup_task_struct().

Also move the initialization of perf_event_mutex and
perf_event_list from perf_event_init_context() to
perf_event_init_context().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Prasad <prasad@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
LKML-Reference: <20110119182228.GC12183@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 4ec55ef5810c..244ca3acb0ee 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -6446,11 +6446,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
 	unsigned long flags;
 	int ret = 0;
 
-	child->perf_event_ctxp[ctxn] = NULL;
-
-	mutex_init(&child->perf_event_mutex);
-	INIT_LIST_HEAD(&child->perf_event_list);
-
 	if (likely(!parent->perf_event_ctxp[ctxn]))
 		return 0;
 
@@ -6539,6 +6534,10 @@ int perf_event_init_task(struct task_struct *child)
 {
 	int ctxn, ret;
 
+	memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
+	mutex_init(&child->perf_event_mutex);
+	INIT_LIST_HEAD(&child->perf_event_list);
+
 	for_each_task_context_nr(ctxn) {
 		ret = perf_event_init_context(child, ctxn);
 		if (ret)
-- 
cgit v1.3-8-gc7d7


From 2d0640b47da74cff7c11642c798d40de861ed524 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Tue, 18 Jan 2011 22:46:34 -0800
Subject: hrtimers: Notify hrtimer users of switches to NOHZ mode

When NOHZ=y and high res timers are disabled (via cmdline or
Kconfig) tick_nohz_switch_to_nohz() will notify the user about
switching into NOHZ mode. Nothing is printed for the case where
HIGH_RES_TIMERS=y. Fix this for the HIGH_RES_TIMERS=y case by
duplicating the printk from the low res NOHZ path in the high
res NOHZ path.

This confused me since I was thinking 'dmesg | grep -i NOHZ' would
tell me if NOHZ was enabled, but if I have hrtimers there is
nothing.

Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1295419594-13085-1-git-send-email-sboyd@codeaurora.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-sched.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e216e01bbd1..c55ea2433471 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void)
 	}
 	local_irq_enable();
 
-	printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n",
-	       smp_processor_id());
+	printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
 }
 
 /*
@@ -795,8 +794,10 @@ void tick_setup_sched_timer(void)
 	}
 
 #ifdef CONFIG_NO_HZ
-	if (tick_nohz_enabled)
+	if (tick_nohz_enabled) {
 		ts->nohz_mode = NOHZ_MODE_HIGHRES;
+		printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id());
+	}
 #endif
 }
 #endif /* HIGH_RES_TIMERS */
-- 
cgit v1.3-8-gc7d7


From 2ce802f62ba32a7d95748ac92bf351f76affb6ff Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 20 Jan 2011 12:06:35 +0100
Subject: lockdep: Move early boot local IRQ enable/disable status to
 init/main.c

During early boot, local IRQ is disabled until IRQ subsystem is
properly initialized.  During this time, no one should enable
local IRQ and some operations which usually are not allowed with
IRQ disabled, e.g. operations which might sleep or require
communications with other processors, are allowed.

lockdep tracked this with early_boot_irqs_off/on() callbacks.
As other subsystems need this information too, move it to
init/main.c and make it generally available.  While at it,
toggle the boolean to early_boot_irqs_disabled instead of
enabled so that it can be initialized with %false and %true
indicates the exceptional condition.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110120110635.GB6036@htj.dyndns.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/xen/enlighten.c     |  2 +-
 include/linux/kernel.h       |  2 ++
 include/linux/lockdep.h      |  8 --------
 init/main.c                  | 13 +++++++++++--
 kernel/lockdep.c             | 18 +-----------------
 kernel/trace/trace_irqsoff.c |  8 --------
 6 files changed, 15 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7e8d3bc80af6..50542efe45fb 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1194,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void)
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
 
 	local_irq_disable();
-	early_boot_irqs_off();
+	early_boot_irqs_disabled = true;
 
 	memblock_init();
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 5a9d9059520b..d07d8057e440 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -243,6 +243,8 @@ extern int test_taint(unsigned flag);
 extern unsigned long get_taint(void);
 extern int root_mountflags;
 
+extern bool early_boot_irqs_disabled;
+
 /* Values used for system_state */
 extern enum system_states {
 	SYSTEM_BOOTING,
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 71c09b26c759..f638fd78d106 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -436,16 +436,8 @@ do {								\
 #endif /* CONFIG_LOCKDEP */
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-extern void early_boot_irqs_off(void);
-extern void early_boot_irqs_on(void);
 extern void print_irqtrace_events(struct task_struct *curr);
 #else
-static inline void early_boot_irqs_off(void)
-{
-}
-static inline void early_boot_irqs_on(void)
-{
-}
 static inline void print_irqtrace_events(struct task_struct *curr)
 {
 }
diff --git a/init/main.c b/init/main.c
index 00799c1d4628..33c37c379e96 100644
--- a/init/main.c
+++ b/init/main.c
@@ -96,6 +96,15 @@ static inline void mark_rodata_ro(void) { }
 extern void tc_init(void);
 #endif
 
+/*
+ * Debug helper: via this flag we know that we are in 'early bootup code'
+ * where only the boot processor is running with IRQ disabled.  This means
+ * two things - IRQ must not be enabled before the flag is cleared and some
+ * operations which are not allowed with IRQ disabled are allowed while the
+ * flag is set.
+ */
+bool early_boot_irqs_disabled __read_mostly;
+
 enum system_states system_state __read_mostly;
 EXPORT_SYMBOL(system_state);
 
@@ -554,7 +563,7 @@ asmlinkage void __init start_kernel(void)
 	cgroup_init_early();
 
 	local_irq_disable();
-	early_boot_irqs_off();
+	early_boot_irqs_disabled = true;
 
 /*
  * Interrupts are still disabled. Do necessary setups, then
@@ -621,7 +630,7 @@ asmlinkage void __init start_kernel(void)
 	if (!irqs_disabled())
 		printk(KERN_CRIT "start_kernel(): bug: interrupts were "
 				 "enabled early\n");
-	early_boot_irqs_on();
+	early_boot_irqs_disabled = false;
 	local_irq_enable();
 
 	/* Interrupts are enabled now so all GFP allocations are safe. */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 42ba65dff7d9..0d2058da80f5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2291,22 +2291,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
 	return 1;
 }
 
-/*
- * Debugging helper: via this flag we know that we are in
- * 'early bootup code', and will warn about any invalid irqs-on event:
- */
-static int early_boot_irqs_enabled;
-
-void early_boot_irqs_off(void)
-{
-	early_boot_irqs_enabled = 0;
-}
-
-void early_boot_irqs_on(void)
-{
-	early_boot_irqs_enabled = 1;
-}
-
 /*
  * Hardirqs will be enabled:
  */
@@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
-	if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
+	if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
 		return;
 
 	if (unlikely(curr->hardirqs_enabled)) {
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5cf8c602b880..92b6e1e12d98 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
  * Stubs:
  */
 
-void early_boot_irqs_off(void)
-{
-}
-
-void early_boot_irqs_on(void)
-{
-}
-
 void trace_softirqs_on(unsigned long ip)
 {
 }
-- 
cgit v1.3-8-gc7d7


From bd924e8cbd4b73ffb7d707a774c04f7e2cae88ed Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 20 Jan 2011 12:07:13 +0100
Subject: smp: Allow on_each_cpu() to be called while early_boot_irqs_disabled
 status to init/main.c

percpu may end up calling vfree() during early boot which in
turn may call on_each_cpu() for TLB flushes.  The function of
on_each_cpu() can be done safely while IRQ is disabled during
early boot but it assumed that the function is always called
with local IRQ enabled which ended up enabling local IRQ
prematurely during boot and triggering a couple of warnings.

This patch updates on_each_cpu() and smp_call_function_many()
such on_each_cpu() can be used safely while
early_boot_irqs_disabled is set.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Pekka Enberg <penberg@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110120110713.GC6036@htj.dyndns.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 4ec30e069987..4b83cd6815e2 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -430,7 +430,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	 * can't happen.
 	 */
 	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
-		     && !oops_in_progress);
+		     && !oops_in_progress && !early_boot_irqs_disabled);
 
 	/* So, what's a CPU they want? Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
@@ -533,17 +533,20 @@ void ipi_call_unlock_irq(void)
 #endif /* USE_GENERIC_SMP_HELPERS */
 
 /*
- * Call a function on all processors
+ * Call a function on all processors.  May be used during early boot while
+ * early_boot_irqs_disabled is set.  Use local_irq_save/restore() instead
+ * of local_irq_disable/enable().
  */
 int on_each_cpu(void (*func) (void *info), void *info, int wait)
 {
+	unsigned long flags;
 	int ret = 0;
 
 	preempt_disable();
 	ret = smp_call_function(func, info, wait);
-	local_irq_disable();
+	local_irq_save(flags);
 	func(info);
-	local_irq_enable();
+	local_irq_restore(flags);
 	preempt_enable();
 	return ret;
 }
-- 
cgit v1.3-8-gc7d7


From 6dc19899958e420a931274b94019e267e2396d3e Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 20 Jan 2011 14:44:33 -0800
Subject: kernel/smp.c: fix smp_call_function_many() SMP race

I noticed a failure where we hit the following WARN_ON in
generic_smp_call_function_interrupt:

                if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
                        continue;

                data->csd.func(data->csd.info);

                refs = atomic_dec_return(&data->refs);
                WARN_ON(refs < 0);      <-------------------------

We atomically tested and cleared our bit in the cpumask, and yet the
number of cpus left (ie refs) was 0.  How can this be?

It turns out commit 54fdade1c3332391948ec43530c02c4794a38172
("generic-ipi: make struct call_function_data lockless") is at fault.  It
removes locking from smp_call_function_many and in doing so creates a
rather complicated race.

The problem comes about because:

 - The smp_call_function_many interrupt handler walks call_function.queue
   without any locking.
 - We reuse a percpu data structure in smp_call_function_many.
 - We do not wait for any RCU grace period before starting the next
   smp_call_function_many.

Imagine a scenario where CPU A does two smp_call_functions back to back,
and CPU B does an smp_call_function in between.  We concentrate on how CPU
C handles the calls:

CPU A            CPU B                  CPU C              CPU D

smp_call_function
                                        smp_call_function_interrupt
                                            walks
					call_function.queue sees
					data from CPU A on list

                 smp_call_function

                                        smp_call_function_interrupt
                                            walks

                                        call_function.queue sees
                                          (stale) CPU A on list
							   smp_call_function int
							   clears last ref on A
							   list_del_rcu, unlock
smp_call_function reuses
percpu *data A
                                         data->cpumask sees and
                                         clears bit in cpumask
                                         might be using old or new fn!
                                         decrements refs below 0

set data->refs (too late!)

The important thing to note is since the interrupt handler walks a
potentially stale call_function.queue without any locking, then another
cpu can view the percpu *data structure at any time, even when the owner
is in the process of initialising it.

The following test case hits the WARN_ON 100% of the time on my PowerPC
box (having 128 threads does help :)

#include <linux/module.h>
#include <linux/init.h>

#define ITERATIONS 100

static void do_nothing_ipi(void *dummy)
{
}

static void do_ipis(struct work_struct *dummy)
{
	int i;

	for (i = 0; i < ITERATIONS; i++)
		smp_call_function(do_nothing_ipi, NULL, 1);

	printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id());
}

static struct work_struct work[NR_CPUS];

static int __init testcase_init(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		INIT_WORK(&work[cpu], do_ipis);
		schedule_work_on(cpu, &work[cpu]);
	}

	return 0;
}

static void __exit testcase_exit(void)
{
}

module_init(testcase_init)
module_exit(testcase_exit)
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Anton Blanchard");

I tried to fix it by ordering the read and the write of ->cpumask and
->refs.  In doing so I missed a critical case but Paul McKenney was able
to spot my bug thankfully :) To ensure we arent viewing previous
iterations the interrupt handler needs to read ->refs then ->cpumask then
->refs _again_.

Thanks to Milton Miller and Paul McKenney for helping to debug this issue.

[miltonm@bga.com: add WARN_ON and BUG_ON, remove extra read of refs before initial read of mask that doesn't help (also noted by Peter Zijlstra), adjust comments, hopefully clarify scenario ]
[miltonm@bga.com: remove excess tests]
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Milton Miller <miltonm@bga.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: <stable@kernel.org> [2.6.32+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/smp.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 4ec30e069987..17c6e5860231 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -195,6 +195,24 @@ void generic_smp_call_function_interrupt(void)
 	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
 
+		/*
+		 * Since we walk the list without any locks, we might
+		 * see an entry that was completed, removed from the
+		 * list and is in the process of being reused.
+		 *
+		 * We must check that the cpu is in the cpumask before
+		 * checking the refs, and both must be set before
+		 * executing the callback on this cpu.
+		 */
+
+		if (!cpumask_test_cpu(cpu, data->cpumask))
+			continue;
+
+		smp_rmb();
+
+		if (atomic_read(&data->refs) == 0)
+			continue;
+
 		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
 			continue;
 
@@ -203,6 +221,8 @@ void generic_smp_call_function_interrupt(void)
 		refs = atomic_dec_return(&data->refs);
 		WARN_ON(refs < 0);
 		if (!refs) {
+			WARN_ON(!cpumask_empty(data->cpumask));
+
 			raw_spin_lock(&call_function.lock);
 			list_del_rcu(&data->csd.list);
 			raw_spin_unlock(&call_function.lock);
@@ -454,11 +474,21 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	data = &__get_cpu_var(cfd_data);
 	csd_lock(&data->csd);
+	BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
 
 	data->csd.func = func;
 	data->csd.info = info;
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
 	cpumask_clear_cpu(this_cpu, data->cpumask);
+
+	/*
+	 * To ensure the interrupt handler gets an complete view
+	 * we order the cpumask and refs writes and order the read
+	 * of them in the interrupt handler.  In addition we may
+	 * only clear our own cpu bit from the mask.
+	 */
+	smp_wmb();
+
 	atomic_set(&data->refs, cpumask_weight(data->cpumask));
 
 	raw_spin_lock_irqsave(&call_function.lock, flags);
-- 
cgit v1.3-8-gc7d7


From 225c8e010f2d17a62aef131e24c6e7c111f36f9b Mon Sep 17 00:00:00 2001
From: Milton Miller <miltonm@bga.com>
Date: Thu, 20 Jan 2011 14:44:34 -0800
Subject: kernel/smp.c: consolidate writes in smp_call_function_interrupt()

We have to test the cpu mask in the interrupt handler before checking the
refs, otherwise we can start to follow an entry before its deleted and
find it partially initailzed for the next trip.  Presently we also clear
the cpumask bit before executing the called function, which implies
getting write access to the line.  After the function is called we then
decrement refs, and if they go to zero we then unlock the structure.

However, this implies getting write access to the call function data
before and after another the function is called.  If we can assert that no
smp_call_function execution function is allowed to enable interrupts, then
we can move both writes to after the function is called, hopfully allowing
both writes with one cache line bounce.

On a 256 thread system with a kernel compiled for 1024 threads, the time
to execute testcase in the "smp_call_function_many race" changelog was
reduced by about 30-40ms out of about 545 ms.

I decided to keep this as WARN because its now a buggy function, even
though the stack trace is of no value -- a simple printk would give us the
information needed.

Raw data:

Without patch:
  ipi_test startup took 1219366ns complete 539819014ns total 541038380ns
  ipi_test startup took 1695754ns complete 543439872ns total 545135626ns
  ipi_test startup took 7513568ns complete 539606362ns total 547119930ns
  ipi_test startup took 13304064ns complete 533898562ns total 547202626ns
  ipi_test startup took 8668192ns complete 544264074ns total 552932266ns
  ipi_test startup took 4977626ns complete 548862684ns total 553840310ns
  ipi_test startup took 2144486ns complete 541292318ns total 543436804ns
  ipi_test startup took 21245824ns complete 530280180ns total 551526004ns

With patch:
  ipi_test startup took 5961748ns complete 500859628ns total 506821376ns
  ipi_test startup took 8975996ns complete 495098924ns total 504074920ns
  ipi_test startup took 19797750ns complete 492204740ns total 512002490ns
  ipi_test startup took 14824796ns complete 487495878ns total 502320674ns
  ipi_test startup took 11514882ns complete 494439372ns total 505954254ns
  ipi_test startup took 8288084ns complete 502570774ns total 510858858ns
  ipi_test startup took 6789954ns complete 493388112ns total 500178066ns

	#include <linux/module.h>
	#include <linux/init.h>
	#include <linux/sched.h> /* sched clock */

	#define ITERATIONS 100

	static void do_nothing_ipi(void *dummy)
	{
	}

	static void do_ipis(struct work_struct *dummy)
	{
		int i;

		for (i = 0; i < ITERATIONS; i++)
			smp_call_function(do_nothing_ipi, NULL, 1);

		printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id());
	}

	static struct work_struct work[NR_CPUS];

	static int __init testcase_init(void)
	{
		int cpu;
		u64 start, started, done;

		start = local_clock();
		for_each_online_cpu(cpu) {
			INIT_WORK(&work[cpu], do_ipis);
			schedule_work_on(cpu, &work[cpu]);
		}
		started = local_clock();
		for_each_online_cpu(cpu)
			flush_work(&work[cpu]);
		done = local_clock();
		pr_info("ipi_test startup took %lldns complete %lldns total %lldns\n",
			started-start, done-started, done-start);

		return 0;
	}

	static void __exit testcase_exit(void)
	{
	}

	module_init(testcase_init)
	module_exit(testcase_exit)
	MODULE_LICENSE("GPL");
	MODULE_AUTHOR("Anton Blanchard");

Signed-off-by: Milton Miller <miltonm@bga.com>
Cc: Anton Blanchard <anton@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/smp.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 17c6e5860231..2fe66f7c617a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,6 +194,7 @@ void generic_smp_call_function_interrupt(void)
 	 */
 	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
+		void (*func) (void *info);
 
 		/*
 		 * Since we walk the list without any locks, we might
@@ -213,24 +214,32 @@ void generic_smp_call_function_interrupt(void)
 		if (atomic_read(&data->refs) == 0)
 			continue;
 
-		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
-			continue;
-
+		func = data->csd.func;			/* for later warn */
 		data->csd.func(data->csd.info);
 
+		/*
+		 * If the cpu mask is not still set then it enabled interrupts,
+		 * we took another smp interrupt, and executed the function
+		 * twice on this cpu.  In theory that copy decremented refs.
+		 */
+		if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
+			WARN(1, "%pS enabled interrupts and double executed\n",
+			     func);
+			continue;
+		}
+
 		refs = atomic_dec_return(&data->refs);
 		WARN_ON(refs < 0);
-		if (!refs) {
-			WARN_ON(!cpumask_empty(data->cpumask));
-
-			raw_spin_lock(&call_function.lock);
-			list_del_rcu(&data->csd.list);
-			raw_spin_unlock(&call_function.lock);
-		}
 
 		if (refs)
 			continue;
 
+		WARN_ON(!cpumask_empty(data->cpumask));
+
+		raw_spin_lock(&call_function.lock);
+		list_del_rcu(&data->csd.list);
+		raw_spin_unlock(&call_function.lock);
+
 		csd_unlock(&data->csd);
 	}
 
-- 
cgit v1.3-8-gc7d7


From 1c77ff22f539ceaa64ea43d6a26d867e84602cb7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 19 Jan 2011 19:41:35 +0100
Subject: genirq: Remove __do_IRQ

All architectures are finally converted. Remove the cruft.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Greg Ungerer <gerg@uclinux.org>
Cc: Michal Simek <monstr@monstr.eu>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chen Liqin <liqin.chen@sunplusct.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Jeff Dike <jdike@addtoit.com>
---
 Documentation/feature-removal-schedule.txt |   8 ---
 arch/alpha/Kconfig                         |   3 -
 arch/blackfin/Kconfig                      |   3 -
 arch/frv/Kconfig                           |   4 --
 arch/ia64/Kconfig                          |   3 -
 arch/m68knommu/Kconfig                     |   4 --
 arch/microblaze/Kconfig                    |   3 -
 arch/mips/Kconfig                          |   3 -
 arch/mn10300/Kconfig                       |   3 -
 arch/parisc/Kconfig                        |   4 --
 arch/powerpc/Kconfig                       |   4 --
 arch/score/Kconfig                         |   3 -
 arch/sparc/Kconfig                         |   4 --
 arch/tile/Kconfig                          |   3 -
 arch/um/Kconfig.um                         |   3 -
 include/linux/irqdesc.h                    |  14 ----
 kernel/irq/Kconfig                         |   3 -
 kernel/irq/handle.c                        | 111 -----------------------------
 18 files changed, 183 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 8c594c45b6a1..b959659c5df4 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -357,14 +357,6 @@ Who:	Dave Jones <davej@redhat.com>, Matthew Garrett <mjg@redhat.com>
 
 -----------------------------
 
-What:	__do_IRQ all in one fits nothing interrupt handler
-When:	2.6.32
-Why:	__do_IRQ was kept for easy migration to the type flow handlers.
-	More than two years of migration time is enough.
-Who:	Thomas Gleixner <tglx@linutronix.de>
-
------------------------------
-
 What:	fakephp and associated sysfs files in /sys/bus/pci/slots/
 When:	2011
 Why:	In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index fc95ee1bcf6f..943fe6930f77 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -68,9 +68,6 @@ config GENERIC_IOMAP
 	bool
 	default n
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 config GENERIC_HARDIRQS
 	bool
 	default y
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 0a221d48152d..a37b2be23f18 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -50,9 +50,6 @@ config GENERIC_HARDIRQS
 config GENERIC_IRQ_PROBE
 	def_bool y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 config GENERIC_GPIO
 	def_bool y
 
diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index f6bcb039cd6d..e504edeb3d84 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -33,10 +33,6 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	bool
-	default y
-
 config TIME_LOW_RES
 	bool
 	default y
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index e0f5b6d7f849..be1faf991813 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -684,9 +684,6 @@ source "lib/Kconfig"
 config GENERIC_HARDIRQS
 	def_bool y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 config GENERIC_IRQ_PROBE
 	bool
 	default y
diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig
index 704e7b92334c..7379cb0ce1af 100644
--- a/arch/m68knommu/Kconfig
+++ b/arch/m68knommu/Kconfig
@@ -52,10 +52,6 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	bool
-	default y
-
 config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 5f5018a71a3d..5a6378493797 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -52,9 +52,6 @@ config GENERIC_TIME_VSYSCALL
 config GENERIC_CLOCKEVENTS
 	def_bool y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 config GENERIC_GPIO
 	def_bool y
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 548e6cc3bc28..f5ecc0566bc2 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -793,9 +793,6 @@ config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 #
 # Select some configuration options automatically based on user selections.
 #
diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 8ed41cf2b08d..4638269152f6 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -34,9 +34,6 @@ config RWSEM_GENERIC_SPINLOCK
 config RWSEM_XCHGADD_ALGORITHM
 	bool
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 config GENERIC_CALIBRATE_DELAY
 	def_bool y
 
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 0888675c98dd..4b94ac4dc603 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -12,7 +12,6 @@ config PARISC
 	select HAVE_IRQ_WORK
 	select HAVE_PERF_EVENTS
 	select GENERIC_ATOMIC64 if !64BIT
-	select GENERIC_HARDIRQS_NO__DO_IRQ
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
 	  in many of their workstations & servers (HP9000 700 and 800 series,
@@ -79,9 +78,6 @@ config IRQ_PER_CPU
 	bool
 	default y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 # unless you want to implement ACPI on PA-RISC ... ;-)
 config PM
 	bool
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 959f38ccb9a7..e0b185dd718a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -40,10 +40,6 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	bool
-	default y
-
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool PPC64
 
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 4293fdcb5398..4f159acfbe33 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -53,9 +53,6 @@ config GENERIC_CLOCKEVENTS
 config SCHED_NO_NO_OMIT_FRAME_POINTER
 	def_bool y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 config GENERIC_SYSCALL_TABLE
 	def_bool y
 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 45d9c87d083a..989bb6415ea3 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -107,10 +107,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK
 config NEED_PER_CPU_PAGE_FIRST_CHUNK
 	def_bool y if SPARC64
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	bool
-	def_bool y if SPARC64
-
 config MMU
 	bool
 	default y
diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index 4e8b82bca9e5..c16b98c2435d 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -10,9 +10,6 @@ config GENERIC_CSUM
 config GENERIC_HARDIRQS
 	def_bool y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 config GENERIC_IRQ_PROBE
 	def_bool y
 
diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um
index f8d1d0d47fe6..90a438acbfaf 100644
--- a/arch/um/Kconfig.um
+++ b/arch/um/Kconfig.um
@@ -120,9 +120,6 @@ config SMP
 
 	  If you don't know what to do, say N.
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
 config NR_CPUS
 	int "Maximum number of CPUs (2-32)"
 	range 2 32
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 6a64c6fa81af..c1a95b7b58de 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -100,13 +100,6 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 #define get_irq_desc_data(desc)		((desc)->irq_data.handler_data)
 #define get_irq_desc_msi(desc)		((desc)->irq_data.msi_desc)
 
-/*
- * Monolithic do_IRQ implementation.
- */
-#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
-extern unsigned int __do_IRQ(unsigned int irq);
-#endif
-
 /*
  * Architectures call this to let the generic IRQ layer
  * handle an interrupt. If the descriptor is attached to an
@@ -115,14 +108,7 @@ extern unsigned int __do_IRQ(unsigned int irq);
  */
 static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
 {
-#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
 	desc->handle_irq(irq, desc);
-#else
-	if (likely(desc->handle_irq))
-		desc->handle_irq(irq, desc);
-	else
-		__do_IRQ(irq);
-#endif
 }
 
 static inline void generic_handle_irq(unsigned int irq)
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 31d766bf5d2e..8e42fec7686d 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -9,9 +9,6 @@ menu "IRQ subsystem"
 config GENERIC_HARDIRQS
        def_bool y
 
-config GENERIC_HARDIRQS_NO__DO_IRQ
-       def_bool y
-
 # Select this to disable the deprecated stuff
 config GENERIC_HARDIRQS_NO_DEPRECATED
        def_bool n
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e2347eb63306..3540a7190122 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 
 	return retval;
 }
-
-#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
-
-#ifdef CONFIG_ENABLE_WARN_DEPRECATED
-# warning __do_IRQ is deprecated. Please convert to proper flow handlers
-#endif
-
-/**
- * __do_IRQ - original all in one highlevel IRQ handler
- * @irq:	the interrupt number
- *
- * __do_IRQ handles all normal device IRQ's (the special
- * SMP cross-CPU interrupts have their own specific
- * handlers).
- *
- * This is the original x86 implementation which is used for every
- * interrupt type.
- */
-unsigned int __do_IRQ(unsigned int irq)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-	struct irqaction *action;
-	unsigned int status;
-
-	kstat_incr_irqs_this_cpu(irq, desc);
-
-	if (CHECK_IRQ_PER_CPU(desc->status)) {
-		irqreturn_t action_ret;
-
-		/*
-		 * No locking required for CPU-local interrupts:
-		 */
-		if (desc->irq_data.chip->ack)
-			desc->irq_data.chip->ack(irq);
-		if (likely(!(desc->status & IRQ_DISABLED))) {
-			action_ret = handle_IRQ_event(irq, desc->action);
-			if (!noirqdebug)
-				note_interrupt(irq, desc, action_ret);
-		}
-		desc->irq_data.chip->end(irq);
-		return 1;
-	}
-
-	raw_spin_lock(&desc->lock);
-	if (desc->irq_data.chip->ack)
-		desc->irq_data.chip->ack(irq);
-	/*
-	 * REPLAY is when Linux resends an IRQ that was dropped earlier
-	 * WAITING is used by probe to mark irqs that are being tested
-	 */
-	status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING);
-	status |= IRQ_PENDING; /* we _want_ to handle it */
-
-	/*
-	 * If the IRQ is disabled for whatever reason, we cannot
-	 * use the action we have.
-	 */
-	action = NULL;
-	if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) {
-		action = desc->action;
-		status &= ~IRQ_PENDING; /* we commit to handling */
-		status |= IRQ_INPROGRESS; /* we are handling it */
-	}
-	desc->status = status;
-
-	/*
-	 * If there is no IRQ handler or it was disabled, exit early.
-	 * Since we set PENDING, if another processor is handling
-	 * a different instance of this same irq, the other processor
-	 * will take care of it.
-	 */
-	if (unlikely(!action))
-		goto out;
-
-	/*
-	 * Edge triggered interrupts need to remember
-	 * pending events.
-	 * This applies to any hw interrupts that allow a second
-	 * instance of the same irq to arrive while we are in do_IRQ
-	 * or in the handler. But the code here only handles the _second_
-	 * instance of the irq, not the third or fourth. So it is mostly
-	 * useful for irq hardware that does not mask cleanly in an
-	 * SMP environment.
-	 */
-	for (;;) {
-		irqreturn_t action_ret;
-
-		raw_spin_unlock(&desc->lock);
-
-		action_ret = handle_IRQ_event(irq, action);
-		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret);
-
-		raw_spin_lock(&desc->lock);
-		if (likely(!(desc->status & IRQ_PENDING)))
-			break;
-		desc->status &= ~IRQ_PENDING;
-	}
-	desc->status &= ~IRQ_INPROGRESS;
-
-out:
-	/*
-	 * The ->end() handler has to deal with interrupts which got
-	 * disabled while the handler was running.
-	 */
-	desc->irq_data.chip->end(irq);
-	raw_spin_unlock(&desc->lock);
-
-	return 1;
-}
-#endif
-- 
cgit v1.3-8-gc7d7


From 547e9fd7d328af261f184bf66effc5033c886498 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 19 Jan 2011 12:51:39 +0100
Subject: perf: Annotate cpuctx->ctx.mutex to avoid a lockdep splat

Lockdep spotted:

	loop_1b_instruc/1899 is trying to acquire lock:
	 (event_mutex){+.+.+.}, at: [<ffffffff810e1908>] perf_trace_init+0x3b/0x2f7

	but task is already holding lock:
	 (&ctx->mutex){+.+.+.}, at: [<ffffffff810eb45b>] perf_event_init_context+0xc0/0x218

	which lock already depends on the new lock.

	the existing dependency chain (in reverse order) is:

	-> #3 (&ctx->mutex){+.+.+.}:
	-> #2 (cpu_hotplug.lock){+.+.+.}:
	-> #1 (module_mutex){+.+...}:
	-> #0 (event_mutex){+.+.+.}:

But because the deadlock would be cpuhotplug (cpu-event) vs fork
(task-event) it cannot, in fact, happen. We can annotate this by giving the
perf_event_context used for the cpuctx a different lock class from those
used by tasks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 244ca3acb0ee..c5fa717cf099 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5380,6 +5380,8 @@ free_dev:
 	goto out;
 }
 
+static struct lock_class_key cpuctx_mutex;
+
 int perf_pmu_register(struct pmu *pmu, char *name, int type)
 {
 	int cpu, ret;
@@ -5428,6 +5430,7 @@ skip_type:
 
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		__perf_event_init_context(&cpuctx->ctx);
+		lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
 		cpuctx->ctx.type = cpu_context;
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->jiffies_interval = 1;
-- 
cgit v1.3-8-gc7d7


From 806839b22cbda90176d7f8d421889bddd7826e93 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 21 Jan 2011 18:45:47 +0100
Subject: perf: perf_event_exit_task_context:
 s/rcu_dereference/rcu_dereference_raw/

In theory, almost every user of task->child->perf_event_ctxp[]
is wrong. find_get_context() can install the new context at any
moment, we need read_barrier_depends().

dbe08d82ce3967ccdf459f7951d02589cf967300 "perf: Fix
find_get_context() vs perf_event_exit_task() race" added
rcu_dereference() into perf_event_exit_task_context() to make
the precedent, but this makes __rcu_dereference_check() unhappy.
Use rcu_dereference_raw() to shut up the warning.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: acme@redhat.com
Cc: paulus@samba.org
Cc: stern@rowland.harvard.edu
Cc: a.p.zijlstra@chello.nl
Cc: fweisbec@gmail.com
Cc: roland@redhat.com
Cc: prasad@linux.vnet.ibm.com
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
LKML-Reference: <20110121174547.GA8796@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index c5fa717cf099..126a302c481c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -6136,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 * scheduled, so we are now safe from rescheduling changing
 	 * our context.
 	 */
-	child_ctx = rcu_dereference(child->perf_event_ctxp[ctxn]);
+	child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
 	task_ctx_sched_out(child_ctx, EVENT_ALL);
 
 	/*
-- 
cgit v1.3-8-gc7d7


From e94965ed5beb23c6fabf7ed31f625e66d7ff28de Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor@vmware.com>
Date: Wed, 15 Dec 2010 14:00:19 -0800
Subject: module: show version information for built-in modules in sysfs

Currently only drivers that are built as modules have their versions
shown in /sys/module/<module_name>/version, but this information might
also be useful for built-in drivers as well. This especially important
for drivers that do not define any parameters - such drivers, if
built-in, are completely invisible from userspace.

This patch changes MODULE_VERSION() macro so that in case when we are
compiling built-in module, version information is stored in a separate
section. Kernel then uses this data to create 'version' sysfs attribute
in the same fashion it creates attributes for module parameters.

Signed-off-by: Dmitry Torokhov <dtor@vmware.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/asm-generic/vmlinux.lds.h |  7 +++++
 include/linux/module.h            | 27 ++++++++++++++++
 kernel/params.c                   | 65 ++++++++++++++++++++++++++++++++-------
 3 files changed, 88 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 68649336c4ad..6ebb81030d2d 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -364,6 +364,13 @@
 		VMLINUX_SYMBOL(__start___param) = .;			\
 		*(__param)						\
 		VMLINUX_SYMBOL(__stop___param) = .;			\
+	}								\
+									\
+	/* Built-in module versions. */					\
+	__modver : AT(ADDR(__modver) - LOAD_OFFSET) {			\
+		VMLINUX_SYMBOL(__start___modver) = .;			\
+		*(__modver)						\
+		VMLINUX_SYMBOL(__stop___modver) = .;			\
 		. = ALIGN((align));					\
 		VMLINUX_SYMBOL(__end_rodata) = .;			\
 	}								\
diff --git a/include/linux/module.h b/include/linux/module.h
index 8b17fd8c790d..50efcd3ae850 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -58,6 +58,12 @@ struct module_attribute {
 	void (*free)(struct module *);
 };
 
+struct module_version_attribute {
+	struct module_attribute mattr;
+	const char *module_name;
+	const char *version;
+};
+
 struct module_kobject
 {
 	struct kobject kobj;
@@ -161,7 +167,28 @@ extern struct module __this_module;
   Using this automatically adds a checksum of the .c files and the
   local headers in "srcversion".
 */
+
+#ifdef MODULE
 #define MODULE_VERSION(_version) MODULE_INFO(version, _version)
+#else
+#define MODULE_VERSION(_version)					\
+	extern ssize_t __modver_version_show(struct module_attribute *,	\
+					     struct module *, char *);	\
+	static struct module_version_attribute __modver_version_attr	\
+	__used								\
+    __attribute__ ((__section__ ("__modver"),aligned(sizeof(void *)))) \
+	= {								\
+		.mattr	= {						\
+			.attr	= {					\
+				.name	= "version",			\
+				.mode	= S_IRUGO,			\
+			},						\
+			.show	= __modver_version_show,		\
+		},							\
+		.module_name	= KBUILD_MODNAME,			\
+		.version	= _version,				\
+	}
+#endif
 
 /* Optional firmware file (or files) needed by the module
  * format is simply firmware file name.  Multiple firmware
diff --git a/kernel/params.c b/kernel/params.c
index 08107d181758..0da1411222b9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num)
 			params[i].ops->free(params[i].arg);
 }
 
-static void __init kernel_add_sysfs_param(const char *name,
-					  struct kernel_param *kparam,
-					  unsigned int name_skip)
+static struct module_kobject * __init locate_module_kobject(const char *name)
 {
 	struct module_kobject *mk;
 	struct kobject *kobj;
@@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name,
 
 	kobj = kset_find_obj(module_kset, name);
 	if (kobj) {
-		/* We already have one.  Remove params so we can add more. */
 		mk = to_module_kobject(kobj);
-		/* We need to remove it before adding parameters. */
-		sysfs_remove_group(&mk->kobj, &mk->mp->grp);
 	} else {
 		mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
 		BUG_ON(!mk);
@@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name,
 					   "%s", name);
 		if (err) {
 			kobject_put(&mk->kobj);
-			printk(KERN_ERR "Module '%s' failed add to sysfs, "
-			       "error number %d\n", name, err);
-			printk(KERN_ERR	"The system will be unstable now.\n");
-			return;
+			printk(KERN_ERR
+				"Module '%s' failed add to sysfs, error number %d\n",
+				name, err);
+			printk(KERN_ERR
+				"The system will be unstable now.\n");
+			return NULL;
 		}
-		/* So that exit path is even. */
+
+		/* So that we hold reference in both cases. */
 		kobject_get(&mk->kobj);
 	}
 
+	return mk;
+}
+
+static void __init kernel_add_sysfs_param(const char *name,
+					  struct kernel_param *kparam,
+					  unsigned int name_skip)
+{
+	struct module_kobject *mk;
+	int err;
+
+	mk = locate_module_kobject(name);
+	if (!mk)
+		return;
+
+	/* We need to remove old parameters before adding more. */
+	if (mk->mp)
+		sysfs_remove_group(&mk->kobj, &mk->mp->grp);
+
 	/* These should not fail at boot. */
 	err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
 	BUG_ON(err);
@@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void)
 	}
 }
 
+ssize_t __modver_version_show(struct module_attribute *mattr,
+			      struct module *mod, char *buf)
+{
+	struct module_version_attribute *vattr =
+		container_of(mattr, struct module_version_attribute, mattr);
+
+	return sprintf(buf, "%s\n", vattr->version);
+}
+
+extern struct module_version_attribute __start___modver[], __stop___modver[];
+
+static void __init version_sysfs_builtin(void)
+{
+	const struct module_version_attribute *vattr;
+	struct module_kobject *mk;
+	int err;
+
+	for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
+		mk = locate_module_kobject(vattr->module_name);
+		if (mk) {
+			err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
+			kobject_uevent(&mk->kobj, KOBJ_ADD);
+			kobject_put(&mk->kobj);
+		}
+	}
+}
 
 /* module-related sysfs stuff */
 
@@ -875,6 +917,7 @@ static int __init param_sysfs_init(void)
 	}
 	module_sysfs_initialized = 1;
 
+	version_sysfs_builtin();
 	param_sysfs_builtin();
 
 	return 0;
-- 
cgit v1.3-8-gc7d7


From 3ff6dcac735704824c1dff64dc6863c390d364cc Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Mon, 24 Jan 2011 15:33:52 +0800
Subject: sched: Fix poor interactivity on UP systems due to group scheduler
 nice tune bug

Michael Witten and Christian Kujau reported that the autogroup
scheduling feature hurts interactivity on their UP systems.

It turns out that this is an older bug in the group scheduling code,
and the wider appeal provided by the autogroup feature exposed it
more prominently.

When on UP with FAIR_GROUP_SCHED enabled, tune shares
only affect tg->shares, but is not reflected in
tg->se->load. The reason is that update_cfs_shares()
does nothing on UP.

So introduce update_cfs_shares() for UP && FAIR_GROUP_SCHED.

This issue was found when enable autogroup scheduling was enabled,
but it is an older bug that also exists on cgroup.cpu on UP.

Reported-and-Tested-by: Michael Witten <mfwitten@gmail.com>
Reported-and-Tested-by: Christian Kujau <christian@nerdbynature.de>
Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Acked-by: Pekka Enberg <penberg@kernel.org>
Acked-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110124073352.GA24186@windriver.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 78 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 77e9166d7bbf..354769979c02 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->nr_running--;
 }
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_FAIR_GROUP_SCHED
+# ifdef CONFIG_SMP
 static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
 					    int global_update)
 {
@@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 		list_del_leaf_cfs_rq(cfs_rq);
 }
 
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+				long weight_delta)
+{
+	long load_weight, load, shares;
+
+	load = cfs_rq->load.weight + weight_delta;
+
+	load_weight = atomic_read(&tg->load_weight);
+	load_weight -= cfs_rq->load_contribution;
+	load_weight += load;
+
+	shares = (tg->shares * load);
+	if (load_weight)
+		shares /= load_weight;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	if (shares > tg->shares)
+		shares = tg->shares;
+
+	return shares;
+}
+
+static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
+		update_cfs_load(cfs_rq, 0);
+		update_cfs_shares(cfs_rq, 0);
+	}
+}
+# else /* CONFIG_SMP */
+static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+{
+}
+
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
+				long weight_delta)
+{
+	return tg->shares;
+}
+
+static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq)
+{
+}
+# endif /* CONFIG_SMP */
 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			    unsigned long weight)
 {
@@ -782,7 +828,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 {
 	struct task_group *tg;
 	struct sched_entity *se;
-	long load_weight, load, shares;
+	long shares;
 
 	if (!cfs_rq)
 		return;
@@ -791,32 +837,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	se = tg->se[cpu_of(rq_of(cfs_rq))];
 	if (!se)
 		return;
-
-	load = cfs_rq->load.weight + weight_delta;
-
-	load_weight = atomic_read(&tg->load_weight);
-	load_weight -= cfs_rq->load_contribution;
-	load_weight += load;
-
-	shares = (tg->shares * load);
-	if (load_weight)
-		shares /= load_weight;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	if (shares > tg->shares)
-		shares = tg->shares;
+#ifndef CONFIG_SMP
+	if (likely(se->load.weight == tg->shares))
+		return;
+#endif
+	shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
-
-static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
-{
-	if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
-		update_cfs_load(cfs_rq, 0);
-		update_cfs_shares(cfs_rq, 0);
-	}
-}
 #else /* CONFIG_FAIR_GROUP_SCHED */
 static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 {
-- 
cgit v1.3-8-gc7d7


From 8c6a98b22b750c9eb52653ba643faa17db8d3881 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@canonical.com>
Date: Mon, 24 Jan 2011 09:31:38 -0800
Subject: Input: sysrq - ensure sysrq_enabled and __sysrq_enabled are
 consistent

Currently sysrq_enabled and __sysrq_enabled are initialised separately
and inconsistently, leading to sysrq being actually enabled by reported
as not enabled in sysfs.  The first change to the sysfs configurable
synchronises these two:

    static int __read_mostly sysrq_enabled = 1;
    static int __sysrq_enabled;

Add a common define to carry the default for these preventing them becoming
out of sync again.  Default this to 1 to mirror previous behaviour.

Signed-off-by: Andy Whitcroft <apw@canonical.com>
Cc: stable@kernel.org
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/char/sysrq.c  | 2 +-
 include/linux/sysrq.h | 3 +++
 kernel/sysctl.c       | 3 ++-
 3 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index c556ed9db13d..8e0dd254eb11 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -46,7 +46,7 @@
 #include <asm/irq_regs.h>
 
 /* Whether we react on sysrq keys or just ignore them */
-static int __read_mostly sysrq_enabled = 1;
+static int __read_mostly sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
 static bool __read_mostly sysrq_always_enabled;
 
 static bool sysrq_on(void)
diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h
index 387fa7d05c98..7faf933cced7 100644
--- a/include/linux/sysrq.h
+++ b/include/linux/sysrq.h
@@ -17,6 +17,9 @@
 #include <linux/errno.h>
 #include <linux/types.h>
 
+/* Enable/disable SYSRQ support by default (0==no, 1==yes). */
+#define SYSRQ_DEFAULT_ENABLE	1
+
 /* Possible values of bitmask for enabling sysrq functions */
 /* 0x0001 is reserved for enable everything */
 #define SYSRQ_ENABLE_LOG	0x0002
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c33a1edb799f..3afce4dc9ba5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -169,7 +169,8 @@ static int proc_taint(struct ctl_table *table, int write,
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */
+/* Note: sysrq code uses it's own private copy */
+static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
 
 static int sysrq_sysctl_handler(ctl_table *table, int write,
 				void __user *buffer, size_t *lenp,
-- 
cgit v1.3-8-gc7d7


From ac751efa6a0d70f2c9daef5c7e3a92270f5c2dff Mon Sep 17 00:00:00 2001
From: Torben Hohn <torbenh@gmx.de>
Date: Tue, 25 Jan 2011 15:07:35 -0800
Subject: console: rename acquire/release_console_sem() to
 console_lock/unlock()

The -rt patches change the console_semaphore to console_mutex.  As a
result, a quite large chunk of the patches changes all
acquire/release_console_sem() to acquire/release_console_mutex()

This commit makes things use more neutral function names which dont make
implications about the underlying lock.

The only real change is the return value of console_trylock which is
inverted from try_acquire_console_sem()

This patch also paves the way to switching console_sem from a semaphore to
a mutex.

[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: make console_trylock return 1 on success, per Geert]
Signed-off-by: Torben Hohn <torbenh@gmx.de>
Cc: Thomas Gleixner <tglx@tglx.de>
Cc: Greg KH <gregkh@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-omap2/pm24xx.c           |   4 +-
 arch/arm/mach-omap2/pm34xx.c           |   4 +-
 arch/arm/mach-omap2/serial.c           |   4 +-
 arch/parisc/kernel/pdc_cons.c          |   4 +-
 drivers/char/bfin_jtag_comm.c          |   8 +--
 drivers/gpu/drm/nouveau/nouveau_drv.c  |   8 +--
 drivers/gpu/drm/radeon/radeon_device.c |  10 +--
 drivers/staging/msm/msm_fb.c           |   8 +--
 drivers/staging/olpc_dcon/olpc_dcon.c  |  10 +--
 drivers/staging/sm7xx/smtcfb.c         |   8 +--
 drivers/tty/serial/sb1250-duart.c      |   2 +-
 drivers/tty/tty_io.c                   |   4 +-
 drivers/tty/vt/selection.c             |   4 +-
 drivers/tty/vt/vc_screen.c             |  16 ++---
 drivers/tty/vt/vt.c                    | 124 ++++++++++++++++-----------------
 drivers/tty/vt/vt_ioctl.c              |  60 ++++++++--------
 drivers/video/arkfb.c                  |  12 ++--
 drivers/video/aty/aty128fb.c           |  12 ++--
 drivers/video/aty/atyfb_base.c         |  10 +--
 drivers/video/aty/radeon_pm.c          |  10 +--
 drivers/video/chipsfb.c                |   8 +--
 drivers/video/console/fbcon.c          |  42 +++++------
 drivers/video/da8xx-fb.c               |   8 +--
 drivers/video/fbmem.c                  |  12 ++--
 drivers/video/fbsysfs.c                |  20 +++---
 drivers/video/geode/gxfb_core.c        |   8 +--
 drivers/video/geode/lxfb_core.c        |   8 +--
 drivers/video/i810/i810_main.c         |   8 +--
 drivers/video/jz4740_fb.c              |   8 +--
 drivers/video/mx3fb.c                  |   8 +--
 drivers/video/nvidia/nvidia.c          |   8 +--
 drivers/video/ps3fb.c                  |  16 ++---
 drivers/video/s3fb.c                   |  16 ++---
 drivers/video/savage/savagefb_driver.c |   8 +--
 drivers/video/sh_mobile_hdmi.c         |   8 +--
 drivers/video/sh_mobile_lcdcfb.c       |   4 +-
 drivers/video/sm501fb.c                |   8 +--
 drivers/video/tmiofb.c                 |  10 +--
 drivers/video/via/viafbdev.c           |   8 +--
 drivers/video/vt8623fb.c               |  12 ++--
 drivers/video/xen-fbfront.c            |   4 +-
 fs/proc/consoles.c                     |   4 +-
 include/linux/console.h                |   6 +-
 kernel/printk.c                        | 100 ++++++++++++++------------
 44 files changed, 336 insertions(+), 328 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-omap2/pm24xx.c b/arch/arm/mach-omap2/pm24xx.c
index 9e5dc8ed51e9..97feb3ab6a69 100644
--- a/arch/arm/mach-omap2/pm24xx.c
+++ b/arch/arm/mach-omap2/pm24xx.c
@@ -134,7 +134,7 @@ static void omap2_enter_full_retention(void)
 
 	/* Block console output in case it is on one of the OMAP UARTs */
 	if (!is_suspending())
-		if (try_acquire_console_sem())
+		if (!console_trylock())
 			goto no_sleep;
 
 	omap_uart_prepare_idle(0);
@@ -151,7 +151,7 @@ static void omap2_enter_full_retention(void)
 	omap_uart_resume_idle(0);
 
 	if (!is_suspending())
-		release_console_sem();
+		console_unlock();
 
 no_sleep:
 	if (omap2_pm_debug) {
diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index 8cbbeade4b8a..a4aa1920a75c 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -398,7 +398,7 @@ void omap_sram_idle(void)
 	if (!is_suspending())
 		if (per_next_state < PWRDM_POWER_ON ||
 		    core_next_state < PWRDM_POWER_ON)
-			if (try_acquire_console_sem())
+			if (!console_trylock())
 				goto console_still_active;
 
 	/* PER */
@@ -481,7 +481,7 @@ void omap_sram_idle(void)
 	}
 
 	if (!is_suspending())
-		release_console_sem();
+		console_unlock();
 
 console_still_active:
 	/* Disable IO-PAD and IO-CHAIN wakeup */
diff --git a/arch/arm/mach-omap2/serial.c b/arch/arm/mach-omap2/serial.c
index 302da7403a10..32e91a9c8b6b 100644
--- a/arch/arm/mach-omap2/serial.c
+++ b/arch/arm/mach-omap2/serial.c
@@ -812,7 +812,7 @@ void __init omap_serial_init_port(struct omap_board_data *bdata)
 
 	oh->dev_attr = uart;
 
-	acquire_console_sem(); /* in case the earlycon is on the UART */
+	console_lock(); /* in case the earlycon is on the UART */
 
 	/*
 	 * Because of early UART probing, UART did not get idled
@@ -838,7 +838,7 @@ void __init omap_serial_init_port(struct omap_board_data *bdata)
 	omap_uart_block_sleep(uart);
 	uart->timeout = DEFAULT_TIMEOUT;
 
-	release_console_sem();
+	console_unlock();
 
 	if ((cpu_is_omap34xx() && uart->padconf) ||
 	    (uart->wk_en && uart->wk_mask)) {
diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c
index 11bdd68e5762..fc770be465ff 100644
--- a/arch/parisc/kernel/pdc_cons.c
+++ b/arch/parisc/kernel/pdc_cons.c
@@ -169,11 +169,11 @@ static int __init pdc_console_tty_driver_init(void)
 
 	struct console *tmp;
 
-	acquire_console_sem();
+	console_lock();
 	for_each_console(tmp)
 		if (tmp == &pdc_cons)
 			break;
-	release_console_sem();
+	console_unlock();
 
 	if (!tmp) {
 		printk(KERN_INFO "PDC console driver not registered anymore, not creating %s\n", pdc_cons.name);
diff --git a/drivers/char/bfin_jtag_comm.c b/drivers/char/bfin_jtag_comm.c
index e397df3ad98e..16402445f2b2 100644
--- a/drivers/char/bfin_jtag_comm.c
+++ b/drivers/char/bfin_jtag_comm.c
@@ -183,16 +183,16 @@ bfin_jc_circ_write(const unsigned char *buf, int count)
 }
 
 #ifndef CONFIG_BFIN_JTAG_COMM_CONSOLE
-# define acquire_console_sem()
-# define release_console_sem()
+# define console_lock()
+# define console_unlock()
 #endif
 static int
 bfin_jc_write(struct tty_struct *tty, const unsigned char *buf, int count)
 {
 	int i;
-	acquire_console_sem();
+	console_lock();
 	i = bfin_jc_circ_write(buf, count);
-	release_console_sem();
+	console_unlock();
 	wake_up_process(bfin_jc_kthread);
 	return i;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c
index 13bb672a16f4..f658a04eecf9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.c
@@ -234,9 +234,9 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state)
 		pci_set_power_state(pdev, PCI_D3hot);
 	}
 
-	acquire_console_sem();
+	console_lock();
 	nouveau_fbcon_set_suspend(dev, 1);
-	release_console_sem();
+	console_unlock();
 	nouveau_fbcon_restore_accel(dev);
 	return 0;
 
@@ -359,9 +359,9 @@ nouveau_pci_resume(struct pci_dev *pdev)
 		nv_crtc->lut.depth = 0;
 	}
 
-	acquire_console_sem();
+	console_lock();
 	nouveau_fbcon_set_suspend(dev, 0);
-	release_console_sem();
+	console_unlock();
 
 	nouveau_fbcon_zfill_all(dev);
 
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 26091d602b84..0d478932b1a9 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -891,9 +891,9 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state)
 		pci_disable_device(dev->pdev);
 		pci_set_power_state(dev->pdev, PCI_D3hot);
 	}
-	acquire_console_sem();
+	console_lock();
 	radeon_fbdev_set_suspend(rdev, 1);
-	release_console_sem();
+	console_unlock();
 	return 0;
 }
 
@@ -905,11 +905,11 @@ int radeon_resume_kms(struct drm_device *dev)
 	if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
 		return 0;
 
-	acquire_console_sem();
+	console_lock();
 	pci_set_power_state(dev->pdev, PCI_D0);
 	pci_restore_state(dev->pdev);
 	if (pci_enable_device(dev->pdev)) {
-		release_console_sem();
+		console_unlock();
 		return -1;
 	}
 	pci_set_master(dev->pdev);
@@ -920,7 +920,7 @@ int radeon_resume_kms(struct drm_device *dev)
 	radeon_restore_bios_scratch_regs(rdev);
 
 	radeon_fbdev_set_suspend(rdev, 0);
-	release_console_sem();
+	console_unlock();
 
 	/* reset hpd state */
 	radeon_hpd_init(rdev);
diff --git a/drivers/staging/msm/msm_fb.c b/drivers/staging/msm/msm_fb.c
index 23fa049b51f2..a2f29d464051 100644
--- a/drivers/staging/msm/msm_fb.c
+++ b/drivers/staging/msm/msm_fb.c
@@ -347,7 +347,7 @@ static int msm_fb_suspend(struct platform_device *pdev, pm_message_t state)
 	if ((!mfd) || (mfd->key != MFD_KEY))
 		return 0;
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(mfd->fbi, 1);
 
 	ret = msm_fb_suspend_sub(mfd);
@@ -358,7 +358,7 @@ static int msm_fb_suspend(struct platform_device *pdev, pm_message_t state)
 		pdev->dev.power.power_state = state;
 	}
 
-	release_console_sem();
+	console_unlock();
 	return ret;
 }
 #else
@@ -431,11 +431,11 @@ static int msm_fb_resume(struct platform_device *pdev)
 	if ((!mfd) || (mfd->key != MFD_KEY))
 		return 0;
 
-	acquire_console_sem();
+	console_lock();
 	ret = msm_fb_resume_sub(mfd);
 	pdev->dev.power.power_state = PMSG_ON;
 	fb_set_suspend(mfd->fbi, 1);
-	release_console_sem();
+	console_unlock();
 
 	return ret;
 }
diff --git a/drivers/staging/olpc_dcon/olpc_dcon.c b/drivers/staging/olpc_dcon/olpc_dcon.c
index 9f26dc9408bb..56a283d1a74d 100644
--- a/drivers/staging/olpc_dcon/olpc_dcon.c
+++ b/drivers/staging/olpc_dcon/olpc_dcon.c
@@ -373,17 +373,17 @@ static void dcon_source_switch(struct work_struct *work)
 		 *
 		 * For now, we just hope..
 		 */
-		acquire_console_sem();
+		console_lock();
 		ignore_fb_events = 1;
 		if (fb_blank(fbinfo, FB_BLANK_UNBLANK)) {
 			ignore_fb_events = 0;
-			release_console_sem();
+			console_unlock();
 			printk(KERN_ERR "olpc-dcon:  Failed to enter CPU mode\n");
 			dcon_pending = DCON_SOURCE_DCON;
 			return;
 		}
 		ignore_fb_events = 0;
-		release_console_sem();
+		console_unlock();
 
 		/* And turn off the DCON */
 		pdata->set_dconload(1);
@@ -435,12 +435,12 @@ static void dcon_source_switch(struct work_struct *work)
 			}
 		}
 
-		acquire_console_sem();
+		console_lock();
 		ignore_fb_events = 1;
 		if (fb_blank(fbinfo, FB_BLANK_POWERDOWN))
 			printk(KERN_ERR "olpc-dcon:  couldn't blank fb!\n");
 		ignore_fb_events = 0;
-		release_console_sem();
+		console_unlock();
 
 		printk(KERN_INFO "olpc-dcon: The DCON has control\n");
 		break;
diff --git a/drivers/staging/sm7xx/smtcfb.c b/drivers/staging/sm7xx/smtcfb.c
index 0bc113c44d39..d007e4a12c14 100644
--- a/drivers/staging/sm7xx/smtcfb.c
+++ b/drivers/staging/sm7xx/smtcfb.c
@@ -1044,9 +1044,9 @@ static int __maybe_unused smtcfb_suspend(struct pci_dev *pdev, pm_message_t msg)
 
 	/* when doing suspend, call fb apis and pci apis */
 	if (msg.event == PM_EVENT_SUSPEND) {
-		acquire_console_sem();
+		console_lock();
 		fb_set_suspend(&sfb->fb, 1);
-		release_console_sem();
+		console_unlock();
 		retv = pci_save_state(pdev);
 		pci_disable_device(pdev);
 		retv = pci_choose_state(pdev, msg);
@@ -1105,9 +1105,9 @@ static int __maybe_unused smtcfb_resume(struct pci_dev *pdev)
 
 	smtcfb_setmode(sfb);
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(&sfb->fb, 0);
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c
index a2f2b3254499..602d9845c52f 100644
--- a/drivers/tty/serial/sb1250-duart.c
+++ b/drivers/tty/serial/sb1250-duart.c
@@ -829,7 +829,7 @@ static void __init sbd_probe_duarts(void)
 #ifdef CONFIG_SERIAL_SB1250_DUART_CONSOLE
 /*
  * Serial console stuff.  Very basic, polling driver for doing serial
- * console output.  The console_sem is held by the caller, so we
+ * console output.  The console_lock is held by the caller, so we
  * shouldn't be interrupted for more console activity.
  */
 static void sbd_console_putchar(struct uart_port *uport, int ch)
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 464d09d97873..6158eae0f64a 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3256,7 +3256,7 @@ static ssize_t show_cons_active(struct device *dev,
 	struct console *c;
 	ssize_t count = 0;
 
-	acquire_console_sem();
+	console_lock();
 	for (c = console_drivers; c; c = c->next) {
 		if (!c->device)
 			continue;
@@ -3271,7 +3271,7 @@ static ssize_t show_cons_active(struct device *dev,
 	while (i--)
 		count += sprintf(buf + count, "%s%d%c",
 				 cs[i]->name, cs[i]->index, i ? ' ':'\n');
-	release_console_sem();
+	console_unlock();
 
 	return count;
 }
diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c
index ebae344ce910..c956ed6c83a3 100644
--- a/drivers/tty/vt/selection.c
+++ b/drivers/tty/vt/selection.c
@@ -316,9 +316,9 @@ int paste_selection(struct tty_struct *tty)
 	/* always called with BTM from vt_ioctl */
 	WARN_ON(!tty_locked());
 
-	acquire_console_sem();
+	console_lock();
 	poke_blanked_console();
-	release_console_sem();
+	console_unlock();
 
 	ld = tty_ldisc_ref(tty);
 	if (!ld) {
diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c
index eab3a1ff99e4..a672ed192d33 100644
--- a/drivers/tty/vt/vc_screen.c
+++ b/drivers/tty/vt/vc_screen.c
@@ -202,7 +202,7 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	/* Select the proper current console and verify
 	 * sanity of the situation under the console lock.
 	 */
-	acquire_console_sem();
+	console_lock();
 
 	attr = (currcons & 128);
 	currcons = (currcons & 127);
@@ -336,9 +336,9 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 		 * the pagefault handling code may want to call printk().
 		 */
 
-		release_console_sem();
+		console_unlock();
 		ret = copy_to_user(buf, con_buf_start, orig_count);
-		acquire_console_sem();
+		console_lock();
 
 		if (ret) {
 			read += (orig_count - ret);
@@ -354,7 +354,7 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	if (read)
 		ret = read;
 unlock_out:
-	release_console_sem();
+	console_unlock();
 	mutex_unlock(&con_buf_mtx);
 	return ret;
 }
@@ -379,7 +379,7 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 	/* Select the proper current console and verify
 	 * sanity of the situation under the console lock.
 	 */
-	acquire_console_sem();
+	console_lock();
 
 	attr = (currcons & 128);
 	currcons = (currcons & 127);
@@ -414,9 +414,9 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 		/* Temporarily drop the console lock so that we can read
 		 * in the write data from userspace safely.
 		 */
-		release_console_sem();
+		console_unlock();
 		ret = copy_from_user(con_buf, buf, this_round);
-		acquire_console_sem();
+		console_lock();
 
 		if (ret) {
 			this_round -= ret;
@@ -542,7 +542,7 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos)
 		vcs_scr_updated(vc);
 
 unlock_out:
-	release_console_sem();
+	console_unlock();
 
 	mutex_unlock(&con_buf_mtx);
 
diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c
index 76407eca9ab0..b230bd3f056f 100644
--- a/drivers/tty/vt/vt.c
+++ b/drivers/tty/vt/vt.c
@@ -1003,9 +1003,9 @@ static int vt_resize(struct tty_struct *tty, struct winsize *ws)
 	struct vc_data *vc = tty->driver_data;
 	int ret;
 
-	acquire_console_sem();
+	console_lock();
 	ret = vc_do_resize(tty, vc, ws->ws_col, ws->ws_row);
-	release_console_sem();
+	console_unlock();
 	return ret;
 }
 
@@ -1271,7 +1271,7 @@ static void default_attr(struct vc_data *vc)
 	vc->vc_color = vc->vc_def_color;
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void csi_m(struct vc_data *vc)
 {
 	int i;
@@ -1415,7 +1415,7 @@ int mouse_reporting(void)
 	return vc_cons[fg_console].d->vc_report_mouse;
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void set_mode(struct vc_data *vc, int on_off)
 {
 	int i;
@@ -1485,7 +1485,7 @@ static void set_mode(struct vc_data *vc, int on_off)
 		}
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void setterm_command(struct vc_data *vc)
 {
 	switch(vc->vc_par[0]) {
@@ -1545,7 +1545,7 @@ static void setterm_command(struct vc_data *vc)
 	}
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void csi_at(struct vc_data *vc, unsigned int nr)
 {
 	if (nr > vc->vc_cols - vc->vc_x)
@@ -1555,7 +1555,7 @@ static void csi_at(struct vc_data *vc, unsigned int nr)
 	insert_char(vc, nr);
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void csi_L(struct vc_data *vc, unsigned int nr)
 {
 	if (nr > vc->vc_rows - vc->vc_y)
@@ -1566,7 +1566,7 @@ static void csi_L(struct vc_data *vc, unsigned int nr)
 	vc->vc_need_wrap = 0;
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void csi_P(struct vc_data *vc, unsigned int nr)
 {
 	if (nr > vc->vc_cols - vc->vc_x)
@@ -1576,7 +1576,7 @@ static void csi_P(struct vc_data *vc, unsigned int nr)
 	delete_char(vc, nr);
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void csi_M(struct vc_data *vc, unsigned int nr)
 {
 	if (nr > vc->vc_rows - vc->vc_y)
@@ -1587,7 +1587,7 @@ static void csi_M(struct vc_data *vc, unsigned int nr)
 	vc->vc_need_wrap = 0;
 }
 
-/* console_sem is held (except via vc_init->reset_terminal */
+/* console_lock is held (except via vc_init->reset_terminal */
 static void save_cur(struct vc_data *vc)
 {
 	vc->vc_saved_x		= vc->vc_x;
@@ -1603,7 +1603,7 @@ static void save_cur(struct vc_data *vc)
 	vc->vc_saved_G1		= vc->vc_G1_charset;
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void restore_cur(struct vc_data *vc)
 {
 	gotoxy(vc, vc->vc_saved_x, vc->vc_saved_y);
@@ -1625,7 +1625,7 @@ enum { ESnormal, ESesc, ESsquare, ESgetpars, ESgotpars, ESfunckey,
 	EShash, ESsetG0, ESsetG1, ESpercent, ESignore, ESnonstd,
 	ESpalette };
 
-/* console_sem is held (except via vc_init()) */
+/* console_lock is held (except via vc_init()) */
 static void reset_terminal(struct vc_data *vc, int do_clear)
 {
 	vc->vc_top		= 0;
@@ -1685,7 +1685,7 @@ static void reset_terminal(struct vc_data *vc, int do_clear)
 	    csi_J(vc, 2);
 }
 
-/* console_sem is held */
+/* console_lock is held */
 static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c)
 {
 	/*
@@ -2119,7 +2119,7 @@ static int is_double_width(uint32_t ucs)
 	return bisearch(ucs, double_width, ARRAY_SIZE(double_width) - 1);
 }
 
-/* acquires console_sem */
+/* acquires console_lock */
 static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int count)
 {
 #ifdef VT_BUF_VRAM_ONLY
@@ -2147,11 +2147,11 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
 
 	might_sleep();
 
-	acquire_console_sem();
+	console_lock();
 	vc = tty->driver_data;
 	if (vc == NULL) {
 		printk(KERN_ERR "vt: argh, driver_data is NULL !\n");
-		release_console_sem();
+		console_unlock();
 		return 0;
 	}
 
@@ -2159,7 +2159,7 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co
 	if (!vc_cons_allocated(currcons)) {
 	    /* could this happen? */
 		printk_once("con_write: tty %d not allocated\n", currcons+1);
-	    release_console_sem();
+	    console_unlock();
 	    return 0;
 	}
 
@@ -2375,7 +2375,7 @@ rescan_last_byte:
 	}
 	FLUSH
 	console_conditional_schedule();
-	release_console_sem();
+	console_unlock();
 	notify_update(vc);
 	return n;
 #undef FLUSH
@@ -2388,11 +2388,11 @@ rescan_last_byte:
  * us to do the switches asynchronously (needed when we want
  * to switch due to a keyboard interrupt).  Synchronization
  * with other console code and prevention of re-entrancy is
- * ensured with console_sem.
+ * ensured with console_lock.
  */
 static void console_callback(struct work_struct *ignored)
 {
-	acquire_console_sem();
+	console_lock();
 
 	if (want_console >= 0) {
 		if (want_console != fg_console &&
@@ -2422,7 +2422,7 @@ static void console_callback(struct work_struct *ignored)
 	}
 	notify_update(vc_cons[fg_console].d);
 
-	release_console_sem();
+	console_unlock();
 }
 
 int set_console(int nr)
@@ -2603,7 +2603,7 @@ static struct console vt_console_driver = {
  */
 
 /*
- * Generally a bit racy with respect to console_sem().
+ * Generally a bit racy with respect to console_lock();.
  *
  * There are some functions which don't need it.
  *
@@ -2629,17 +2629,17 @@ int tioclinux(struct tty_struct *tty, unsigned long arg)
 	switch (type)
 	{
 		case TIOCL_SETSEL:
-			acquire_console_sem();
+			console_lock();
 			ret = set_selection((struct tiocl_selection __user *)(p+1), tty);
-			release_console_sem();
+			console_unlock();
 			break;
 		case TIOCL_PASTESEL:
 			ret = paste_selection(tty);
 			break;
 		case TIOCL_UNBLANKSCREEN:
-			acquire_console_sem();
+			console_lock();
 			unblank_screen();
-			release_console_sem();
+			console_unlock();
 			break;
 		case TIOCL_SELLOADLUT:
 			ret = sel_loadlut(p);
@@ -2688,10 +2688,10 @@ int tioclinux(struct tty_struct *tty, unsigned long arg)
 			}
 			break;
 		case TIOCL_BLANKSCREEN:	/* until explicitly unblanked, not only poked */
-			acquire_console_sem();
+			console_lock();
 			ignore_poke = 1;
 			do_blank_screen(0);
-			release_console_sem();
+			console_unlock();
 			break;
 		case TIOCL_BLANKEDSCREEN:
 			ret = console_blanked;
@@ -2790,11 +2790,11 @@ static void con_flush_chars(struct tty_struct *tty)
 		return;
 
 	/* if we race with con_close(), vt may be null */
-	acquire_console_sem();
+	console_lock();
 	vc = tty->driver_data;
 	if (vc)
 		set_cursor(vc);
-	release_console_sem();
+	console_unlock();
 }
 
 /*
@@ -2805,7 +2805,7 @@ static int con_open(struct tty_struct *tty, struct file *filp)
 	unsigned int currcons = tty->index;
 	int ret = 0;
 
-	acquire_console_sem();
+	console_lock();
 	if (tty->driver_data == NULL) {
 		ret = vc_allocate(currcons);
 		if (ret == 0) {
@@ -2813,7 +2813,7 @@ static int con_open(struct tty_struct *tty, struct file *filp)
 
 			/* Still being freed */
 			if (vc->port.tty) {
-				release_console_sem();
+				console_unlock();
 				return -ERESTARTSYS;
 			}
 			tty->driver_data = vc;
@@ -2827,11 +2827,11 @@ static int con_open(struct tty_struct *tty, struct file *filp)
 				tty->termios->c_iflag |= IUTF8;
 			else
 				tty->termios->c_iflag &= ~IUTF8;
-			release_console_sem();
+			console_unlock();
 			return ret;
 		}
 	}
-	release_console_sem();
+	console_unlock();
 	return ret;
 }
 
@@ -2844,9 +2844,9 @@ static void con_shutdown(struct tty_struct *tty)
 {
 	struct vc_data *vc = tty->driver_data;
 	BUG_ON(vc == NULL);
-	acquire_console_sem();
+	console_lock();
 	vc->port.tty = NULL;
-	release_console_sem();
+	console_unlock();
 	tty_shutdown(tty);
 }
 
@@ -2893,13 +2893,13 @@ static int __init con_init(void)
 	struct vc_data *vc;
 	unsigned int currcons = 0, i;
 
-	acquire_console_sem();
+	console_lock();
 
 	if (conswitchp)
 		display_desc = conswitchp->con_startup();
 	if (!display_desc) {
 		fg_console = 0;
-		release_console_sem();
+		console_unlock();
 		return 0;
 	}
 
@@ -2946,7 +2946,7 @@ static int __init con_init(void)
 	printable = 1;
 	printk("\n");
 
-	release_console_sem();
+	console_unlock();
 
 #ifdef CONFIG_VT_CONSOLE
 	register_console(&vt_console_driver);
@@ -3037,7 +3037,7 @@ static int bind_con_driver(const struct consw *csw, int first, int last,
 	if (!try_module_get(owner))
 		return -ENODEV;
 
-	acquire_console_sem();
+	console_lock();
 
 	/* check if driver is registered */
 	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
@@ -3122,7 +3122,7 @@ static int bind_con_driver(const struct consw *csw, int first, int last,
 
 	retval = 0;
 err:
-	release_console_sem();
+	console_unlock();
 	module_put(owner);
 	return retval;
 };
@@ -3171,7 +3171,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt)
 	if (!try_module_get(owner))
 		return -ENODEV;
 
-	acquire_console_sem();
+	console_lock();
 
 	/* check if driver is registered and if it is unbindable */
 	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
@@ -3185,7 +3185,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt)
 	}
 
 	if (retval) {
-		release_console_sem();
+		console_unlock();
 		goto err;
 	}
 
@@ -3204,12 +3204,12 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt)
 	}
 
 	if (retval) {
-		release_console_sem();
+		console_unlock();
 		goto err;
 	}
 
 	if (!con_is_bound(csw)) {
-		release_console_sem();
+		console_unlock();
 		goto err;
 	}
 
@@ -3238,7 +3238,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt)
 	if (!con_is_bound(csw))
 		con_driver->flag &= ~CON_DRIVER_FLAG_INIT;
 
-	release_console_sem();
+	console_unlock();
 	/* ignore return value, binding should not fail */
 	bind_con_driver(defcsw, first, last, deflt);
 err:
@@ -3538,7 +3538,7 @@ int register_con_driver(const struct consw *csw, int first, int last)
 	if (!try_module_get(owner))
 		return -ENODEV;
 
-	acquire_console_sem();
+	console_lock();
 
 	for (i = 0; i < MAX_NR_CON_DRIVER; i++) {
 		con_driver = &registered_con_driver[i];
@@ -3592,7 +3592,7 @@ int register_con_driver(const struct consw *csw, int first, int last)
 	}
 
 err:
-	release_console_sem();
+	console_unlock();
 	module_put(owner);
 	return retval;
 }
@@ -3613,7 +3613,7 @@ int unregister_con_driver(const struct consw *csw)
 {
 	int i, retval = -ENODEV;
 
-	acquire_console_sem();
+	console_lock();
 
 	/* cannot unregister a bound driver */
 	if (con_is_bound(csw))
@@ -3639,7 +3639,7 @@ int unregister_con_driver(const struct consw *csw)
 		}
 	}
 err:
-	release_console_sem();
+	console_unlock();
 	return retval;
 }
 EXPORT_SYMBOL(unregister_con_driver);
@@ -3934,9 +3934,9 @@ int con_set_cmap(unsigned char __user *arg)
 {
 	int rc;
 
-	acquire_console_sem();
+	console_lock();
 	rc = set_get_cmap (arg,1);
-	release_console_sem();
+	console_unlock();
 
 	return rc;
 }
@@ -3945,9 +3945,9 @@ int con_get_cmap(unsigned char __user *arg)
 {
 	int rc;
 
-	acquire_console_sem();
+	console_lock();
 	rc = set_get_cmap (arg,0);
-	release_console_sem();
+	console_unlock();
 
 	return rc;
 }
@@ -3994,12 +3994,12 @@ static int con_font_get(struct vc_data *vc, struct console_font_op *op)
 	} else
 		font.data = NULL;
 
-	acquire_console_sem();
+	console_lock();
 	if (vc->vc_sw->con_font_get)
 		rc = vc->vc_sw->con_font_get(vc, &font);
 	else
 		rc = -ENOSYS;
-	release_console_sem();
+	console_unlock();
 
 	if (rc)
 		goto out;
@@ -4076,12 +4076,12 @@ static int con_font_set(struct vc_data *vc, struct console_font_op *op)
 	font.data = memdup_user(op->data, size);
 	if (IS_ERR(font.data))
 		return PTR_ERR(font.data);
-	acquire_console_sem();
+	console_lock();
 	if (vc->vc_sw->con_font_set)
 		rc = vc->vc_sw->con_font_set(vc, &font, op->flags);
 	else
 		rc = -ENOSYS;
-	release_console_sem();
+	console_unlock();
 	kfree(font.data);
 	return rc;
 }
@@ -4103,12 +4103,12 @@ static int con_font_default(struct vc_data *vc, struct console_font_op *op)
 	else
 		name[MAX_FONT_NAME - 1] = 0;
 
-	acquire_console_sem();
+	console_lock();
 	if (vc->vc_sw->con_font_default)
 		rc = vc->vc_sw->con_font_default(vc, &font, s);
 	else
 		rc = -ENOSYS;
-	release_console_sem();
+	console_unlock();
 	if (!rc) {
 		op->width = font.width;
 		op->height = font.height;
@@ -4124,7 +4124,7 @@ static int con_font_copy(struct vc_data *vc, struct console_font_op *op)
 	if (vc->vc_mode != KD_TEXT)
 		return -EINVAL;
 
-	acquire_console_sem();
+	console_lock();
 	if (!vc->vc_sw->con_font_copy)
 		rc = -ENOSYS;
 	else if (con < 0 || !vc_cons_allocated(con))
@@ -4133,7 +4133,7 @@ static int con_font_copy(struct vc_data *vc, struct console_font_op *op)
 		rc = 0;
 	else
 		rc = vc->vc_sw->con_font_copy(vc, con);
-	release_console_sem();
+	console_unlock();
 	return rc;
 }
 
diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c
index 6b68a0fb4611..1235ebda6e1c 100644
--- a/drivers/tty/vt/vt_ioctl.c
+++ b/drivers/tty/vt/vt_ioctl.c
@@ -649,12 +649,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		/*
 		 * explicitly blank/unblank the screen if switching modes
 		 */
-		acquire_console_sem();
+		console_lock();
 		if (arg == KD_TEXT)
 			do_unblank_screen(1);
 		else
 			do_blank_screen(1);
-		release_console_sem();
+		console_unlock();
 		break;
 
 	case KDGETMODE:
@@ -893,7 +893,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 			ret = -EINVAL;
 			goto out;
 		}
-		acquire_console_sem();
+		console_lock();
 		vc->vt_mode = tmp;
 		/* the frsig is ignored, so we set it to 0 */
 		vc->vt_mode.frsig = 0;
@@ -901,7 +901,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		vc->vt_pid = get_pid(task_pid(current));
 		/* no switch is required -- saw@shade.msu.ru */
 		vc->vt_newvt = -1;
-		release_console_sem();
+		console_unlock();
 		break;
 	}
 
@@ -910,9 +910,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		struct vt_mode tmp;
 		int rc;
 
-		acquire_console_sem();
+		console_lock();
 		memcpy(&tmp, &vc->vt_mode, sizeof(struct vt_mode));
-		release_console_sem();
+		console_unlock();
 
 		rc = copy_to_user(up, &tmp, sizeof(struct vt_mode));
 		if (rc)
@@ -965,9 +965,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 			ret =  -ENXIO;
 		else {
 			arg--;
-			acquire_console_sem();
+			console_lock();
 			ret = vc_allocate(arg);
-			release_console_sem();
+			console_unlock();
 			if (ret)
 				break;
 			set_console(arg);
@@ -990,7 +990,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 			ret = -ENXIO;
 		else {
 			vsa.console--;
-			acquire_console_sem();
+			console_lock();
 			ret = vc_allocate(vsa.console);
 			if (ret == 0) {
 				struct vc_data *nvc;
@@ -1003,7 +1003,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 				put_pid(nvc->vt_pid);
 				nvc->vt_pid = get_pid(task_pid(current));
 			}
-			release_console_sem();
+			console_unlock();
 			if (ret)
 				break;
 			/* Commence switch and lock */
@@ -1044,7 +1044,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		/*
 		 * Switching-from response
 		 */
-		acquire_console_sem();
+		console_lock();
 		if (vc->vt_newvt >= 0) {
 			if (arg == 0)
 				/*
@@ -1063,7 +1063,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 				vc->vt_newvt = -1;
 				ret = vc_allocate(newvt);
 				if (ret) {
-					release_console_sem();
+					console_unlock();
 					break;
 				}
 				/*
@@ -1083,7 +1083,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 			if (arg != VT_ACKACQ)
 				ret = -EINVAL;
 		}
-		release_console_sem();
+		console_unlock();
 		break;
 
 	 /*
@@ -1096,20 +1096,20 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		}
 		if (arg == 0) {
 		    /* deallocate all unused consoles, but leave 0 */
-			acquire_console_sem();
+			console_lock();
 			for (i=1; i<MAX_NR_CONSOLES; i++)
 				if (! VT_BUSY(i))
 					vc_deallocate(i);
-			release_console_sem();
+			console_unlock();
 		} else {
 			/* deallocate a single console, if possible */
 			arg--;
 			if (VT_BUSY(arg))
 				ret = -EBUSY;
 			else if (arg) {			      /* leave 0 */
-				acquire_console_sem();
+				console_lock();
 				vc_deallocate(arg);
-				release_console_sem();
+				console_unlock();
 			}
 		}
 		break;
@@ -1126,7 +1126,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		    get_user(cc, &vtsizes->v_cols))
 			ret = -EFAULT;
 		else {
-			acquire_console_sem();
+			console_lock();
 			for (i = 0; i < MAX_NR_CONSOLES; i++) {
 				vc = vc_cons[i].d;
 
@@ -1135,7 +1135,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 					vc_resize(vc_cons[i].d, cc, ll);
 				}
 			}
-			release_console_sem();
+			console_unlock();
 		}
 		break;
 	}
@@ -1187,14 +1187,14 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 		for (i = 0; i < MAX_NR_CONSOLES; i++) {
 			if (!vc_cons[i].d)
 				continue;
-			acquire_console_sem();
+			console_lock();
 			if (vlin)
 				vc_cons[i].d->vc_scan_lines = vlin;
 			if (clin)
 				vc_cons[i].d->vc_font.height = clin;
 			vc_cons[i].d->vc_resize_user = 1;
 			vc_resize(vc_cons[i].d, cc, ll);
-			release_console_sem();
+			console_unlock();
 		}
 		break;
 	}
@@ -1367,7 +1367,7 @@ void vc_SAK(struct work_struct *work)
 	struct vc_data *vc;
 	struct tty_struct *tty;
 
-	acquire_console_sem();
+	console_lock();
 	vc = vc_con->d;
 	if (vc) {
 		tty = vc->port.tty;
@@ -1379,7 +1379,7 @@ void vc_SAK(struct work_struct *work)
 			__do_SAK(tty);
 		reset_vc(vc);
 	}
-	release_console_sem();
+	console_unlock();
 }
 
 #ifdef CONFIG_COMPAT
@@ -1737,10 +1737,10 @@ int vt_move_to_console(unsigned int vt, int alloc)
 {
 	int prev;
 
-	acquire_console_sem();
+	console_lock();
 	/* Graphics mode - up to X */
 	if (disable_vt_switch) {
-		release_console_sem();
+		console_unlock();
 		return 0;
 	}
 	prev = fg_console;
@@ -1748,7 +1748,7 @@ int vt_move_to_console(unsigned int vt, int alloc)
 	if (alloc && vc_allocate(vt)) {
 		/* we can't have a free VC for now. Too bad,
 		 * we don't want to mess the screen for now. */
-		release_console_sem();
+		console_unlock();
 		return -ENOSPC;
 	}
 
@@ -1758,10 +1758,10 @@ int vt_move_to_console(unsigned int vt, int alloc)
 		 * Let the calling function know so it can decide
 		 * what to do.
 		 */
-		release_console_sem();
+		console_unlock();
 		return -EIO;
 	}
-	release_console_sem();
+	console_unlock();
 	tty_lock();
 	if (vt_waitactive(vt + 1)) {
 		pr_debug("Suspend: Can't switch VCs.");
@@ -1781,8 +1781,8 @@ int vt_move_to_console(unsigned int vt, int alloc)
  */
 void pm_set_vt_switch(int do_switch)
 {
-	acquire_console_sem();
+	console_lock();
 	disable_vt_switch = !do_switch;
-	release_console_sem();
+	console_unlock();
 }
 EXPORT_SYMBOL(pm_set_vt_switch);
diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c
index d583bea608fd..391ac939f011 100644
--- a/drivers/video/arkfb.c
+++ b/drivers/video/arkfb.c
@@ -23,7 +23,7 @@
 #include <linux/svga.h>
 #include <linux/init.h>
 #include <linux/pci.h>
-#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_sem() */
+#include <linux/console.h> /* Why should fb driver call console functions? because console_lock() */
 #include <video/vga.h>
 
 #ifdef CONFIG_MTRR
@@ -1091,12 +1091,12 @@ static int ark_pci_suspend (struct pci_dev* dev, pm_message_t state)
 
 	dev_info(info->device, "suspend\n");
 
-	acquire_console_sem();
+	console_lock();
 	mutex_lock(&(par->open_lock));
 
 	if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) {
 		mutex_unlock(&(par->open_lock));
-		release_console_sem();
+		console_unlock();
 		return 0;
 	}
 
@@ -1107,7 +1107,7 @@ static int ark_pci_suspend (struct pci_dev* dev, pm_message_t state)
 	pci_set_power_state(dev, pci_choose_state(dev, state));
 
 	mutex_unlock(&(par->open_lock));
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
@@ -1122,7 +1122,7 @@ static int ark_pci_resume (struct pci_dev* dev)
 
 	dev_info(info->device, "resume\n");
 
-	acquire_console_sem();
+	console_lock();
 	mutex_lock(&(par->open_lock));
 
 	if (par->ref_count == 0)
@@ -1141,7 +1141,7 @@ static int ark_pci_resume (struct pci_dev* dev)
 
 fail:
 	mutex_unlock(&(par->open_lock));
-	release_console_sem();
+	console_unlock();
 	return 0;
 }
 #else
diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c
index dd9de2e80580..4cb6a576c567 100644
--- a/drivers/video/aty/aty128fb.c
+++ b/drivers/video/aty/aty128fb.c
@@ -1860,11 +1860,11 @@ static void aty128_early_resume(void *data)
 {
         struct aty128fb_par *par = data;
 
-	if (try_acquire_console_sem())
+	if (!console_trylock())
 		return;
 	pci_restore_state(par->pdev);
 	aty128_do_resume(par->pdev);
-	release_console_sem();
+	console_unlock();
 }
 #endif /* CONFIG_PPC_PMAC */
 
@@ -2438,7 +2438,7 @@ static int aty128_pci_suspend(struct pci_dev *pdev, pm_message_t state)
 
 	printk(KERN_DEBUG "aty128fb: suspending...\n");
 	
-	acquire_console_sem();
+	console_lock();
 
 	fb_set_suspend(info, 1);
 
@@ -2470,7 +2470,7 @@ static int aty128_pci_suspend(struct pci_dev *pdev, pm_message_t state)
 	if (state.event != PM_EVENT_ON)
 		aty128_set_suspend(par, 1);
 
-	release_console_sem();
+	console_unlock();
 
 	pdev->dev.power.power_state = state;
 
@@ -2527,9 +2527,9 @@ static int aty128_pci_resume(struct pci_dev *pdev)
 {
 	int rc;
 
-	acquire_console_sem();
+	console_lock();
 	rc = aty128_do_resume(pdev);
-	release_console_sem();
+	console_unlock();
 
 	return rc;
 }
diff --git a/drivers/video/aty/atyfb_base.c b/drivers/video/aty/atyfb_base.c
index 767ab4fb1a05..94e293fce1d2 100644
--- a/drivers/video/aty/atyfb_base.c
+++ b/drivers/video/aty/atyfb_base.c
@@ -2069,7 +2069,7 @@ static int atyfb_pci_suspend(struct pci_dev *pdev, pm_message_t state)
 	if (state.event == pdev->dev.power.power_state.event)
 		return 0;
 
-	acquire_console_sem();
+	console_lock();
 
 	fb_set_suspend(info, 1);
 
@@ -2097,14 +2097,14 @@ static int atyfb_pci_suspend(struct pci_dev *pdev, pm_message_t state)
 		par->lock_blank = 0;
 		atyfb_blank(FB_BLANK_UNBLANK, info);
 		fb_set_suspend(info, 0);
-		release_console_sem();
+		console_unlock();
 		return -EIO;
 	}
 #else
 	pci_set_power_state(pdev, pci_choose_state(pdev, state));
 #endif
 
-	release_console_sem();
+	console_unlock();
 
 	pdev->dev.power.power_state = state;
 
@@ -2133,7 +2133,7 @@ static int atyfb_pci_resume(struct pci_dev *pdev)
 	if (pdev->dev.power.power_state.event == PM_EVENT_ON)
 		return 0;
 
-	acquire_console_sem();
+	console_lock();
 
 	/*
 	 * PCI state will have been restored by the core, so
@@ -2161,7 +2161,7 @@ static int atyfb_pci_resume(struct pci_dev *pdev)
 	par->lock_blank = 0;
 	atyfb_blank(FB_BLANK_UNBLANK, info);
 
-	release_console_sem();
+	console_unlock();
 
 	pdev->dev.power.power_state = PMSG_ON;
 
diff --git a/drivers/video/aty/radeon_pm.c b/drivers/video/aty/radeon_pm.c
index c4e17642d9c5..92bda5848516 100644
--- a/drivers/video/aty/radeon_pm.c
+++ b/drivers/video/aty/radeon_pm.c
@@ -2626,7 +2626,7 @@ int radeonfb_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
 		goto done;
 	}
 
-	acquire_console_sem();
+	console_lock();
 
 	fb_set_suspend(info, 1);
 
@@ -2690,7 +2690,7 @@ int radeonfb_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
 	if (rinfo->pm_mode & radeon_pm_d2)
 		radeon_set_suspend(rinfo, 1);
 
-	release_console_sem();
+	console_unlock();
 
  done:
 	pdev->dev.power.power_state = mesg;
@@ -2715,10 +2715,10 @@ int radeonfb_pci_resume(struct pci_dev *pdev)
 		return 0;
 
 	if (rinfo->no_schedule) {
-		if (try_acquire_console_sem())
+		if (!console_trylock())
 			return 0;
 	} else
-		acquire_console_sem();
+		console_lock();
 
 	printk(KERN_DEBUG "radeonfb (%s): resuming from state: %d...\n",
 	       pci_name(pdev), pdev->dev.power.power_state.event);
@@ -2783,7 +2783,7 @@ int radeonfb_pci_resume(struct pci_dev *pdev)
 	pdev->dev.power.power_state = PMSG_ON;
 
  bail:
-	release_console_sem();
+	console_unlock();
 
 	return rc;
 }
diff --git a/drivers/video/chipsfb.c b/drivers/video/chipsfb.c
index d637e1f53172..cff742abdc5d 100644
--- a/drivers/video/chipsfb.c
+++ b/drivers/video/chipsfb.c
@@ -460,10 +460,10 @@ static int chipsfb_pci_suspend(struct pci_dev *pdev, pm_message_t state)
 	if (!(state.event & PM_EVENT_SLEEP))
 		goto done;
 
-	acquire_console_sem();
+	console_lock();
 	chipsfb_blank(1, p);
 	fb_set_suspend(p, 1);
-	release_console_sem();
+	console_unlock();
  done:
 	pdev->dev.power.power_state = state;
 	return 0;
@@ -473,10 +473,10 @@ static int chipsfb_pci_resume(struct pci_dev *pdev)
 {
         struct fb_info *p = pci_get_drvdata(pdev);
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(p, 0);
 	chipsfb_blank(0, p);
-	release_console_sem();
+	console_unlock();
 
 	pdev->dev.power.power_state = PMSG_ON;
 	return 0;
diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c
index 7ccc967831f0..9c092b8d64e6 100644
--- a/drivers/video/console/fbcon.c
+++ b/drivers/video/console/fbcon.c
@@ -375,14 +375,14 @@ static void fb_flashcursor(struct work_struct *work)
 	int c;
 	int mode;
 
-	acquire_console_sem();
+	console_lock();
 	if (ops && ops->currcon != -1)
 		vc = vc_cons[ops->currcon].d;
 
 	if (!vc || !CON_IS_VISIBLE(vc) ||
  	    registered_fb[con2fb_map[vc->vc_num]] != info ||
 	    vc->vc_deccm != 1) {
-		release_console_sem();
+		console_unlock();
 		return;
 	}
 
@@ -392,7 +392,7 @@ static void fb_flashcursor(struct work_struct *work)
 		CM_ERASE : CM_DRAW;
 	ops->cursor(vc, info, mode, softback_lines, get_color(vc, info, c, 1),
 		    get_color(vc, info, c, 0));
-	release_console_sem();
+	console_unlock();
 }
 
 static void cursor_timer_handler(unsigned long dev_addr)
@@ -836,7 +836,7 @@ static int set_con2fb_map(int unit, int newidx, int user)
 
 	found = search_fb_in_map(newidx);
 
-	acquire_console_sem();
+	console_lock();
 	con2fb_map[unit] = newidx;
 	if (!err && !found)
  		err = con2fb_acquire_newinfo(vc, info, unit, oldidx);
@@ -863,7 +863,7 @@ static int set_con2fb_map(int unit, int newidx, int user)
 	if (!search_fb_in_map(info_idx))
 		info_idx = newidx;
 
-	release_console_sem();
+	console_unlock();
  	return err;
 }
 
@@ -3321,7 +3321,7 @@ static ssize_t store_rotate(struct device *device,
 	if (fbcon_has_exited)
 		return count;
 
-	acquire_console_sem();
+	console_lock();
 	idx = con2fb_map[fg_console];
 
 	if (idx == -1 || registered_fb[idx] == NULL)
@@ -3331,7 +3331,7 @@ static ssize_t store_rotate(struct device *device,
 	rotate = simple_strtoul(buf, last, 0);
 	fbcon_rotate(info, rotate);
 err:
-	release_console_sem();
+	console_unlock();
 	return count;
 }
 
@@ -3346,7 +3346,7 @@ static ssize_t store_rotate_all(struct device *device,
 	if (fbcon_has_exited)
 		return count;
 
-	acquire_console_sem();
+	console_lock();
 	idx = con2fb_map[fg_console];
 
 	if (idx == -1 || registered_fb[idx] == NULL)
@@ -3356,7 +3356,7 @@ static ssize_t store_rotate_all(struct device *device,
 	rotate = simple_strtoul(buf, last, 0);
 	fbcon_rotate_all(info, rotate);
 err:
-	release_console_sem();
+	console_unlock();
 	return count;
 }
 
@@ -3369,7 +3369,7 @@ static ssize_t show_rotate(struct device *device,
 	if (fbcon_has_exited)
 		return 0;
 
-	acquire_console_sem();
+	console_lock();
 	idx = con2fb_map[fg_console];
 
 	if (idx == -1 || registered_fb[idx] == NULL)
@@ -3378,7 +3378,7 @@ static ssize_t show_rotate(struct device *device,
 	info = registered_fb[idx];
 	rotate = fbcon_get_rotate(info);
 err:
-	release_console_sem();
+	console_unlock();
 	return snprintf(buf, PAGE_SIZE, "%d\n", rotate);
 }
 
@@ -3392,7 +3392,7 @@ static ssize_t show_cursor_blink(struct device *device,
 	if (fbcon_has_exited)
 		return 0;
 
-	acquire_console_sem();
+	console_lock();
 	idx = con2fb_map[fg_console];
 
 	if (idx == -1 || registered_fb[idx] == NULL)
@@ -3406,7 +3406,7 @@ static ssize_t show_cursor_blink(struct device *device,
 
 	blink = (ops->flags & FBCON_FLAGS_CURSOR_TIMER) ? 1 : 0;
 err:
-	release_console_sem();
+	console_unlock();
 	return snprintf(buf, PAGE_SIZE, "%d\n", blink);
 }
 
@@ -3421,7 +3421,7 @@ static ssize_t store_cursor_blink(struct device *device,
 	if (fbcon_has_exited)
 		return count;
 
-	acquire_console_sem();
+	console_lock();
 	idx = con2fb_map[fg_console];
 
 	if (idx == -1 || registered_fb[idx] == NULL)
@@ -3443,7 +3443,7 @@ static ssize_t store_cursor_blink(struct device *device,
 	}
 
 err:
-	release_console_sem();
+	console_unlock();
 	return count;
 }
 
@@ -3482,7 +3482,7 @@ static void fbcon_start(void)
 	if (num_registered_fb) {
 		int i;
 
-		acquire_console_sem();
+		console_lock();
 
 		for (i = 0; i < FB_MAX; i++) {
 			if (registered_fb[i] != NULL) {
@@ -3491,7 +3491,7 @@ static void fbcon_start(void)
 			}
 		}
 
-		release_console_sem();
+		console_unlock();
 		fbcon_takeover(0);
 	}
 }
@@ -3552,7 +3552,7 @@ static int __init fb_console_init(void)
 {
 	int i;
 
-	acquire_console_sem();
+	console_lock();
 	fb_register_client(&fbcon_event_notifier);
 	fbcon_device = device_create(fb_class, NULL, MKDEV(0, 0), NULL,
 				     "fbcon");
@@ -3568,7 +3568,7 @@ static int __init fb_console_init(void)
 	for (i = 0; i < MAX_NR_CONSOLES; i++)
 		con2fb_map[i] = -1;
 
-	release_console_sem();
+	console_unlock();
 	fbcon_start();
 	return 0;
 }
@@ -3591,12 +3591,12 @@ static void __exit fbcon_deinit_device(void)
 
 static void __exit fb_console_exit(void)
 {
-	acquire_console_sem();
+	console_lock();
 	fb_unregister_client(&fbcon_event_notifier);
 	fbcon_deinit_device();
 	device_destroy(fb_class, MKDEV(0, 0));
 	fbcon_exit();
-	release_console_sem();
+	console_unlock();
 	unregister_con_driver(&fb_con);
 }	
 
diff --git a/drivers/video/da8xx-fb.c b/drivers/video/da8xx-fb.c
index 520047ac6e3e..8d61ef96eedd 100644
--- a/drivers/video/da8xx-fb.c
+++ b/drivers/video/da8xx-fb.c
@@ -1131,14 +1131,14 @@ static int fb_suspend(struct platform_device *dev, pm_message_t state)
 	struct fb_info *info = platform_get_drvdata(dev);
 	struct da8xx_fb_par *par = info->par;
 
-	acquire_console_sem();
+	console_lock();
 	if (par->panel_power_ctrl)
 		par->panel_power_ctrl(0);
 
 	fb_set_suspend(info, 1);
 	lcd_disable_raster();
 	clk_disable(par->lcdc_clk);
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
@@ -1147,14 +1147,14 @@ static int fb_resume(struct platform_device *dev)
 	struct fb_info *info = platform_get_drvdata(dev);
 	struct da8xx_fb_par *par = info->par;
 
-	acquire_console_sem();
+	console_lock();
 	if (par->panel_power_ctrl)
 		par->panel_power_ctrl(1);
 
 	clk_enable(par->lcdc_clk);
 	lcd_enable_raster();
 	fb_set_suspend(info, 0);
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c
index 4ac1201ad6c2..e2bf95370e40 100644
--- a/drivers/video/fbmem.c
+++ b/drivers/video/fbmem.c
@@ -1036,11 +1036,11 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 			return -EFAULT;
 		if (!lock_fb_info(info))
 			return -ENODEV;
-		acquire_console_sem();
+		console_lock();
 		info->flags |= FBINFO_MISC_USEREVENT;
 		ret = fb_set_var(info, &var);
 		info->flags &= ~FBINFO_MISC_USEREVENT;
-		release_console_sem();
+		console_unlock();
 		unlock_fb_info(info);
 		if (!ret && copy_to_user(argp, &var, sizeof(var)))
 			ret = -EFAULT;
@@ -1072,9 +1072,9 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 			return -EFAULT;
 		if (!lock_fb_info(info))
 			return -ENODEV;
-		acquire_console_sem();
+		console_lock();
 		ret = fb_pan_display(info, &var);
-		release_console_sem();
+		console_unlock();
 		unlock_fb_info(info);
 		if (ret == 0 && copy_to_user(argp, &var, sizeof(var)))
 			return -EFAULT;
@@ -1119,11 +1119,11 @@ static long do_fb_ioctl(struct fb_info *info, unsigned int cmd,
 	case FBIOBLANK:
 		if (!lock_fb_info(info))
 			return -ENODEV;
-		acquire_console_sem();
+		console_lock();
 		info->flags |= FBINFO_MISC_USEREVENT;
 		ret = fb_blank(info, arg);
 		info->flags &= ~FBINFO_MISC_USEREVENT;
-		release_console_sem();
+		console_unlock();
 		unlock_fb_info(info);
 		break;
 	default:
diff --git a/drivers/video/fbsysfs.c b/drivers/video/fbsysfs.c
index 0a08f1341227..f4a32779168b 100644
--- a/drivers/video/fbsysfs.c
+++ b/drivers/video/fbsysfs.c
@@ -90,11 +90,11 @@ static int activate(struct fb_info *fb_info, struct fb_var_screeninfo *var)
 	int err;
 
 	var->activate |= FB_ACTIVATE_FORCE;
-	acquire_console_sem();
+	console_lock();
 	fb_info->flags |= FBINFO_MISC_USEREVENT;
 	err = fb_set_var(fb_info, var);
 	fb_info->flags &= ~FBINFO_MISC_USEREVENT;
-	release_console_sem();
+	console_unlock();
 	if (err)
 		return err;
 	return 0;
@@ -175,7 +175,7 @@ static ssize_t store_modes(struct device *device,
 	if (i * sizeof(struct fb_videomode) != count)
 		return -EINVAL;
 
-	acquire_console_sem();
+	console_lock();
 	list_splice(&fb_info->modelist, &old_list);
 	fb_videomode_to_modelist((const struct fb_videomode *)buf, i,
 				 &fb_info->modelist);
@@ -185,7 +185,7 @@ static ssize_t store_modes(struct device *device,
 	} else
 		fb_destroy_modelist(&old_list);
 
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
@@ -301,11 +301,11 @@ static ssize_t store_blank(struct device *device,
 	char *last = NULL;
 	int err;
 
-	acquire_console_sem();
+	console_lock();
 	fb_info->flags |= FBINFO_MISC_USEREVENT;
 	err = fb_blank(fb_info, simple_strtoul(buf, &last, 0));
 	fb_info->flags &= ~FBINFO_MISC_USEREVENT;
-	release_console_sem();
+	console_unlock();
 	if (err < 0)
 		return err;
 	return count;
@@ -364,9 +364,9 @@ static ssize_t store_pan(struct device *device,
 		return -EINVAL;
 	var.yoffset = simple_strtoul(last, &last, 0);
 
-	acquire_console_sem();
+	console_lock();
 	err = fb_pan_display(fb_info, &var);
-	release_console_sem();
+	console_unlock();
 
 	if (err < 0)
 		return err;
@@ -399,9 +399,9 @@ static ssize_t store_fbstate(struct device *device,
 
 	state = simple_strtoul(buf, &last, 0);
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(fb_info, (int)state);
-	release_console_sem();
+	console_unlock();
 
 	return count;
 }
diff --git a/drivers/video/geode/gxfb_core.c b/drivers/video/geode/gxfb_core.c
index 70b1d9d51c96..b4f19db9bb54 100644
--- a/drivers/video/geode/gxfb_core.c
+++ b/drivers/video/geode/gxfb_core.c
@@ -344,10 +344,10 @@ static int gxfb_suspend(struct pci_dev *pdev, pm_message_t state)
 	struct fb_info *info = pci_get_drvdata(pdev);
 
 	if (state.event == PM_EVENT_SUSPEND) {
-		acquire_console_sem();
+		console_lock();
 		gx_powerdown(info);
 		fb_set_suspend(info, 1);
-		release_console_sem();
+		console_unlock();
 	}
 
 	/* there's no point in setting PCI states; we emulate PCI, so
@@ -361,7 +361,7 @@ static int gxfb_resume(struct pci_dev *pdev)
 	struct fb_info *info = pci_get_drvdata(pdev);
 	int ret;
 
-	acquire_console_sem();
+	console_lock();
 	ret = gx_powerup(info);
 	if (ret) {
 		printk(KERN_ERR "gxfb:  power up failed!\n");
@@ -369,7 +369,7 @@ static int gxfb_resume(struct pci_dev *pdev)
 	}
 
 	fb_set_suspend(info, 0);
-	release_console_sem();
+	console_unlock();
 	return 0;
 }
 #endif
diff --git a/drivers/video/geode/lxfb_core.c b/drivers/video/geode/lxfb_core.c
index 39bdbedf43b4..416851ca8754 100644
--- a/drivers/video/geode/lxfb_core.c
+++ b/drivers/video/geode/lxfb_core.c
@@ -465,10 +465,10 @@ static int lxfb_suspend(struct pci_dev *pdev, pm_message_t state)
 	struct fb_info *info = pci_get_drvdata(pdev);
 
 	if (state.event == PM_EVENT_SUSPEND) {
-		acquire_console_sem();
+		console_lock();
 		lx_powerdown(info);
 		fb_set_suspend(info, 1);
-		release_console_sem();
+		console_unlock();
 	}
 
 	/* there's no point in setting PCI states; we emulate PCI, so
@@ -482,7 +482,7 @@ static int lxfb_resume(struct pci_dev *pdev)
 	struct fb_info *info = pci_get_drvdata(pdev);
 	int ret;
 
-	acquire_console_sem();
+	console_lock();
 	ret = lx_powerup(info);
 	if (ret) {
 		printk(KERN_ERR "lxfb:  power up failed!\n");
@@ -490,7 +490,7 @@ static int lxfb_resume(struct pci_dev *pdev)
 	}
 
 	fb_set_suspend(info, 0);
-	release_console_sem();
+	console_unlock();
 	return 0;
 }
 #else
diff --git a/drivers/video/i810/i810_main.c b/drivers/video/i810/i810_main.c
index 5743ea25e818..318f6fb895b2 100644
--- a/drivers/video/i810/i810_main.c
+++ b/drivers/video/i810/i810_main.c
@@ -1574,7 +1574,7 @@ static int i810fb_suspend(struct pci_dev *dev, pm_message_t mesg)
 		return 0;
 	}
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(info, 1);
 
 	if (info->fbops->fb_sync)
@@ -1587,7 +1587,7 @@ static int i810fb_suspend(struct pci_dev *dev, pm_message_t mesg)
 	pci_save_state(dev);
 	pci_disable_device(dev);
 	pci_set_power_state(dev, pci_choose_state(dev, mesg));
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
@@ -1605,7 +1605,7 @@ static int i810fb_resume(struct pci_dev *dev)
 		return 0;
 	}
 
-	acquire_console_sem();
+	console_lock();
 	pci_set_power_state(dev, PCI_D0);
 	pci_restore_state(dev);
 
@@ -1621,7 +1621,7 @@ static int i810fb_resume(struct pci_dev *dev)
 	fb_set_suspend (info, 0);
 	info->fbops->fb_blank(VESA_NO_BLANKING, info);
 fail:
-	release_console_sem();
+	console_unlock();
 	return 0;
 }
 /***********************************************************************
diff --git a/drivers/video/jz4740_fb.c b/drivers/video/jz4740_fb.c
index 670ecaa0385a..de366937c933 100644
--- a/drivers/video/jz4740_fb.c
+++ b/drivers/video/jz4740_fb.c
@@ -778,9 +778,9 @@ static int jzfb_suspend(struct device *dev)
 {
 	struct jzfb *jzfb = dev_get_drvdata(dev);
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(jzfb->fb, 1);
-	release_console_sem();
+	console_unlock();
 
 	mutex_lock(&jzfb->lock);
 	if (jzfb->is_enabled)
@@ -800,9 +800,9 @@ static int jzfb_resume(struct device *dev)
 		jzfb_enable(jzfb);
 	mutex_unlock(&jzfb->lock);
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(jzfb->fb, 0);
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
diff --git a/drivers/video/mx3fb.c b/drivers/video/mx3fb.c
index cb013919e9ce..7e3a490e8d76 100644
--- a/drivers/video/mx3fb.c
+++ b/drivers/video/mx3fb.c
@@ -1177,9 +1177,9 @@ static int mx3fb_suspend(struct platform_device *pdev, pm_message_t state)
 	struct mx3fb_data *mx3fb = platform_get_drvdata(pdev);
 	struct mx3fb_info *mx3_fbi = mx3fb->fbi->par;
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(mx3fb->fbi, 1);
-	release_console_sem();
+	console_unlock();
 
 	if (mx3_fbi->blank == FB_BLANK_UNBLANK) {
 		sdc_disable_channel(mx3_fbi);
@@ -1202,9 +1202,9 @@ static int mx3fb_resume(struct platform_device *pdev)
 		sdc_set_brightness(mx3fb, mx3fb->backlight_level);
 	}
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(mx3fb->fbi, 0);
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
diff --git a/drivers/video/nvidia/nvidia.c b/drivers/video/nvidia/nvidia.c
index efe10ff86d63..081dc4745274 100644
--- a/drivers/video/nvidia/nvidia.c
+++ b/drivers/video/nvidia/nvidia.c
@@ -1057,7 +1057,7 @@ static int nvidiafb_suspend(struct pci_dev *dev, pm_message_t mesg)
 
 	if (mesg.event == PM_EVENT_PRETHAW)
 		mesg.event = PM_EVENT_FREEZE;
-	acquire_console_sem();
+	console_lock();
 	par->pm_state = mesg.event;
 
 	if (mesg.event & PM_EVENT_SLEEP) {
@@ -1070,7 +1070,7 @@ static int nvidiafb_suspend(struct pci_dev *dev, pm_message_t mesg)
 	}
 	dev->dev.power.power_state = mesg;
 
-	release_console_sem();
+	console_unlock();
 	return 0;
 }
 
@@ -1079,7 +1079,7 @@ static int nvidiafb_resume(struct pci_dev *dev)
 	struct fb_info *info = pci_get_drvdata(dev);
 	struct nvidia_par *par = info->par;
 
-	acquire_console_sem();
+	console_lock();
 	pci_set_power_state(dev, PCI_D0);
 
 	if (par->pm_state != PM_EVENT_FREEZE) {
@@ -1097,7 +1097,7 @@ static int nvidiafb_resume(struct pci_dev *dev)
 	nvidiafb_blank(FB_BLANK_UNBLANK, info);
 
 fail:
-	release_console_sem();
+	console_unlock();
 	return 0;
 }
 #else
diff --git a/drivers/video/ps3fb.c b/drivers/video/ps3fb.c
index 9c0144ee7ae5..65560a1a0439 100644
--- a/drivers/video/ps3fb.c
+++ b/drivers/video/ps3fb.c
@@ -513,9 +513,9 @@ static int ps3fb_release(struct fb_info *info, int user)
 	if (atomic_dec_and_test(&ps3fb.f_count)) {
 		if (atomic_read(&ps3fb.ext_flip)) {
 			atomic_set(&ps3fb.ext_flip, 0);
-			if (!try_acquire_console_sem()) {
+			if (console_trylock()) {
 				ps3fb_sync(info, 0);	/* single buffer */
-				release_console_sem();
+				console_unlock();
 			}
 		}
 	}
@@ -830,14 +830,14 @@ static int ps3fb_ioctl(struct fb_info *info, unsigned int cmd,
 			if (vmode) {
 				var = info->var;
 				fb_videomode_to_var(&var, vmode);
-				acquire_console_sem();
+				console_lock();
 				info->flags |= FBINFO_MISC_USEREVENT;
 				/* Force, in case only special bits changed */
 				var.activate |= FB_ACTIVATE_FORCE;
 				par->new_mode_id = val;
 				retval = fb_set_var(info, &var);
 				info->flags &= ~FBINFO_MISC_USEREVENT;
-				release_console_sem();
+				console_unlock();
 			}
 			break;
 		}
@@ -881,9 +881,9 @@ static int ps3fb_ioctl(struct fb_info *info, unsigned int cmd,
 			break;
 
 		dev_dbg(info->device, "PS3FB_IOCTL_FSEL:%d\n", val);
-		acquire_console_sem();
+		console_lock();
 		retval = ps3fb_sync(info, val);
-		release_console_sem();
+		console_unlock();
 		break;
 
 	default:
@@ -903,9 +903,9 @@ static int ps3fbd(void *arg)
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (ps3fb.is_kicked) {
 			ps3fb.is_kicked = 0;
-			acquire_console_sem();
+			console_lock();
 			ps3fb_sync(info, 0);	/* single buffer */
-			release_console_sem();
+			console_unlock();
 		}
 		schedule();
 	}
diff --git a/drivers/video/s3fb.c b/drivers/video/s3fb.c
index dce8c97b4333..75738a928610 100644
--- a/drivers/video/s3fb.c
+++ b/drivers/video/s3fb.c
@@ -22,7 +22,7 @@
 #include <linux/svga.h>
 #include <linux/init.h>
 #include <linux/pci.h>
-#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_sem() */
+#include <linux/console.h> /* Why should fb driver call console functions? because console_lock() */
 #include <video/vga.h>
 
 #ifdef CONFIG_MTRR
@@ -1113,12 +1113,12 @@ static int s3_pci_suspend(struct pci_dev* dev, pm_message_t state)
 
 	dev_info(info->device, "suspend\n");
 
-	acquire_console_sem();
+	console_lock();
 	mutex_lock(&(par->open_lock));
 
 	if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) {
 		mutex_unlock(&(par->open_lock));
-		release_console_sem();
+		console_unlock();
 		return 0;
 	}
 
@@ -1129,7 +1129,7 @@ static int s3_pci_suspend(struct pci_dev* dev, pm_message_t state)
 	pci_set_power_state(dev, pci_choose_state(dev, state));
 
 	mutex_unlock(&(par->open_lock));
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
@@ -1145,12 +1145,12 @@ static int s3_pci_resume(struct pci_dev* dev)
 
 	dev_info(info->device, "resume\n");
 
-	acquire_console_sem();
+	console_lock();
 	mutex_lock(&(par->open_lock));
 
 	if (par->ref_count == 0) {
 		mutex_unlock(&(par->open_lock));
-		release_console_sem();
+		console_unlock();
 		return 0;
 	}
 
@@ -1159,7 +1159,7 @@ static int s3_pci_resume(struct pci_dev* dev)
 	err = pci_enable_device(dev);
 	if (err) {
 		mutex_unlock(&(par->open_lock));
-		release_console_sem();
+		console_unlock();
 		dev_err(info->device, "error %d enabling device for resume\n", err);
 		return err;
 	}
@@ -1169,7 +1169,7 @@ static int s3_pci_resume(struct pci_dev* dev)
 	fb_set_suspend(info, 0);
 
 	mutex_unlock(&(par->open_lock));
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
diff --git a/drivers/video/savage/savagefb_driver.c b/drivers/video/savage/savagefb_driver.c
index 842d157e1025..487911e2926c 100644
--- a/drivers/video/savage/savagefb_driver.c
+++ b/drivers/video/savage/savagefb_driver.c
@@ -2373,7 +2373,7 @@ static int savagefb_suspend(struct pci_dev *dev, pm_message_t mesg)
 	if (mesg.event == PM_EVENT_FREEZE)
 		return 0;
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(info, 1);
 
 	if (info->fbops->fb_sync)
@@ -2385,7 +2385,7 @@ static int savagefb_suspend(struct pci_dev *dev, pm_message_t mesg)
 	pci_save_state(dev);
 	pci_disable_device(dev);
 	pci_set_power_state(dev, pci_choose_state(dev, mesg));
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
@@ -2409,7 +2409,7 @@ static int savagefb_resume(struct pci_dev* dev)
 		return 0;
 	}
 
-	acquire_console_sem();
+	console_lock();
 
 	pci_set_power_state(dev, PCI_D0);
 	pci_restore_state(dev);
@@ -2423,7 +2423,7 @@ static int savagefb_resume(struct pci_dev* dev)
 	savagefb_set_par(info);
 	fb_set_suspend(info, 0);
 	savagefb_blank(FB_BLANK_UNBLANK, info);
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
diff --git a/drivers/video/sh_mobile_hdmi.c b/drivers/video/sh_mobile_hdmi.c
index 74d9f546a2e8..2b9e56a6bde4 100644
--- a/drivers/video/sh_mobile_hdmi.c
+++ b/drivers/video/sh_mobile_hdmi.c
@@ -1151,7 +1151,7 @@ static void sh_hdmi_edid_work_fn(struct work_struct *work)
 
 		ch = info->par;
 
-		acquire_console_sem();
+		console_lock();
 
 		/* HDMI plug in */
 		if (!sh_hdmi_must_reconfigure(hdmi) &&
@@ -1171,7 +1171,7 @@ static void sh_hdmi_edid_work_fn(struct work_struct *work)
 			fb_set_suspend(info, 0);
 		}
 
-		release_console_sem();
+		console_unlock();
 	} else {
 		ret = 0;
 		if (!hdmi->info)
@@ -1181,12 +1181,12 @@ static void sh_hdmi_edid_work_fn(struct work_struct *work)
 		fb_destroy_modedb(hdmi->monspec.modedb);
 		hdmi->monspec.modedb = NULL;
 
-		acquire_console_sem();
+		console_lock();
 
 		/* HDMI disconnect */
 		fb_set_suspend(hdmi->info, 1);
 
-		release_console_sem();
+		console_unlock();
 		pm_runtime_put(hdmi->dev);
 	}
 
diff --git a/drivers/video/sh_mobile_lcdcfb.c b/drivers/video/sh_mobile_lcdcfb.c
index bd4840a8a6b7..bf12e53aed5c 100644
--- a/drivers/video/sh_mobile_lcdcfb.c
+++ b/drivers/video/sh_mobile_lcdcfb.c
@@ -912,9 +912,9 @@ static int sh_mobile_release(struct fb_info *info, int user)
 
 	/* Nothing to reconfigure, when called from fbcon */
 	if (user) {
-		acquire_console_sem();
+		console_lock();
 		sh_mobile_fb_reconfig(info);
-		release_console_sem();
+		console_unlock();
 	}
 
 	mutex_unlock(&ch->open_lock);
diff --git a/drivers/video/sm501fb.c b/drivers/video/sm501fb.c
index b7dc1800efa9..bcb44a594ebc 100644
--- a/drivers/video/sm501fb.c
+++ b/drivers/video/sm501fb.c
@@ -2010,9 +2010,9 @@ static int sm501fb_suspend_fb(struct sm501fb_info *info,
 
 	/* tell console/fb driver we are suspending */
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(fbi, 1);
-	release_console_sem();
+	console_unlock();
 
 	/* backup copies in case chip is powered down over suspend */
 
@@ -2069,9 +2069,9 @@ static void sm501fb_resume_fb(struct sm501fb_info *info,
 		memcpy_toio(par->cursor.k_addr, par->store_cursor,
 			    par->cursor.size);
 
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(fbi, 0);
-	release_console_sem();
+	console_unlock();
 
 	vfree(par->store_fb);
 	vfree(par->store_cursor);
diff --git a/drivers/video/tmiofb.c b/drivers/video/tmiofb.c
index 6913fe168c25..dfef88c803d4 100644
--- a/drivers/video/tmiofb.c
+++ b/drivers/video/tmiofb.c
@@ -25,7 +25,7 @@
 #include <linux/fb.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
-/* Why should fb driver call console functions? because acquire_console_sem() */
+/* Why should fb driver call console functions? because console_lock() */
 #include <linux/console.h>
 #include <linux/mfd/core.h>
 #include <linux/mfd/tmio.h>
@@ -944,7 +944,7 @@ static int tmiofb_suspend(struct platform_device *dev, pm_message_t state)
 	struct mfd_cell *cell = dev->dev.platform_data;
 	int retval = 0;
 
-	acquire_console_sem();
+	console_lock();
 
 	fb_set_suspend(info, 1);
 
@@ -965,7 +965,7 @@ static int tmiofb_suspend(struct platform_device *dev, pm_message_t state)
 	if (cell->suspend)
 		retval = cell->suspend(dev);
 
-	release_console_sem();
+	console_unlock();
 
 	return retval;
 }
@@ -976,7 +976,7 @@ static int tmiofb_resume(struct platform_device *dev)
 	struct mfd_cell *cell = dev->dev.platform_data;
 	int retval = 0;
 
-	acquire_console_sem();
+	console_lock();
 
 	if (cell->resume) {
 		retval = cell->resume(dev);
@@ -992,7 +992,7 @@ static int tmiofb_resume(struct platform_device *dev)
 
 	fb_set_suspend(info, 0);
 out:
-	release_console_sem();
+	console_unlock();
 	return retval;
 }
 #else
diff --git a/drivers/video/via/viafbdev.c b/drivers/video/via/viafbdev.c
index 289edd519527..4e66349e4366 100644
--- a/drivers/video/via/viafbdev.c
+++ b/drivers/video/via/viafbdev.c
@@ -1674,17 +1674,17 @@ static int parse_mode(const char *str, u32 *xres, u32 *yres)
 #ifdef CONFIG_PM
 static int viafb_suspend(void *unused)
 {
-	acquire_console_sem();
+	console_lock();
 	fb_set_suspend(viafbinfo, 1);
 	viafb_sync(viafbinfo);
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
 
 static int viafb_resume(void *unused)
 {
-	acquire_console_sem();
+	console_lock();
 	if (viaparinfo->shared->vdev->engine_mmio)
 		viafb_reset_engine(viaparinfo);
 	viafb_set_par(viafbinfo);
@@ -1692,7 +1692,7 @@ static int viafb_resume(void *unused)
 		viafb_set_par(viafbinfo1);
 	fb_set_suspend(viafbinfo, 0);
 
-	release_console_sem();
+	console_unlock();
 	return 0;
 }
 
diff --git a/drivers/video/vt8623fb.c b/drivers/video/vt8623fb.c
index 85d76ec4c63e..a2965ab92cfb 100644
--- a/drivers/video/vt8623fb.c
+++ b/drivers/video/vt8623fb.c
@@ -23,7 +23,7 @@
 #include <linux/svga.h>
 #include <linux/init.h>
 #include <linux/pci.h>
-#include <linux/console.h> /* Why should fb driver call console functions? because acquire_console_sem() */
+#include <linux/console.h> /* Why should fb driver call console functions? because console_lock() */
 #include <video/vga.h>
 
 #ifdef CONFIG_MTRR
@@ -819,12 +819,12 @@ static int vt8623_pci_suspend(struct pci_dev* dev, pm_message_t state)
 
 	dev_info(info->device, "suspend\n");
 
-	acquire_console_sem();
+	console_lock();
 	mutex_lock(&(par->open_lock));
 
 	if ((state.event == PM_EVENT_FREEZE) || (par->ref_count == 0)) {
 		mutex_unlock(&(par->open_lock));
-		release_console_sem();
+		console_unlock();
 		return 0;
 	}
 
@@ -835,7 +835,7 @@ static int vt8623_pci_suspend(struct pci_dev* dev, pm_message_t state)
 	pci_set_power_state(dev, pci_choose_state(dev, state));
 
 	mutex_unlock(&(par->open_lock));
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
@@ -850,7 +850,7 @@ static int vt8623_pci_resume(struct pci_dev* dev)
 
 	dev_info(info->device, "resume\n");
 
-	acquire_console_sem();
+	console_lock();
 	mutex_lock(&(par->open_lock));
 
 	if (par->ref_count == 0)
@@ -869,7 +869,7 @@ static int vt8623_pci_resume(struct pci_dev* dev)
 
 fail:
 	mutex_unlock(&(par->open_lock));
-	release_console_sem();
+	console_unlock();
 
 	return 0;
 }
diff --git a/drivers/video/xen-fbfront.c b/drivers/video/xen-fbfront.c
index 3e6934d4bea8..a20218c2fda8 100644
--- a/drivers/video/xen-fbfront.c
+++ b/drivers/video/xen-fbfront.c
@@ -491,12 +491,12 @@ xenfb_make_preferred_console(void)
 	if (console_set_on_cmdline)
 		return;
 
-	acquire_console_sem();
+	console_lock();
 	for_each_console(c) {
 		if (!strcmp(c->name, "tty") && c->index == 0)
 			break;
 	}
-	release_console_sem();
+	console_unlock();
 	if (c) {
 		unregister_console(c);
 		c->flags |= CON_CONSDEV;
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index eafc22ab1fdd..b701eaa482bf 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -67,7 +67,7 @@ static void *c_start(struct seq_file *m, loff_t *pos)
 	struct console *con;
 	loff_t off = 0;
 
-	acquire_console_sem();
+	console_lock();
 	for_each_console(con)
 		if (off++ == *pos)
 			break;
@@ -84,7 +84,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void c_stop(struct seq_file *m, void *v)
 {
-	release_console_sem();
+	console_unlock();
 }
 
 static const struct seq_operations consoles_op = {
diff --git a/include/linux/console.h b/include/linux/console.h
index 9774fe6a1a97..7453cfd593c8 100644
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -139,9 +139,9 @@ extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_n
 extern void register_console(struct console *);
 extern int unregister_console(struct console *);
 extern struct console *console_drivers;
-extern void acquire_console_sem(void);
-extern int try_acquire_console_sem(void);
-extern void release_console_sem(void);
+extern void console_lock(void);
+extern int console_trylock(void);
+extern void console_unlock(void);
 extern void console_conditional_schedule(void);
 extern void console_unblank(void);
 extern struct tty_driver *console_device(int *);
diff --git a/kernel/printk.c b/kernel/printk.c
index 53d9a9ec88e6..2ddbdc73aade 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -97,7 +97,7 @@ static int console_locked, console_suspended;
 /*
  * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
  * It is also used in interesting ways to provide interlocking in
- * release_console_sem().
+ * console_unlock();.
  */
 static DEFINE_SPINLOCK(logbuf_lock);
 
@@ -501,7 +501,7 @@ static void _call_console_drivers(unsigned start,
 /*
  * Call the console drivers, asking them to write out
  * log_buf[start] to log_buf[end - 1].
- * The console_sem must be held.
+ * The console_lock must be held.
  */
 static void call_console_drivers(unsigned start, unsigned end)
 {
@@ -604,11 +604,11 @@ static int have_callable_console(void)
  *
  * This is printk().  It can be called from any context.  We want it to work.
  *
- * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
+ * We try to grab the console_lock.  If we succeed, it's easy - we log the output and
  * call the console drivers.  If we fail to get the semaphore we place the output
  * into the log buffer and return.  The current holder of the console_sem will
- * notice the new output in release_console_sem() and will send it to the
- * consoles before releasing the semaphore.
+ * notice the new output in console_unlock(); and will send it to the
+ * consoles before releasing the lock.
  *
  * One effect of this deferred printing is that code which calls printk() and
  * then changes console_loglevel may break. This is because console_loglevel
@@ -659,19 +659,19 @@ static inline int can_use_console(unsigned int cpu)
 /*
  * Try to get console ownership to actually show the kernel
  * messages from a 'printk'. Return true (and with the
- * console_semaphore held, and 'console_locked' set) if it
+ * console_lock held, and 'console_locked' set) if it
  * is successful, false otherwise.
  *
  * This gets called with the 'logbuf_lock' spinlock held and
  * interrupts disabled. It should return with 'lockbuf_lock'
  * released but interrupts still disabled.
  */
-static int acquire_console_semaphore_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(unsigned int cpu)
 	__releases(&logbuf_lock)
 {
 	int retval = 0;
 
-	if (!try_acquire_console_sem()) {
+	if (console_trylock()) {
 		retval = 1;
 
 		/*
@@ -827,12 +827,12 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	 * actual magic (print out buffers, wake up klogd,
 	 * etc). 
 	 *
-	 * The acquire_console_semaphore_for_printk() function
+	 * The console_trylock_for_printk() function
 	 * will release 'logbuf_lock' regardless of whether it
 	 * actually gets the semaphore or not.
 	 */
-	if (acquire_console_semaphore_for_printk(this_cpu))
-		release_console_sem();
+	if (console_trylock_for_printk(this_cpu))
+		console_unlock();
 
 	lockdep_on();
 out_restore_irqs:
@@ -993,7 +993,7 @@ void suspend_console(void)
 	if (!console_suspend_enabled)
 		return;
 	printk("Suspending console(s) (use no_console_suspend to debug)\n");
-	acquire_console_sem();
+	console_lock();
 	console_suspended = 1;
 	up(&console_sem);
 }
@@ -1004,7 +1004,7 @@ void resume_console(void)
 		return;
 	down(&console_sem);
 	console_suspended = 0;
-	release_console_sem();
+	console_unlock();
 }
 
 /**
@@ -1027,21 +1027,21 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
 	case CPU_DYING:
 	case CPU_DOWN_FAILED:
 	case CPU_UP_CANCELED:
-		acquire_console_sem();
-		release_console_sem();
+		console_lock();
+		console_unlock();
 	}
 	return NOTIFY_OK;
 }
 
 /**
- * acquire_console_sem - lock the console system for exclusive use.
+ * console_lock - lock the console system for exclusive use.
  *
- * Acquires a semaphore which guarantees that the caller has
+ * Acquires a lock which guarantees that the caller has
  * exclusive access to the console system and the console_drivers list.
  *
  * Can sleep, returns nothing.
  */
-void acquire_console_sem(void)
+void console_lock(void)
 {
 	BUG_ON(in_interrupt());
 	down(&console_sem);
@@ -1050,21 +1050,29 @@ void acquire_console_sem(void)
 	console_locked = 1;
 	console_may_schedule = 1;
 }
-EXPORT_SYMBOL(acquire_console_sem);
+EXPORT_SYMBOL(console_lock);
 
-int try_acquire_console_sem(void)
+/**
+ * console_trylock - try to lock the console system for exclusive use.
+ *
+ * Tried to acquire a lock which guarantees that the caller has
+ * exclusive access to the console system and the console_drivers list.
+ *
+ * returns 1 on success, and 0 on failure to acquire the lock.
+ */
+int console_trylock(void)
 {
 	if (down_trylock(&console_sem))
-		return -1;
+		return 0;
 	if (console_suspended) {
 		up(&console_sem);
-		return -1;
+		return 0;
 	}
 	console_locked = 1;
 	console_may_schedule = 0;
-	return 0;
+	return 1;
 }
-EXPORT_SYMBOL(try_acquire_console_sem);
+EXPORT_SYMBOL(console_trylock);
 
 int is_console_locked(void)
 {
@@ -1095,20 +1103,20 @@ void wake_up_klogd(void)
 }
 
 /**
- * release_console_sem - unlock the console system
+ * console_unlock - unlock the console system
  *
- * Releases the semaphore which the caller holds on the console system
+ * Releases the console_lock which the caller holds on the console system
  * and the console driver list.
  *
- * While the semaphore was held, console output may have been buffered
- * by printk().  If this is the case, release_console_sem() emits
- * the output prior to releasing the semaphore.
+ * While the console_lock was held, console output may have been buffered
+ * by printk().  If this is the case, console_unlock(); emits
+ * the output prior to releasing the lock.
  *
  * If there is output waiting for klogd, we wake it up.
  *
- * release_console_sem() may be called from any context.
+ * console_unlock(); may be called from any context.
  */
-void release_console_sem(void)
+void console_unlock(void)
 {
 	unsigned long flags;
 	unsigned _con_start, _log_end;
@@ -1141,7 +1149,7 @@ void release_console_sem(void)
 	if (wake_klogd)
 		wake_up_klogd();
 }
-EXPORT_SYMBOL(release_console_sem);
+EXPORT_SYMBOL(console_unlock);
 
 /**
  * console_conditional_schedule - yield the CPU if required
@@ -1150,7 +1158,7 @@ EXPORT_SYMBOL(release_console_sem);
  * if this CPU should yield the CPU to another task, do
  * so here.
  *
- * Must be called within acquire_console_sem().
+ * Must be called within console_lock();.
  */
 void __sched console_conditional_schedule(void)
 {
@@ -1171,14 +1179,14 @@ void console_unblank(void)
 		if (down_trylock(&console_sem) != 0)
 			return;
 	} else
-		acquire_console_sem();
+		console_lock();
 
 	console_locked = 1;
 	console_may_schedule = 0;
 	for_each_console(c)
 		if ((c->flags & CON_ENABLED) && c->unblank)
 			c->unblank();
-	release_console_sem();
+	console_unlock();
 }
 
 /*
@@ -1189,7 +1197,7 @@ struct tty_driver *console_device(int *index)
 	struct console *c;
 	struct tty_driver *driver = NULL;
 
-	acquire_console_sem();
+	console_lock();
 	for_each_console(c) {
 		if (!c->device)
 			continue;
@@ -1197,7 +1205,7 @@ struct tty_driver *console_device(int *index)
 		if (driver)
 			break;
 	}
-	release_console_sem();
+	console_unlock();
 	return driver;
 }
 
@@ -1208,17 +1216,17 @@ struct tty_driver *console_device(int *index)
  */
 void console_stop(struct console *console)
 {
-	acquire_console_sem();
+	console_lock();
 	console->flags &= ~CON_ENABLED;
-	release_console_sem();
+	console_unlock();
 }
 EXPORT_SYMBOL(console_stop);
 
 void console_start(struct console *console)
 {
-	acquire_console_sem();
+	console_lock();
 	console->flags |= CON_ENABLED;
-	release_console_sem();
+	console_unlock();
 }
 EXPORT_SYMBOL(console_start);
 
@@ -1340,7 +1348,7 @@ void register_console(struct console *newcon)
 	 *	Put this console in the list - keep the
 	 *	preferred driver at the head of the list.
 	 */
-	acquire_console_sem();
+	console_lock();
 	if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) {
 		newcon->next = console_drivers;
 		console_drivers = newcon;
@@ -1352,14 +1360,14 @@ void register_console(struct console *newcon)
 	}
 	if (newcon->flags & CON_PRINTBUFFER) {
 		/*
-		 * release_console_sem() will print out the buffered messages
+		 * console_unlock(); will print out the buffered messages
 		 * for us.
 		 */
 		spin_lock_irqsave(&logbuf_lock, flags);
 		con_start = log_start;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 	}
-	release_console_sem();
+	console_unlock();
 	console_sysfs_notify();
 
 	/*
@@ -1396,7 +1404,7 @@ int unregister_console(struct console *console)
 		return braille_unregister_console(console);
 #endif
 
-	acquire_console_sem();
+	console_lock();
 	if (console_drivers == console) {
 		console_drivers=console->next;
 		res = 0;
@@ -1418,7 +1426,7 @@ int unregister_console(struct console *console)
 	if (console_drivers != NULL && console->flags & CON_CONSDEV)
 		console_drivers->flags |= CON_CONSDEV;
 
-	release_console_sem();
+	console_unlock();
 	console_sysfs_notify();
 	return res;
 }
-- 
cgit v1.3-8-gc7d7


From e37b6a7b27b400c3aa488db8c6629a05095bc79c Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:44:59 -0800
Subject: sched: Fix sign under-flows in wake_affine

While care is taken around the zero-point in effective_load to not exceed
the instantaneous rq->weight, it's still possible (e.g. using wake_idx != 0)
for (load + effective_load) to underflow.

In this case the comparing the unsigned values can result in incorrect balanced
decisions.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044851.734245014@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 354769979c02..ccecfec02a70 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1432,7 +1432,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-	unsigned long this_load, load;
+	s64 this_load, load;
 	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
 	struct task_group *tg;
@@ -1471,8 +1471,8 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	if (this_load) {
-		unsigned long this_eff_load, prev_eff_load;
+	if (this_load > 0) {
+		s64 this_eff_load, prev_eff_load;
 
 		this_eff_load = 100;
 		this_eff_load *= power_of(prev_cpu);
-- 
cgit v1.3-8-gc7d7


From b815f1963e47b9b69bb17e0588bd5af5b1114ae0 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:00 -0800
Subject: sched: Fix/remove redundant cfs_rq checks

Since updates are against an entity's queuing cfs_rq it's not possible to
enter update_cfs_{shares,load} with a NULL cfs_rq.  (Indeed, update_cfs_load
would crash prior to the check if we did anyway since we load is examined
during the initializers).

Also, in the update_cfs_load case there's no point
in maintaining averages for rq->cfs_rq since we don't perform shares
distribution at that level -- NULL check is replaced accordingly.

Thanks to Dan Carpenter for pointing out the deference before NULL check.

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044851.825284940@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index ccecfec02a70..1997383ba4d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -722,7 +722,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	u64 now, delta;
 	unsigned long load = cfs_rq->load.weight;
 
-	if (!cfs_rq)
+	if (cfs_rq->tg == &root_task_group)
 		return;
 
 	now = rq_of(cfs_rq)->clock;
@@ -830,9 +830,6 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
 	struct sched_entity *se;
 	long shares;
 
-	if (!cfs_rq)
-		return;
-
 	tg = cfs_rq->tg;
 	se = tg->se[cpu_of(rq_of(cfs_rq))];
 	if (!se)
-- 
cgit v1.3-8-gc7d7


From 05ca62c6ca17f39b88fa956d5ebc1fa6e93ad5e3 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Fri, 21 Jan 2011 20:45:02 -0800
Subject: sched: Use rq->clock_task instead of rq->clock for correctly
 maintaining load averages

The delta in clock_task is a more fair attribution of how much time a tg has
been contributing load to the current cpu.

While not really important it also means we're more in sync (by magnitude)
with respect to periodic updates (since __update_curr deltas are clock_task
based).

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20110122044852.007092349@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 1997383ba4d6..0c26e2df450e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -725,7 +725,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
 	if (cfs_rq->tg == &root_task_group)
 		return;
 
-	now = rq_of(cfs_rq)->clock;
+	now = rq_of(cfs_rq)->clock_task;
 	delta = now - cfs_rq->load_stamp;
 
 	/* truncate load history at 4 idle periods */
-- 
cgit v1.3-8-gc7d7


From 88d4f0db7fa8785859c1d637f9aac210932b6216 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 25 Jan 2011 19:40:51 +0100
Subject: perf: Fix alloc_callchain_buffers()

Commit 927c7a9e92c4 ("perf: Fix race in callchains") introduced
a mismatch in the sizing of struct callchain_cpus_entries.

nr_cpu_ids must be used instead of num_possible_cpus(), or we
might get out of bound memory accesses on some machines.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Miller <davem@davemloft.net>
Cc: Stephane Eranian <eranian@google.com>
CC: stable@kernel.org
LKML-Reference: <1295980851.3588.351.camel@edumazet-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 126a302c481c..852ae8c66502 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1999,8 +1999,7 @@ static int alloc_callchain_buffers(void)
 	 * accessed from NMI. Use a temporary manual per cpu allocation
 	 * until that gets sorted out.
 	 */
-	size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
-		num_possible_cpus();
+	size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
 
 	entries = kzalloc(size, GFP_KERNEL);
 	if (!entries)
-- 
cgit v1.3-8-gc7d7


From aa5bd67dcfdf9af34c7fa36ebc87d4e1f7e91873 Mon Sep 17 00:00:00 2001
From: Kacper Kornet <kornet@camk.edu.pl>
Date: Sat, 29 Jan 2011 00:21:04 +0100
Subject: Fix prlimit64 for suid/sgid processes

Since check_prlimit_permission always fails in the case of SUID/GUID
processes, such processes are not able to read or set their own limits.
This commit changes this by assuming that process can always read/change
its own limits.

Signed-off-by: Kacper Kornet <kornet@camk.edu.pl>
Acked-by: Jiri Slaby <jslaby@suse.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sys.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 31b71a276b40..18da702ec813 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1385,7 +1385,8 @@ static int check_prlimit_permission(struct task_struct *task)
 	const struct cred *cred = current_cred(), *tcred;
 
 	tcred = __task_cred(task);
-	if ((cred->uid != tcred->euid ||
+	if (current != task &&
+	    (cred->uid != tcred->euid ||
 	     cred->uid != tcred->suid ||
 	     cred->uid != tcred->uid  ||
 	     cred->gid != tcred->egid ||
-- 
cgit v1.3-8-gc7d7


From 4135038a582c20ffdadfcf6564852e0b72a20968 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Fri, 28 Jan 2011 11:00:31 -0500
Subject: watchdog: Fix broken nowatchdog logic

Passing nowatchdog to kernel disables 2 things: creation of
watchdog threads AND initialization of percpu watchdog_hrtimer.
As hrtimers are initialized only at boot it's not possible to
enable watchdog later - for me all watchdog threads started to
eat 100% of CPU time, but they could just crash.

Additionally, even if these threads would start properly,
watchdog_disable_all_cpus was guarded by no_watchdog check, so
you couldn't disable watchdog.

To fix this, remove no_watchdog variable and use already
existing watchdog_enabled variable.

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
[ removed another no_watchdog instance ]
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Stephane Eranian <eranian@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org>
LKML-Reference: <1296230433-6261-1-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/watchdog.c | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d7ebdf4cea98..d9961ea1c3f4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -27,7 +27,7 @@
 #include <asm/irq_regs.h>
 #include <linux/perf_event.h>
 
-int watchdog_enabled;
+int watchdog_enabled = 1;
 int __read_mostly softlockup_thresh = 60;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -43,9 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
 
-static int no_watchdog;
-
-
 /* boot commands */
 /*
  * Should we panic when a soft-lockup or hard-lockup occurs:
@@ -58,7 +55,7 @@ static int __init hardlockup_panic_setup(char *str)
 	if (!strncmp(str, "panic", 5))
 		hardlockup_panic = 1;
 	else if (!strncmp(str, "0", 1))
-		no_watchdog = 1;
+		watchdog_enabled = 0;
 	return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -77,7 +74,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
 
 static int __init nowatchdog_setup(char *str)
 {
-	no_watchdog = 1;
+	watchdog_enabled = 0;
 	return 1;
 }
 __setup("nowatchdog", nowatchdog_setup);
@@ -85,7 +82,7 @@ __setup("nowatchdog", nowatchdog_setup);
 /* deprecated */
 static int __init nosoftlockup_setup(char *str)
 {
-	no_watchdog = 1;
+	watchdog_enabled = 0;
 	return 1;
 }
 __setup("nosoftlockup", nosoftlockup_setup);
@@ -476,9 +473,6 @@ static void watchdog_disable_all_cpus(void)
 {
 	int cpu;
 
-	if (no_watchdog)
-		return;
-
 	for_each_online_cpu(cpu)
 		watchdog_disable(cpu);
 
@@ -530,7 +524,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		err = watchdog_enable(hotcpu);
+		if (watchdog_enabled)
+			err = watchdog_enable(hotcpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
@@ -555,9 +550,6 @@ void __init lockup_detector_init(void)
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	if (no_watchdog)
-		return;
-
 	err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
 	WARN_ON(notifier_to_errno(err));
 
-- 
cgit v1.3-8-gc7d7


From 397357666de6b5b6adb5fa99f9758ec8cf30ac34 Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Fri, 28 Jan 2011 11:00:32 -0500
Subject: watchdog: Fix sysctl consistency

If it was not possible to enable watchdog for any cpu, switch
watchdog_enabled back to 0, because it's visible via
kernel.watchdog sysctl.

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org>
LKML-Reference: <1296230433-6261-2-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/watchdog.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d9961ea1c3f4..c7e0049344bd 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -429,9 +429,6 @@ static int watchdog_enable(int cpu)
 		wake_up_process(p);
 	}
 
-	/* if any cpu succeeds, watchdog is considered enabled for the system */
-	watchdog_enabled = 1;
-
 	return 0;
 }
 
@@ -459,12 +456,16 @@ static void watchdog_disable(int cpu)
 static void watchdog_enable_all_cpus(void)
 {
 	int cpu;
-	int result = 0;
+
+	watchdog_enabled = 0;
 
 	for_each_online_cpu(cpu)
-		result += watchdog_enable(cpu);
+		if (!watchdog_enable(cpu))
+			/* if any cpu succeeds, watchdog is considered
+			   enabled for the system */
+			watchdog_enabled = 1;
 
-	if (result)
+	if (!watchdog_enabled)
 		printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
 
 }
-- 
cgit v1.3-8-gc7d7


From 9ffdc6c37df131f89d52001e0ef03091b158826f Mon Sep 17 00:00:00 2001
From: Marcin Slusarz <marcin.slusarz@gmail.com>
Date: Fri, 28 Jan 2011 11:00:33 -0500
Subject: watchdog: Don't change watchdog state on read of sysctl

Signed-off-by: Marcin Slusarz <marcin.slusarz@gmail.com>
[ add {}'s to fix a warning ]
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org>
LKML-Reference: <1296230433-6261-3-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/watchdog.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c7e0049344bd..f37f974aa81b 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -493,10 +493,12 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
 {
 	proc_dointvec(table, write, buffer, length, ppos);
 
-	if (watchdog_enabled)
-		watchdog_enable_all_cpus();
-	else
-		watchdog_disable_all_cpus();
+	if (write) {
+		if (watchdog_enabled)
+			watchdog_enable_all_cpus();
+		else
+			watchdog_disable_all_cpus();
+	}
 	return 0;
 }
 
-- 
cgit v1.3-8-gc7d7


From f1a06390d013244e721372b3f9b66e39b6429c71 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 28 Jan 2011 08:47:15 +0100
Subject: genirq: Prevent irq storm on migration

move_native_irq() masks and unmasks the interrupt line
unconditionally, but the interrupt line might be masked due to a
threaded oneshot handler in progress. Unmasking the line in that case
can lead to interrupt storms. Observed on PREEMPT_RT.

Originally-from: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org
---
 kernel/irq/migration.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 1d2541940480..441fd629ff04 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -56,6 +56,7 @@ void move_masked_irq(int irq)
 void move_native_irq(int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
+	bool masked;
 
 	if (likely(!(desc->status & IRQ_MOVE_PENDING)))
 		return;
@@ -63,8 +64,15 @@ void move_native_irq(int irq)
 	if (unlikely(desc->status & IRQ_DISABLED))
 		return;
 
-	desc->irq_data.chip->irq_mask(&desc->irq_data);
+	/*
+	 * Be careful vs. already masked interrupts. If this is a
+	 * threaded interrupt with ONESHOT set, we can end up with an
+	 * interrupt storm.
+	 */
+	masked = desc->status & IRQ_MASKED;
+	if (!masked)
+		desc->irq_data.chip->irq_mask(&desc->irq_data);
 	move_masked_irq(irq);
-	desc->irq_data.chip->irq_unmask(&desc->irq_data);
+	if (!masked)
+		desc->irq_data.chip->irq_unmask(&desc->irq_data);
 }
-
-- 
cgit v1.3-8-gc7d7


From e4a9ea5ee7c8812a7bf0c3fb725ceeaa3d4c2fcc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 27 Jan 2011 09:15:30 -0500
Subject: tracing: Replace trace_event struct array with pointer array

Currently the trace_event structures are placed in the _ftrace_events
section, and at link time, the linker makes one large array of all
the trace_event structures. On boot up, this array is read (much like
the initcall sections) and the events are processed.

The problem is that there is no guarantee that gcc will place complex
structures nicely together in an array format. Two structures in the
same file may be placed awkwardly, because gcc has no clue that they
are suppose to be in an array.

A hack was used previous to force the alignment to 4, to pack the
structures together. But this caused alignment issues with other
architectures (sparc).

Instead of packing the structures into an array, the structures' addresses
are now put into the _ftrace_event section. As pointers are always the
natural alignment, gcc should always pack them tightly together
(otherwise initcall, extable, etc would also fail).

By having the pointers to the structures in the section, we can still
iterate the trace_events without causing unnecessary alignment problems
with other architectures, or depending on the current behaviour of
gcc that will likely change in the future just to tick us kernel developers
off a little more.

The _ftrace_event section is also moved into the .init.data section
as it is now only needed at boot up.

Suggested-by: David Miller <davem@davemloft.net>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/asm-generic/vmlinux.lds.h |  7 +++----
 include/linux/module.h            |  2 +-
 include/linux/syscalls.h          | 10 ++++++----
 include/trace/ftrace.h            | 24 +++++++++++++-----------
 kernel/trace/trace_events.c       | 12 ++++++------
 kernel/trace/trace_export.c       |  6 +++---
 6 files changed, 32 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 6ebb81030d2d..f53708be95eb 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -124,7 +124,8 @@
 #endif
 
 #ifdef CONFIG_EVENT_TRACING
-#define FTRACE_EVENTS()	VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
+#define FTRACE_EVENTS()	. = ALIGN(8);					\
+			VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
 			*(_ftrace_events)				\
 			VMLINUX_SYMBOL(__stop_ftrace_events) = .;
 #else
@@ -179,9 +180,6 @@
 	TRACE_PRINTKS()							\
 									\
 	STRUCT_ALIGN();							\
-	FTRACE_EVENTS()							\
-									\
-	STRUCT_ALIGN();							\
 	TRACE_SYSCALLS()
 
 /*
@@ -482,6 +480,7 @@
 	KERNEL_CTORS()							\
 	*(.init.rodata)							\
 	MCOUNT_REC()							\
+	FTRACE_EVENTS()							\
 	DEV_DISCARD(init.rodata)					\
 	CPU_DISCARD(init.rodata)					\
 	MEM_DISCARD(init.rodata)					\
diff --git a/include/linux/module.h b/include/linux/module.h
index e7c6385c6683..7695a303bb55 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -389,7 +389,7 @@ struct module
 	unsigned int num_trace_bprintk_fmt;
 #endif
 #ifdef CONFIG_EVENT_TRACING
-	struct ftrace_event_call *trace_events;
+	struct ftrace_event_call **trace_events;
 	unsigned int num_trace_events;
 #endif
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 18cd0684fc4e..45508fec366d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -128,28 +128,30 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	static struct syscall_metadata					\
 	__attribute__((__aligned__(4))) __syscall_meta_##sname;		\
 	static struct ftrace_event_call __used				\
-	  __attribute__((__aligned__(4)))				\
-	  __attribute__((section("_ftrace_events")))			\
 	  event_enter_##sname = {					\
 		.name                   = "sys_enter"#sname,		\
 		.class			= &event_class_syscall_enter,	\
 		.event.funcs            = &enter_syscall_print_funcs,	\
 		.data			= (void *)&__syscall_meta_##sname,\
 	};								\
+	static struct ftrace_event_call __used				\
+	  __attribute__((section("_ftrace_events")))			\
+	 *__event_enter_##sname = &event_enter_##sname;			\
 	__TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
 	static struct syscall_metadata					\
 	__attribute__((__aligned__(4))) __syscall_meta_##sname;		\
 	static struct ftrace_event_call __used				\
-	  __attribute__((__aligned__(4)))				\
-	  __attribute__((section("_ftrace_events")))			\
 	  event_exit_##sname = {					\
 		.name                   = "sys_exit"#sname,		\
 		.class			= &event_class_syscall_exit,	\
 		.event.funcs		= &exit_syscall_print_funcs,	\
 		.data			= (void *)&__syscall_meta_##sname,\
 	};								\
+	static struct ftrace_event_call __used				\
+	  __attribute__((section("_ftrace_events")))			\
+	*__event_exit_##sname = &event_exit_##sname;			\
 	__TRACE_EVENT_FLAGS(exit_##sname, TRACE_EVENT_FL_CAP_ANY)
 
 #define SYSCALL_METADATA(sname, nb)				\
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index e16610c208c9..3e68366d485a 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -446,14 +446,16 @@ static inline notrace int ftrace_get_offsets_##call(			\
  *	.reg			= ftrace_event_reg,
  * };
  *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
+ * static struct ftrace_event_call event_<call> = {
  *	.name			= "<call>",
  *	.class			= event_class_<template>,
  *	.event			= &ftrace_event_type_<call>,
  *	.print_fmt		= print_fmt_<call>,
  * };
+ * // its only safe to use pointers when doing linker tricks to
+ * // create an array.
+ * static struct ftrace_event_call __used
+ * __attribute__((section("_ftrace_events"))) *__event_<call> = &event_<call>;
  *
  */
 
@@ -579,28 +581,28 @@ static struct ftrace_event_class __used event_class_##call = {		\
 #undef DEFINE_EVENT
 #define DEFINE_EVENT(template, call, proto, args)			\
 									\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
+static struct ftrace_event_call __used event_##call = {			\
 	.name			= #call,				\
 	.class			= &event_class_##template,		\
 	.event.funcs		= &ftrace_event_type_funcs_##template,	\
 	.print_fmt		= print_fmt_##template,			\
-};
+};									\
+static struct ftrace_event_call __used					\
+__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 
 #undef DEFINE_EVENT_PRINT
 #define DEFINE_EVENT_PRINT(template, call, proto, args, print)		\
 									\
 static const char print_fmt_##call[] = print;				\
 									\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
+static struct ftrace_event_call __used event_##call = {			\
 	.name			= #call,				\
 	.class			= &event_class_##template,		\
 	.event.funcs		= &ftrace_event_type_funcs_##call,	\
 	.print_fmt		= print_fmt_##call,			\
-}
+};									\
+static struct ftrace_event_call __used					\
+__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 35fde09b81de..5f499e0438a4 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1284,7 +1284,7 @@ trace_create_file_ops(struct module *mod)
 static void trace_module_add_events(struct module *mod)
 {
 	struct ftrace_module_file_ops *file_ops = NULL;
-	struct ftrace_event_call *call, *start, *end;
+	struct ftrace_event_call **call, **start, **end;
 
 	start = mod->trace_events;
 	end = mod->trace_events + mod->num_trace_events;
@@ -1297,7 +1297,7 @@ static void trace_module_add_events(struct module *mod)
 		return;
 
 	for_each_event(call, start, end) {
-		__trace_add_event_call(call, mod,
+		__trace_add_event_call(*call, mod,
 				       &file_ops->id, &file_ops->enable,
 				       &file_ops->filter, &file_ops->format);
 	}
@@ -1367,8 +1367,8 @@ static struct notifier_block trace_module_nb = {
 	.priority = 0,
 };
 
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
+extern struct ftrace_event_call *__start_ftrace_events[];
+extern struct ftrace_event_call *__stop_ftrace_events[];
 
 static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
 
@@ -1384,7 +1384,7 @@ __setup("trace_event=", setup_trace_event);
 
 static __init int event_trace_init(void)
 {
-	struct ftrace_event_call *call;
+	struct ftrace_event_call **call;
 	struct dentry *d_tracer;
 	struct dentry *entry;
 	struct dentry *d_events;
@@ -1430,7 +1430,7 @@ static __init int event_trace_init(void)
 		pr_warning("tracing: Failed to allocate common fields");
 
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
-		__trace_add_event_call(call, NULL, &ftrace_event_id_fops,
+		__trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
 				       &ftrace_enable_fops,
 				       &ftrace_event_filter_fops,
 				       &ftrace_event_format_fops);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4b74d71705c0..bbeec31e0ae3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -161,13 +161,13 @@ struct ftrace_event_class event_class_ftrace_##call = {			\
 	.fields			= LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
 };									\
 									\
-struct ftrace_event_call __used						\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
+struct ftrace_event_call __used event_##call = {			\
 	.name			= #call,				\
 	.event.type		= etype,				\
 	.class			= &event_class_ftrace_##call,		\
 	.print_fmt		= print,				\
 };									\
+struct ftrace_event_call __used						\
+__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
 
 #include "trace_entries.h"
-- 
cgit v1.3-8-gc7d7


From 542e72fc90f5ed9eecb574f80f70868c7f296093 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 26 Jan 2011 15:38:35 +0100
Subject: perf: Fix reading in perf_event_read()

It is quite possible for the event to have been disabled between
perf_event_read() sending the IPI and the CPU servicing the IPI and
calling __perf_event_read(), hence revalidate the state.

Reported-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 852ae8c66502..999835b6112b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1901,11 +1901,12 @@ static void __perf_event_read(void *info)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	update_context_time(ctx);
+	if (ctx->is_active)
+		update_context_time(ctx);
 	update_event_times(event);
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
+		event->pmu->read(event);
 	raw_spin_unlock(&ctx->lock);
-
-	event->pmu->read(event);
 }
 
 static inline u64 perf_event_count(struct perf_event *event)
-- 
cgit v1.3-8-gc7d7


From 06c3bc655697b19521901f9254eb0bbb2c67e7e8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 2 Feb 2011 13:19:48 +0100
Subject: sched: Fix update_curr_rt()

cpu_stopper_thread()
  migration_cpu_stop()
    __migrate_task()
      deactivate_task()
        dequeue_task()
          dequeue_task_rq()
            update_curr_rt()

Will call update_curr_rt() on rq->curr, which at that time is
rq->stop. The problem is that rq->stop.prio matches an RT prio and
thus falsely assumes its a rt_sched_class task.

Reported-Debuged-Tested-Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Cc: stable@kernel.org # .37
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index c914ec747ca6..ad6267714c84 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -625,7 +625,7 @@ static void update_curr_rt(struct rq *rq)
 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	u64 delta_exec;
 
-	if (!task_has_rt_policy(curr))
+	if (curr->sched_class != &rt_sched_class)
 		return;
 
 	delta_exec = rq->clock_task - curr->se.exec_start;
-- 
cgit v1.3-8-gc7d7


From 654986462939cd7ec18f276c6379a334dac106a7 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Wed, 26 Jan 2011 17:26:22 -0500
Subject: tracepoints: Fix section alignment using pointer array

Make the tracepoints more robust, making them solid enough to handle compiler
changes by not relying on anything based on compiler-specific behavior with
respect to structure alignment. Implement an approach proposed by David Miller:
use an array of const pointers to refer to the individual structures, and export
this pointer array through the linker script rather than the structures per se.
It will consume 32 extra bytes per tracepoint (24 for structure padding and 8
for the pointers), but are less likely to break due to compiler changes.

History:

commit 7e066fb8 tracepoints: add DECLARE_TRACE() and DEFINE_TRACE()
added the aligned(32) type and variable attribute to the tracepoint structures
to deal with gcc happily aligning statically defined structures on 32-byte
multiples.

One attempt was to use a 8-byte alignment for tracepoint structures by applying
both the variable and type attribute to tracepoint structures definitions and
declarations. It worked fine with gcc 4.5.1, but broke with gcc 4.4.4 and 4.4.5.

The reason is that the "aligned" attribute only specify the _minimum_ alignment
for a structure, leaving both the compiler and the linker free to align on
larger multiples. Because tracepoint.c expects the structures to be placed as an
array within each section, up-alignment cause NULL-pointer exceptions due to the
extra unexpected padding.

(this patch applies on top of -tip)

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Acked-by: David S. Miller <davem@davemloft.net>
LKML-Reference: <20110126222622.GA10794@Krystal>
CC: Frederic Weisbecker <fweisbec@gmail.com>
CC: Ingo Molnar <mingo@elte.hu>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/asm-generic/vmlinux.lds.h |  8 +++++---
 include/linux/module.h            |  2 +-
 include/linux/tracepoint.h        | 35 ++++++++++++++++++++---------------
 kernel/module.c                   | 16 ++++++++--------
 kernel/tracepoint.c               | 31 ++++++++++++++++---------------
 5 files changed, 50 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f53708be95eb..57b1b6811b61 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -166,10 +166,8 @@
 	CPU_KEEP(exit.data)						\
 	MEM_KEEP(init.data)						\
 	MEM_KEEP(exit.data)						\
-	. = ALIGN(32);							\
-	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
+	STRUCT_ALIGN();							\
 	*(__tracepoints)						\
-	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
 	/* implement dynamic printk debug */				\
 	. = ALIGN(8);							\
 	VMLINUX_SYMBOL(__start___verbose) = .;                          \
@@ -218,6 +216,10 @@
 		VMLINUX_SYMBOL(__start_rodata) = .;			\
 		*(.rodata) *(.rodata.*)					\
 		*(__vermagic)		/* Kernel version magic */	\
+		. = ALIGN(8);						\
+		VMLINUX_SYMBOL(__start___tracepoints_ptrs) = .;		\
+		*(__tracepoints_ptrs)	/* Tracepoints: pointer array */\
+		VMLINUX_SYMBOL(__stop___tracepoints_ptrs) = .;		\
 		*(__markers_strings)	/* Markers: strings */		\
 		*(__tracepoints_strings)/* Tracepoints: strings */	\
 	}								\
diff --git a/include/linux/module.h b/include/linux/module.h
index 7695a303bb55..9bdf27c7615b 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -377,7 +377,7 @@ struct module
 	   keeping pointers to this stuff */
 	char *args;
 #ifdef CONFIG_TRACEPOINTS
-	struct tracepoint *tracepoints;
+	struct tracepoint * const *tracepoints_ptrs;
 	unsigned int num_tracepoints;
 #endif
 #ifdef HAVE_JUMP_LABEL
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index c6814616653b..97c84a58efb8 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -33,12 +33,7 @@ struct tracepoint {
 	void (*regfunc)(void);
 	void (*unregfunc)(void);
 	struct tracepoint_func __rcu *funcs;
-} __attribute__((aligned(32)));		/*
-					 * Aligned on 32 bytes because it is
-					 * globally visible and gcc happily
-					 * align these on the structure size.
-					 * Keep in sync with vmlinux.lds.h.
-					 */
+};
 
 /*
  * Connect a probe to a tracepoint.
@@ -61,15 +56,15 @@ extern void tracepoint_probe_update_all(void);
 
 struct tracepoint_iter {
 	struct module *module;
-	struct tracepoint *tracepoint;
+	struct tracepoint * const *tracepoint;
 };
 
 extern void tracepoint_iter_start(struct tracepoint_iter *iter);
 extern void tracepoint_iter_next(struct tracepoint_iter *iter);
 extern void tracepoint_iter_stop(struct tracepoint_iter *iter);
 extern void tracepoint_iter_reset(struct tracepoint_iter *iter);
-extern int tracepoint_get_iter_range(struct tracepoint **tracepoint,
-	struct tracepoint *begin, struct tracepoint *end);
+extern int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+	struct tracepoint * const *begin, struct tracepoint * const *end);
 
 /*
  * tracepoint_synchronize_unregister must be called between the last tracepoint
@@ -84,11 +79,13 @@ static inline void tracepoint_synchronize_unregister(void)
 #define PARAMS(args...) args
 
 #ifdef CONFIG_TRACEPOINTS
-extern void tracepoint_update_probe_range(struct tracepoint *begin,
-	struct tracepoint *end);
+extern
+void tracepoint_update_probe_range(struct tracepoint * const *begin,
+	struct tracepoint * const *end);
 #else
-static inline void tracepoint_update_probe_range(struct tracepoint *begin,
-	struct tracepoint *end)
+static inline
+void tracepoint_update_probe_range(struct tracepoint * const *begin,
+	struct tracepoint * const *end)
 { }
 #endif /* CONFIG_TRACEPOINTS */
 
@@ -174,12 +171,20 @@ do_trace:								\
 	{								\
 	}
 
+/*
+ * We have no guarantee that gcc and the linker won't up-align the tracepoint
+ * structures, so we create an array of pointers that will be used for iteration
+ * on the tracepoints.
+ */
 #define DEFINE_TRACE_FN(name, reg, unreg)				\
 	static const char __tpstrtab_##name[]				\
 	__attribute__((section("__tracepoints_strings"))) = #name;	\
 	struct tracepoint __tracepoint_##name				\
-	__attribute__((section("__tracepoints"), aligned(32))) =	\
-		{ __tpstrtab_##name, 0, reg, unreg, NULL }
+	__attribute__((section("__tracepoints"))) =			\
+		{ __tpstrtab_##name, 0, reg, unreg, NULL };		\
+	static struct tracepoint * const __tracepoint_ptr_##name __used	\
+	__attribute__((section("__tracepoints_ptrs"))) =		\
+		&__tracepoint_##name;
 
 #define DEFINE_TRACE(name)						\
 	DEFINE_TRACE_FN(name, NULL, NULL);
diff --git a/kernel/module.c b/kernel/module.c
index 34e00b708fad..efa290ea94bf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2460,9 +2460,9 @@ static void find_module_sections(struct module *mod, struct load_info *info)
 #endif
 
 #ifdef CONFIG_TRACEPOINTS
-	mod->tracepoints = section_objs(info, "__tracepoints",
-					sizeof(*mod->tracepoints),
-					&mod->num_tracepoints);
+	mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
+					     sizeof(*mod->tracepoints_ptrs),
+					     &mod->num_tracepoints);
 #endif
 #ifdef HAVE_JUMP_LABEL
 	mod->jump_entries = section_objs(info, "__jump_table",
@@ -3393,7 +3393,7 @@ void module_layout(struct module *mod,
 		   struct modversion_info *ver,
 		   struct kernel_param *kp,
 		   struct kernel_symbol *ks,
-		   struct tracepoint *tp)
+		   struct tracepoint * const *tp)
 {
 }
 EXPORT_SYMBOL(module_layout);
@@ -3407,8 +3407,8 @@ void module_update_tracepoints(void)
 	mutex_lock(&module_mutex);
 	list_for_each_entry(mod, &modules, list)
 		if (!mod->taints)
-			tracepoint_update_probe_range(mod->tracepoints,
-				mod->tracepoints + mod->num_tracepoints);
+			tracepoint_update_probe_range(mod->tracepoints_ptrs,
+				mod->tracepoints_ptrs + mod->num_tracepoints);
 	mutex_unlock(&module_mutex);
 }
 
@@ -3432,8 +3432,8 @@ int module_get_iter_tracepoints(struct tracepoint_iter *iter)
 			else if (iter_mod > iter->module)
 				iter->tracepoint = NULL;
 			found = tracepoint_get_iter_range(&iter->tracepoint,
-				iter_mod->tracepoints,
-				iter_mod->tracepoints
+				iter_mod->tracepoints_ptrs,
+				iter_mod->tracepoints_ptrs
 					+ iter_mod->num_tracepoints);
 			if (found) {
 				iter->module = iter_mod;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index e95ee7f31d43..68187af4889e 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -27,8 +27,8 @@
 #include <linux/sched.h>
 #include <linux/jump_label.h>
 
-extern struct tracepoint __start___tracepoints[];
-extern struct tracepoint __stop___tracepoints[];
+extern struct tracepoint * const __start___tracepoints_ptrs[];
+extern struct tracepoint * const __stop___tracepoints_ptrs[];
 
 /* Set to 1 to enable tracepoint debug output */
 static const int tracepoint_debug;
@@ -298,10 +298,10 @@ static void disable_tracepoint(struct tracepoint *elem)
  *
  * Updates the probe callback corresponding to a range of tracepoints.
  */
-void
-tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
+void tracepoint_update_probe_range(struct tracepoint * const *begin,
+				   struct tracepoint * const *end)
 {
-	struct tracepoint *iter;
+	struct tracepoint * const *iter;
 	struct tracepoint_entry *mark_entry;
 
 	if (!begin)
@@ -309,12 +309,12 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
 
 	mutex_lock(&tracepoints_mutex);
 	for (iter = begin; iter < end; iter++) {
-		mark_entry = get_tracepoint(iter->name);
+		mark_entry = get_tracepoint((*iter)->name);
 		if (mark_entry) {
-			set_tracepoint(&mark_entry, iter,
+			set_tracepoint(&mark_entry, *iter,
 					!!mark_entry->refcount);
 		} else {
-			disable_tracepoint(iter);
+			disable_tracepoint(*iter);
 		}
 	}
 	mutex_unlock(&tracepoints_mutex);
@@ -326,8 +326,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
 static void tracepoint_update_probes(void)
 {
 	/* Core kernel tracepoints */
-	tracepoint_update_probe_range(__start___tracepoints,
-		__stop___tracepoints);
+	tracepoint_update_probe_range(__start___tracepoints_ptrs,
+		__stop___tracepoints_ptrs);
 	/* tracepoints in modules. */
 	module_update_tracepoints();
 }
@@ -514,8 +514,8 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
  * Will return the first tracepoint in the range if the input tracepoint is
  * NULL.
  */
-int tracepoint_get_iter_range(struct tracepoint **tracepoint,
-	struct tracepoint *begin, struct tracepoint *end)
+int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+	struct tracepoint * const *begin, struct tracepoint * const *end)
 {
 	if (!*tracepoint && begin != end) {
 		*tracepoint = begin;
@@ -534,7 +534,8 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
 	/* Core kernel tracepoints */
 	if (!iter->module) {
 		found = tracepoint_get_iter_range(&iter->tracepoint,
-				__start___tracepoints, __stop___tracepoints);
+				__start___tracepoints_ptrs,
+				__stop___tracepoints_ptrs);
 		if (found)
 			goto end;
 	}
@@ -585,8 +586,8 @@ int tracepoint_module_notify(struct notifier_block *self,
 	switch (val) {
 	case MODULE_STATE_COMING:
 	case MODULE_STATE_GOING:
-		tracepoint_update_probe_range(mod->tracepoints,
-			mod->tracepoints + mod->num_tracepoints);
+		tracepoint_update_probe_range(mod->tracepoints_ptrs,
+			mod->tracepoints_ptrs + mod->num_tracepoints);
 		break;
 	}
 	return 0;
-- 
cgit v1.3-8-gc7d7


From 3d56e331b6537671c66f1b510bed0f1e0331dfc8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 2 Feb 2011 17:06:09 -0500
Subject: tracing: Replace syscall_meta_data struct array with pointer array

Currently the syscall_meta structures for the syscall tracepoints are
placed in the __syscall_metadata section, and at link time, the linker
makes one large array of all these syscall metadata structures. On boot
up, this array is read (much like the initcall sections) and the syscall
data is processed.

The problem is that there is no guarantee that gcc will place complex
structures nicely together in an array format. Two structures in the
same file may be placed awkwardly, because gcc has no clue that they
are suppose to be in an array.

A hack was used previous to force the alignment to 4, to pack the
structures together. But this caused alignment issues with other
architectures (sparc).

Instead of packing the structures into an array, the structures' addresses
are now put into the __syscall_metadata section. As pointers are always the
natural alignment, gcc should always pack them tightly together
(otherwise initcall, extable, etc would also fail).

By having the pointers to the structures in the section, we can still
iterate the trace_events without causing unnecessary alignment problems
with other architectures, or depending on the current behaviour of
gcc that will likely change in the future just to tick us kernel developers
off a little more.

The __syscall_metadata section is also moved into the .init.data section
as it is now only needed at boot up.

Suggested-by: David Miller <davem@davemloft.net>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/asm-generic/vmlinux.lds.h |  9 ++++-----
 include/linux/syscalls.h          | 18 +++++++++---------
 kernel/trace/trace_syscalls.c     | 19 ++++++++++---------
 3 files changed, 23 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 57b1b6811b61..fe77e3395b40 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -141,7 +141,8 @@
 #endif
 
 #ifdef CONFIG_FTRACE_SYSCALLS
-#define TRACE_SYSCALLS() VMLINUX_SYMBOL(__start_syscalls_metadata) = .;	\
+#define TRACE_SYSCALLS() . = ALIGN(8);					\
+			 VMLINUX_SYMBOL(__start_syscalls_metadata) = .;	\
 			 *(__syscalls_metadata)				\
 			 VMLINUX_SYMBOL(__stop_syscalls_metadata) = .;
 #else
@@ -175,10 +176,7 @@
 	VMLINUX_SYMBOL(__stop___verbose) = .;				\
 	LIKELY_PROFILE()		       				\
 	BRANCH_PROFILE()						\
-	TRACE_PRINTKS()							\
-									\
-	STRUCT_ALIGN();							\
-	TRACE_SYSCALLS()
+	TRACE_PRINTKS()
 
 /*
  * Data section helpers
@@ -483,6 +481,7 @@
 	*(.init.rodata)							\
 	MCOUNT_REC()							\
 	FTRACE_EVENTS()							\
+	TRACE_SYSCALLS()						\
 	DEV_DISCARD(init.rodata)					\
 	CPU_DISCARD(init.rodata)					\
 	MEM_DISCARD(init.rodata)					\
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 45508fec366d..98664db1be47 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -125,8 +125,7 @@ extern struct trace_event_functions enter_syscall_print_funcs;
 extern struct trace_event_functions exit_syscall_print_funcs;
 
 #define SYSCALL_TRACE_ENTER_EVENT(sname)				\
-	static struct syscall_metadata					\
-	__attribute__((__aligned__(4))) __syscall_meta_##sname;		\
+	static struct syscall_metadata __syscall_meta_##sname;		\
 	static struct ftrace_event_call __used				\
 	  event_enter_##sname = {					\
 		.name                   = "sys_enter"#sname,		\
@@ -140,8 +139,7 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	__TRACE_EVENT_FLAGS(enter_##sname, TRACE_EVENT_FL_CAP_ANY)
 
 #define SYSCALL_TRACE_EXIT_EVENT(sname)					\
-	static struct syscall_metadata					\
-	__attribute__((__aligned__(4))) __syscall_meta_##sname;		\
+	static struct syscall_metadata __syscall_meta_##sname;		\
 	static struct ftrace_event_call __used				\
 	  event_exit_##sname = {					\
 		.name                   = "sys_exit"#sname,		\
@@ -158,8 +156,6 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 	SYSCALL_TRACE_ENTER_EVENT(sname);			\
 	SYSCALL_TRACE_EXIT_EVENT(sname);			\
 	static struct syscall_metadata __used			\
-	  __attribute__((__aligned__(4)))			\
-	  __attribute__((section("__syscalls_metadata")))	\
 	  __syscall_meta_##sname = {				\
 		.name 		= "sys"#sname,			\
 		.nb_args 	= nb,				\
@@ -168,14 +164,15 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.enter_event	= &event_enter_##sname,		\
 		.exit_event	= &event_exit_##sname,		\
 		.enter_fields	= LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
-	};
+	};							\
+	static struct syscall_metadata __used			\
+	  __attribute__((section("__syscalls_metadata")))	\
+	 *__p_syscall_meta_##sname = &__syscall_meta_##sname;
 
 #define SYSCALL_DEFINE0(sname)					\
 	SYSCALL_TRACE_ENTER_EVENT(_##sname);			\
 	SYSCALL_TRACE_EXIT_EVENT(_##sname);			\
 	static struct syscall_metadata __used			\
-	  __attribute__((__aligned__(4)))			\
-	  __attribute__((section("__syscalls_metadata")))	\
 	  __syscall_meta__##sname = {				\
 		.name 		= "sys_"#sname,			\
 		.nb_args 	= 0,				\
@@ -183,6 +180,9 @@ extern struct trace_event_functions exit_syscall_print_funcs;
 		.exit_event	= &event_exit__##sname,		\
 		.enter_fields	= LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \
 	};							\
+	static struct syscall_metadata __used			\
+	  __attribute__((section("__syscalls_metadata")))	\
+	 *__p_syscall_meta_##sname = &__syscall_meta__##sname;	\
 	asmlinkage long sys_##sname(void)
 #else
 #define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index b706529b4fc7..5c9fe08d2093 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -55,20 +55,21 @@ struct ftrace_event_class event_class_syscall_exit = {
 	.raw_init	= init_syscall_trace,
 };
 
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
+extern struct syscall_metadata *__start_syscalls_metadata[];
+extern struct syscall_metadata *__stop_syscalls_metadata[];
 
 static struct syscall_metadata **syscalls_metadata;
 
-static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
+static __init struct syscall_metadata *
+find_syscall_meta(unsigned long syscall)
 {
-	struct syscall_metadata *start;
-	struct syscall_metadata *stop;
+	struct syscall_metadata **start;
+	struct syscall_metadata **stop;
 	char str[KSYM_SYMBOL_LEN];
 
 
-	start = (struct syscall_metadata *)__start_syscalls_metadata;
-	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
+	start = __start_syscalls_metadata;
+	stop = __stop_syscalls_metadata;
 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
 
 	for ( ; start < stop; start++) {
@@ -78,8 +79,8 @@ static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
 		 * with "SyS" instead of "sys", leading to an unwanted
 		 * mismatch.
 		 */
-		if (start->name && !strcmp(start->name + 3, str + 3))
-			return start;
+		if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
+			return *start;
 	}
 	return NULL;
 }
-- 
cgit v1.3-8-gc7d7


From f266a5110d453b7987194460ac7edd31f1a5426c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 3 Feb 2011 15:09:41 +0100
Subject: lockdep, timer: Fix del_timer_sync() annotation

Calling local_bh_enable() will want to actually start processing
softirqs, which isn't a good idea since this can get called with IRQs
disabled.

Cure this by using _local_bh_enable() which doesn't start processing
softirqs, and use raw_local_irq_save() to avoid any softirqs from
happening without letting lockdep think IRQs are in fact disabled.

Reported-by: Nick Bowler <nbowler@elliptictech.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Yong Zhang <yong.zhang0@gmail.com>
LKML-Reference: <20110203141548.039540914@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/timer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 43ca9936f2d0..d53ce66daea0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -969,10 +969,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
 int del_timer_sync(struct timer_list *timer)
 {
 #ifdef CONFIG_LOCKDEP
+	unsigned long flags;
+
+	raw_local_irq_save(flags);
 	local_bh_disable();
 	lock_map_acquire(&timer->lockdep_map);
 	lock_map_release(&timer->lockdep_map);
-	local_bh_enable();
+	_local_bh_enable();
+	raw_local_irq_restore(flags);
 #endif
 	/*
 	 * don't use it in hardirq context, because it
-- 
cgit v1.3-8-gc7d7


From 2edeaa34a6e3f2c43b667f6c4f7b27944b811695 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 7 Feb 2011 13:36:10 +0000
Subject: CRED: Fix BUG() upon security_cred_alloc_blank() failure

In cred_alloc_blank() since 2.6.32, abort_creds(new) is called with
new->security == NULL and new->magic == 0 when security_cred_alloc_blank()
returns an error.  As a result, BUG() will be triggered if SELinux is enabled
or CONFIG_DEBUG_CREDENTIALS=y.

If CONFIG_DEBUG_CREDENTIALS=y, BUG() is called from __invalid_creds() because
cred->magic == 0.  Failing that, BUG() is called from selinux_cred_free()
because selinux_cred_free() is not expecting cred->security == NULL.  This does
not affect smack_cred_free(), tomoyo_cred_free() or apparmor_cred_free().

Fix these bugs by

(1) Set new->magic before calling security_cred_alloc_blank().

(2) Handle null cred->security in creds_are_invalid() and selinux_cred_free().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cred.c            | 12 ++++++++----
 security/selinux/hooks.c |  6 +++++-
 2 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index 6a1aa004e376..fcf104bb5aa9 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -252,13 +252,13 @@ struct cred *cred_alloc_blank(void)
 #endif
 
 	atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+	new->magic = CRED_MAGIC;
+#endif
 
 	if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
 		goto error;
 
-#ifdef CONFIG_DEBUG_CREDENTIALS
-	new->magic = CRED_MAGIC;
-#endif
 	return new;
 
 error:
@@ -748,7 +748,11 @@ bool creds_are_invalid(const struct cred *cred)
 	if (cred->magic != CRED_MAGIC)
 		return true;
 #ifdef CONFIG_SECURITY_SELINUX
-	if (selinux_is_enabled()) {
+	/*
+	 * cred->security == NULL if security_cred_alloc_blank() or
+	 * security_prepare_creds() returned an error.
+	 */
+	if (selinux_is_enabled() && cred->security) {
 		if ((unsigned long) cred->security < PAGE_SIZE)
 			return true;
 		if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index e276eb468536..c8d699270687 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3198,7 +3198,11 @@ static void selinux_cred_free(struct cred *cred)
 {
 	struct task_security_struct *tsec = cred->security;
 
-	BUG_ON((unsigned long) cred->security < PAGE_SIZE);
+	/*
+	 * cred->security == NULL if security_cred_alloc_blank() or
+	 * security_prepare_creds() returned an error.
+	 */
+	BUG_ON(cred->security && (unsigned long) cred->security < PAGE_SIZE);
 	cred->security = (void *) 0x7UL;
 	kfree(tsec);
 }
-- 
cgit v1.3-8-gc7d7


From fb2b2a1d37f80cc818fd4487b510f4e11816e5e1 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 7 Feb 2011 13:36:16 +0000
Subject: CRED: Fix memory and refcount leaks upon security_prepare_creds()
 failure

In prepare_kernel_cred() since 2.6.29, put_cred(new) is called without
assigning new->usage when security_prepare_creds() returned an error.  As a
result, memory for new and refcount for new->{user,group_info,tgcred} are
leaked because put_cred(new) won't call __put_cred() unless old->usage == 1.

Fix these leaks by assigning new->usage (and new->subscribers which was added
in 2.6.32) before calling security_prepare_creds().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cred.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cred.c b/kernel/cred.c
index fcf104bb5aa9..3a9d6dd53a6c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -657,6 +657,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	validate_creds(old);
 
 	*new = *old;
+	atomic_set(&new->usage, 1);
+	set_cred_subscribers(new, 0);
 	get_uid(new->user);
 	get_group_info(new->group_info);
 
@@ -674,8 +676,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
 	if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
 		goto error;
 
-	atomic_set(&new->usage, 1);
-	set_cred_subscribers(new, 0);
 	put_cred(old);
 	validate_creds(new);
 	return new;
-- 
cgit v1.3-8-gc7d7


From 7ff207928eb0761fa6b6c39eda82ac07a5241acf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 8 Feb 2011 15:18:00 +0100
Subject: Revert "lockdep, timer: Fix del_timer_sync() annotation"

Both attempts at trying to allow softirq usage for
del_timer_sync() failed (produced bogus warnings),
so revert the commit for this release:

  f266a5110d45: lockdep, timer: Fix del_timer_sync() annotation

and try again later.

Reported-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Yong Zhang <yong.zhang0@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1297174680.13327.107.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/timer.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index d53ce66daea0..d6459923d245 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -959,7 +959,7 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
  *
  * Synchronization rules: Callers must prevent restarting of the timer,
  * otherwise this function is meaningless. It must not be called from
- * hardirq contexts. The caller must not hold locks which would prevent
+ * interrupt contexts. The caller must not hold locks which would prevent
  * completion of the timer's handler. The timer's handler must not call
  * add_timer_on(). Upon exit the timer is not queued and the handler is
  * not running on any CPU.
@@ -971,12 +971,10 @@ int del_timer_sync(struct timer_list *timer)
 #ifdef CONFIG_LOCKDEP
 	unsigned long flags;
 
-	raw_local_irq_save(flags);
-	local_bh_disable();
+	local_irq_save(flags);
 	lock_map_acquire(&timer->lockdep_map);
 	lock_map_release(&timer->lockdep_map);
-	_local_bh_enable();
-	raw_local_irq_restore(flags);
+	local_irq_restore(flags);
 #endif
 	/*
 	 * don't use it in hardirq context, because it
-- 
cgit v1.3-8-gc7d7


From 5651f7f47dbb1cf2b95a60582546db4ff508e2b4 Mon Sep 17 00:00:00 2001
From: Don Zickus <dzickus@redhat.com>
Date: Wed, 9 Feb 2011 14:02:33 -0500
Subject: watchdog, nmi: Lower the severity of error messages

During boot if the hardlockup detector fails to initialize, it
complains very loudly.  Some failures should be expected under
certain situations, ie no lapics, or resource in-use.  Tone
those error messages down a bit.  Keep the rest at a high level.

Reported-by: Paul Bolle <pebolle@tiscali.nl>
Tested-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <1297278153-21111-1-git-send-email-dzickus@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/watchdog.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index f37f974aa81b..18bb15776c57 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -363,8 +363,14 @@ static int watchdog_nmi_enable(int cpu)
 		goto out_save;
 	}
 
-	printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
-	       cpu, PTR_ERR(event));
+
+	/* vary the KERN level based on the returned errno */
+	if (PTR_ERR(event) == -EOPNOTSUPP)
+		printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+	else if (PTR_ERR(event) == -ENOENT)
+		printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
+	else
+		printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
 	return PTR_ERR(event);
 
 	/* success path */
-- 
cgit v1.3-8-gc7d7


From ee24aebffb75a7f940cf52c8cf6910947b3130c0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Thu, 10 Feb 2011 17:53:55 -0800
Subject: cap_syslog: accept CAP_SYS_ADMIN for now

In commit ce6ada35bdf7 ("security: Define CAP_SYSLOG") Serge Hallyn
introduced CAP_SYSLOG, but broke backwards compatibility by no longer
accepting CAP_SYS_ADMIN as an override (it would cause a warning and
then reject the operation).

Re-instate CAP_SYS_ADMIN - but keeping the warning - as an acceptable
capability until any legacy applications have been updated.  There are
apparently applications out there that drop all capabilities except for
CAP_SYS_ADMIN in order to access the syslog.

(This is a re-implementation of a patch by Serge, cleaning the logic up
and making the code more readable)

Acked-by: Serge Hallyn <serge@hallyn.com>
Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 54 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 35 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 2ddbdc73aade..36231525e22f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -262,25 +262,47 @@ int dmesg_restrict = 1;
 int dmesg_restrict;
 #endif
 
+static int syslog_action_restricted(int type)
+{
+	if (dmesg_restrict)
+		return 1;
+	/* Unless restricted, we allow "read all" and "get buffer size" for everybody */
+	return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+static int check_syslog_permissions(int type, bool from_file)
+{
+	/*
+	 * If this is from /proc/kmsg and we've already opened it, then we've
+	 * already done the capabilities checks at open time.
+	 */
+	if (from_file && type != SYSLOG_ACTION_OPEN)
+		return 0;
+
+	if (syslog_action_restricted(type)) {
+		if (capable(CAP_SYSLOG))
+			return 0;
+		/* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
+		if (capable(CAP_SYS_ADMIN)) {
+			WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
+				 "but no CAP_SYSLOG (deprecated).\n");
+			return 0;
+		}
+		return -EPERM;
+	}
+	return 0;
+}
+
 int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
 	unsigned i, j, limit, count;
 	int do_clear = 0;
 	char c;
-	int error = 0;
+	int error;
 
-	/*
-	 * If this is from /proc/kmsg we only do the capabilities checks
-	 * at open time.
-	 */
-	if (type == SYSLOG_ACTION_OPEN || !from_file) {
-		if (dmesg_restrict && !capable(CAP_SYSLOG))
-			goto warn; /* switch to return -EPERM after 2.6.39 */
-		if ((type != SYSLOG_ACTION_READ_ALL &&
-		     type != SYSLOG_ACTION_SIZE_BUFFER) &&
-		    !capable(CAP_SYSLOG))
-			goto warn; /* switch to return -EPERM after 2.6.39 */
-	}
+	error = check_syslog_permissions(type, from_file);
+	if (error)
+		goto out;
 
 	error = security_syslog(type);
 	if (error)
@@ -423,12 +445,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 	}
 out:
 	return error;
-warn:
-	/* remove after 2.6.39 */
-	if (capable(CAP_SYS_ADMIN))
-		WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN "
-		  "but no CAP_SYSLOG (deprecated and denied).\n");
-	return -EPERM;
 }
 
 SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
-- 
cgit v1.3-8-gc7d7


From 6037b715d6fab139742c3df8851db4c823081561 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Wed, 9 Feb 2011 22:11:51 -0800
Subject: security: add cred argument to security_capable()

Expand security_capable() to include cred, so that it can be usable in a
wider range of call sites.

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/security.h | 6 +++---
 kernel/capability.c      | 2 +-
 security/security.c      | 5 ++---
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/security.h b/include/linux/security.h
index c642bb8b8f5a..b2b7f9749f5e 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -1662,7 +1662,7 @@ int security_capset(struct cred *new, const struct cred *old,
 		    const kernel_cap_t *effective,
 		    const kernel_cap_t *inheritable,
 		    const kernel_cap_t *permitted);
-int security_capable(int cap);
+int security_capable(const struct cred *cred, int cap);
 int security_real_capable(struct task_struct *tsk, int cap);
 int security_real_capable_noaudit(struct task_struct *tsk, int cap);
 int security_sysctl(struct ctl_table *table, int op);
@@ -1856,9 +1856,9 @@ static inline int security_capset(struct cred *new,
 	return cap_capset(new, old, effective, inheritable, permitted);
 }
 
-static inline int security_capable(int cap)
+static inline int security_capable(const struct cred *cred, int cap)
 {
-	return cap_capable(current, current_cred(), cap, SECURITY_CAP_AUDIT);
+	return cap_capable(current, cred, cap, SECURITY_CAP_AUDIT);
 }
 
 static inline int security_real_capable(struct task_struct *tsk, int cap)
diff --git a/kernel/capability.c b/kernel/capability.c
index 2f05303715a5..9e9385f132c8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -306,7 +306,7 @@ int capable(int cap)
 		BUG();
 	}
 
-	if (security_capable(cap) == 0) {
+	if (security_capable(current_cred(), cap) == 0) {
 		current->flags |= PF_SUPERPRIV;
 		return 1;
 	}
diff --git a/security/security.c b/security/security.c
index 739e40362f44..7b7308ace8c5 100644
--- a/security/security.c
+++ b/security/security.c
@@ -154,10 +154,9 @@ int security_capset(struct cred *new, const struct cred *old,
 				    effective, inheritable, permitted);
 }
 
-int security_capable(int cap)
+int security_capable(const struct cred *cred, int cap)
 {
-	return security_ops->capable(current, current_cred(), cap,
-				     SECURITY_CAP_AUDIT);
+	return security_ops->capable(current, cred, cap, SECURITY_CAP_AUDIT);
 }
 
 int security_real_capable(struct task_struct *tsk, int cap)
-- 
cgit v1.3-8-gc7d7


From 01e05e9a90b8f4c3997ae0537e87720eb475e532 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 10 Feb 2011 15:01:22 -0800
Subject: ptrace: use safer wake up on ptrace_detach()

The wake_up_process() call in ptrace_detach() is spurious and not
interlocked with the tracee state.  IOW, the tracee could be running or
sleeping in any place in the kernel by the time wake_up_process() is
called.  This can lead to the tracee waking up unexpectedly which can be
dangerous.

The wake_up is spurious and should be removed but for now reduce its
toxicity by only waking up if the tracee is in TRACED or STOPPED state.

This bug can possibly be used as an attack vector.  I don't think it
will take too much effort to come up with an attack which triggers oops
somewhere.  Most sleeps are wrapped in condition test loops and should
be safe but we have quite a number of places where sleep and wakeup
conditions are expected to be interlocked.  Although the window of
opportunity is tiny, ptrace can be used by non-privileged users and with
some loading the window can definitely be extended and exploited.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 99bbaa3e5b0d..1708b1e2972d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 		child->exit_code = data;
 		dead = __ptrace_detach(current, child);
 		if (!child->exit_state)
-			wake_up_process(child);
+			wake_up_state(child, TASK_TRACED | TASK_STOPPED);
 	}
 	write_unlock_irq(&tasklist_lock);
 
-- 
cgit v1.3-8-gc7d7


From f590308536db432e4747f562b29e5858123938e9 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees.cook@canonical.com>
Date: Fri, 11 Feb 2011 19:21:25 -0800
Subject: timer debug: Hide kernel addresses via %pK in /proc/timer_list

In the continuing effort to avoid kernel addresses leaking to
unprivileged users, this patch switches to %pK for
/proc/timer_list reporting.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Dan Rosenberg <drosenberg@vsecurity.com>
Cc: Eugene Teo <eugeneteo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20110212032125.GA23571@outflux.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/timer_list.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 32a19f9397fc..3258455549f4 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -41,7 +41,7 @@ static void print_name_offset(struct seq_file *m, void *sym)
 	char symname[KSYM_NAME_LEN];
 
 	if (lookup_symbol_name((unsigned long)sym, symname) < 0)
-		SEQ_printf(m, "<%p>", sym);
+		SEQ_printf(m, "<%pK>", sym);
 	else
 		SEQ_printf(m, "%s", symname);
 }
@@ -112,7 +112,7 @@ next_one:
 static void
 print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
 {
-	SEQ_printf(m, "  .base:       %p\n", base);
+	SEQ_printf(m, "  .base:       %pK\n", base);
 	SEQ_printf(m, "  .index:      %d\n",
 			base->index);
 	SEQ_printf(m, "  .resolution: %Lu nsecs\n",
-- 
cgit v1.3-8-gc7d7


From 7576958a9d5a4a677ad7dd40901cdbb6c1110c98 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 14 Feb 2011 14:04:46 +0100
Subject: workqueue: wake up a worker when a rescuer is leaving a gcwq

After executing the matching works, a rescuer leaves the gcwq whether
there are more pending works or not.  This may decrease the
concurrency level to zero and stall execution until a new work item is
queued on the gcwq.

Make rescuer wake up a regular worker when it leaves a gcwq if there
are more works to execute, so that execution isn't stalled.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ray Jui <rjui@broadcom.com>
Cc: stable@kernel.org
---
 kernel/workqueue.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 11869faa6819..90a17ca2ad0b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2047,6 +2047,15 @@ repeat:
 				move_linked_works(work, scheduled, &n);
 
 		process_scheduled_works(rescuer);
+
+		/*
+		 * Leave this gcwq.  If keep_working() is %true, notify a
+		 * regular worker; otherwise, we end up with 0 concurrency
+		 * and stalling the execution.
+		 */
+		if (keep_working(gcwq))
+			wake_up_worker(gcwq);
+
 		spin_unlock_irq(&gcwq->lock);
 	}
 
-- 
cgit v1.3-8-gc7d7


From 4fe757dd48a9e95e1a071291f15dda5421dacb66 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 15 Feb 2011 22:26:07 +0100
Subject: perf: Fix throttle logic

It was possible to call pmu::start() on an already running event. In
particular this lead so some wreckage as the hrtimer events would
re-initialize active timers.

This was due to throttled events being activated again by scheduling.
Scheduling in a context would add and force start events, resulting in
running events with a possible throttle status. The next tick to hit
that task will then try to unthrottle the event and call ->start() on
an already running event.

Reported-by: Jeff Moyer <jmoyer@redhat.com>
Cc: <stable@kernel.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 999835b6112b..656222fcf767 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -782,6 +782,10 @@ retry:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
@@ -794,6 +798,17 @@ event_sched_in(struct perf_event *event,
 
 	event->state = PERF_EVENT_STATE_ACTIVE;
 	event->oncpu = smp_processor_id();
+
+	/*
+	 * Unthrottle events, since we scheduled we might have missed several
+	 * ticks already, also for a heavily scheduling task there is little
+	 * guarantee it'll get a tick in a timely manner.
+	 */
+	if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
+		perf_log_throttle(event, 1);
+		event->hw.interrupts = 0;
+	}
+
 	/*
 	 * The new state must be visible before we turn it on in the hardware:
 	 */
@@ -1596,10 +1611,6 @@ void __perf_event_task_sched_in(struct task_struct *task)
 	}
 }
 
-#define MAX_INTERRUPTS (~0ULL)
-
-static void perf_log_throttle(struct perf_event *event, int enable);
-
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	u64 frequency = event->attr.sample_freq;
-- 
cgit v1.3-8-gc7d7


From 58a69cb47ec6991bf006a3e5d202e8571b0327a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 09:25:31 +0100
Subject: workqueue, freezer: unify spelling of 'freeze' + 'able' to
 'freezable'

There are two spellings in use for 'freeze' + 'able' - 'freezable' and
'freezeable'.  The former is the more prominent one.  The latter is
mostly used by workqueue and in a few other odd places.  Unify the
spelling to 'freezable'.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Acked-by: Dmitry Torokhov <dtor@mail.ru>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Steven Whitehouse <swhiteho@redhat.com>
---
 Documentation/workqueue.txt      |  4 ++--
 drivers/memstick/core/memstick.c |  2 +-
 drivers/misc/tifm_core.c         |  2 +-
 drivers/misc/vmw_balloon.c       |  2 +-
 drivers/mtd/nand/r852.c          |  2 +-
 drivers/mtd/sm_ftl.c             |  2 +-
 drivers/net/can/mcp251x.c        |  2 +-
 drivers/tty/serial/max3100.c     |  2 +-
 drivers/tty/serial/max3107.c     |  2 +-
 fs/gfs2/glock.c                  |  4 ++--
 fs/gfs2/main.c                   |  2 +-
 include/linux/freezer.h          |  2 +-
 include/linux/sched.h            |  2 +-
 include/linux/workqueue.h        |  8 ++++----
 kernel/power/main.c              |  2 +-
 kernel/power/process.c           |  6 +++---
 kernel/workqueue.c               | 24 ++++++++++++------------
 17 files changed, 35 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/workqueue.txt b/Documentation/workqueue.txt
index 996a27d9b8db..01c513fac40e 100644
--- a/Documentation/workqueue.txt
+++ b/Documentation/workqueue.txt
@@ -190,9 +190,9 @@ resources, scheduled and executed.
 	* Long running CPU intensive workloads which can be better
 	  managed by the system scheduler.
 
-  WQ_FREEZEABLE
+  WQ_FREEZABLE
 
-	A freezeable wq participates in the freeze phase of the system
+	A freezable wq participates in the freeze phase of the system
 	suspend operations.  Work items on the wq are drained and no
 	new work item starts execution until thawed.
 
diff --git a/drivers/memstick/core/memstick.c b/drivers/memstick/core/memstick.c
index e9a3eab7b0cf..8c1d85e27be4 100644
--- a/drivers/memstick/core/memstick.c
+++ b/drivers/memstick/core/memstick.c
@@ -621,7 +621,7 @@ static int __init memstick_init(void)
 {
 	int rc;
 
-	workqueue = create_freezeable_workqueue("kmemstick");
+	workqueue = create_freezable_workqueue("kmemstick");
 	if (!workqueue)
 		return -ENOMEM;
 
diff --git a/drivers/misc/tifm_core.c b/drivers/misc/tifm_core.c
index 5f6852dff40b..44d4475a09dd 100644
--- a/drivers/misc/tifm_core.c
+++ b/drivers/misc/tifm_core.c
@@ -329,7 +329,7 @@ static int __init tifm_init(void)
 {
 	int rc;
 
-	workqueue = create_freezeable_workqueue("tifm");
+	workqueue = create_freezable_workqueue("tifm");
 	if (!workqueue)
 		return -ENOMEM;
 
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c
index 4d2ea8e80140..6df5a55da110 100644
--- a/drivers/misc/vmw_balloon.c
+++ b/drivers/misc/vmw_balloon.c
@@ -785,7 +785,7 @@ static int __init vmballoon_init(void)
 	if (x86_hyper != &x86_hyper_vmware)
 		return -ENODEV;
 
-	vmballoon_wq = create_freezeable_workqueue("vmmemctl");
+	vmballoon_wq = create_freezable_workqueue("vmmemctl");
 	if (!vmballoon_wq) {
 		pr_err("failed to create workqueue\n");
 		return -ENOMEM;
diff --git a/drivers/mtd/nand/r852.c b/drivers/mtd/nand/r852.c
index d9d7efbc77cc..6322d1fb5d62 100644
--- a/drivers/mtd/nand/r852.c
+++ b/drivers/mtd/nand/r852.c
@@ -930,7 +930,7 @@ int  r852_probe(struct pci_dev *pci_dev, const struct pci_device_id *id)
 
 	init_completion(&dev->dma_done);
 
-	dev->card_workqueue = create_freezeable_workqueue(DRV_NAME);
+	dev->card_workqueue = create_freezable_workqueue(DRV_NAME);
 
 	if (!dev->card_workqueue)
 		goto error9;
diff --git a/drivers/mtd/sm_ftl.c b/drivers/mtd/sm_ftl.c
index 67822cf6c025..ac0d6a8613b5 100644
--- a/drivers/mtd/sm_ftl.c
+++ b/drivers/mtd/sm_ftl.c
@@ -1258,7 +1258,7 @@ static struct mtd_blktrans_ops sm_ftl_ops = {
 static __init int sm_module_init(void)
 {
 	int error = 0;
-	cache_flush_workqueue = create_freezeable_workqueue("smflush");
+	cache_flush_workqueue = create_freezable_workqueue("smflush");
 
 	if (IS_ERR(cache_flush_workqueue))
 		return PTR_ERR(cache_flush_workqueue);
diff --git a/drivers/net/can/mcp251x.c b/drivers/net/can/mcp251x.c
index 7ab534aee452..7513c4523ac4 100644
--- a/drivers/net/can/mcp251x.c
+++ b/drivers/net/can/mcp251x.c
@@ -940,7 +940,7 @@ static int mcp251x_open(struct net_device *net)
 		goto open_unlock;
 	}
 
-	priv->wq = create_freezeable_workqueue("mcp251x_wq");
+	priv->wq = create_freezable_workqueue("mcp251x_wq");
 	INIT_WORK(&priv->tx_work, mcp251x_tx_work_handler);
 	INIT_WORK(&priv->restart_work, mcp251x_restart_work_handler);
 
diff --git a/drivers/tty/serial/max3100.c b/drivers/tty/serial/max3100.c
index beb1afa27d8d..7b951adac54b 100644
--- a/drivers/tty/serial/max3100.c
+++ b/drivers/tty/serial/max3100.c
@@ -601,7 +601,7 @@ static int max3100_startup(struct uart_port *port)
 	s->rts = 0;
 
 	sprintf(b, "max3100-%d", s->minor);
-	s->workqueue = create_freezeable_workqueue(b);
+	s->workqueue = create_freezable_workqueue(b);
 	if (!s->workqueue) {
 		dev_warn(&s->spi->dev, "cannot create workqueue\n");
 		return -EBUSY;
diff --git a/drivers/tty/serial/max3107.c b/drivers/tty/serial/max3107.c
index 910870edf708..750b4f627315 100644
--- a/drivers/tty/serial/max3107.c
+++ b/drivers/tty/serial/max3107.c
@@ -833,7 +833,7 @@ static int max3107_startup(struct uart_port *port)
 	struct max3107_port *s = container_of(port, struct max3107_port, port);
 
 	/* Initialize work queue */
-	s->workqueue = create_freezeable_workqueue("max3107");
+	s->workqueue = create_freezable_workqueue("max3107");
 	if (!s->workqueue) {
 		dev_err(&s->spi->dev, "Workqueue creation failed\n");
 		return -EBUSY;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 08a8beb152e6..7cd9a5a68d59 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1779,11 +1779,11 @@ int __init gfs2_glock_init(void)
 #endif
 
 	glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
-					  WQ_HIGHPRI | WQ_FREEZEABLE, 0);
+					  WQ_HIGHPRI | WQ_FREEZABLE, 0);
 	if (IS_ERR(glock_workqueue))
 		return PTR_ERR(glock_workqueue);
 	gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
-						WQ_MEM_RECLAIM | WQ_FREEZEABLE,
+						WQ_MEM_RECLAIM | WQ_FREEZABLE,
 						0);
 	if (IS_ERR(gfs2_delete_workqueue)) {
 		destroy_workqueue(glock_workqueue);
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index ebef7ab6e17e..85ba027d1c4d 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -144,7 +144,7 @@ static int __init init_gfs2_fs(void)
 
 	error = -ENOMEM;
 	gfs_recovery_wq = alloc_workqueue("gfs_recovery",
-					  WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0);
+					  WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
 	if (!gfs_recovery_wq)
 		goto fail_wq;
 
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index da7e52b099f3..1effc8b56b4e 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -109,7 +109,7 @@ static inline void freezer_count(void)
 }
 
 /*
- * Check if the task should be counted as freezeable by the freezer
+ * Check if the task should be counted as freezable by the freezer
  */
 static inline int freezer_should_skip(struct task_struct *p)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d747f948b34e..777d8a5ed06b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1744,7 +1744,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
-#define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
+#define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezable */
 #define PF_FREEZER_NOSIG 0x80000000	/* Freezer won't send signals to it */
 
 /*
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 1ac11586a2f5..f7998a3bf020 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -250,7 +250,7 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 enum {
 	WQ_NON_REENTRANT	= 1 << 0, /* guarantee non-reentrance */
 	WQ_UNBOUND		= 1 << 1, /* not bound to any cpu */
-	WQ_FREEZEABLE		= 1 << 2, /* freeze during suspend */
+	WQ_FREEZABLE		= 1 << 2, /* freeze during suspend */
 	WQ_MEM_RECLAIM		= 1 << 3, /* may be used for memory reclaim */
 	WQ_HIGHPRI		= 1 << 4, /* high priority */
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu instensive workqueue */
@@ -318,7 +318,7 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active,
 /**
  * alloc_ordered_workqueue - allocate an ordered workqueue
  * @name: name of the workqueue
- * @flags: WQ_* flags (only WQ_FREEZEABLE and WQ_MEM_RECLAIM are meaningful)
+ * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
  *
  * Allocate an ordered workqueue.  An ordered workqueue executes at
  * most one work item at any given time in the queued order.  They are
@@ -335,8 +335,8 @@ alloc_ordered_workqueue(const char *name, unsigned int flags)
 
 #define create_workqueue(name)					\
 	alloc_workqueue((name), WQ_MEM_RECLAIM, 1)
-#define create_freezeable_workqueue(name)			\
-	alloc_workqueue((name), WQ_FREEZEABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, 1)
+#define create_freezable_workqueue(name)			\
+	alloc_workqueue((name), WQ_FREEZABLE | WQ_UNBOUND | WQ_MEM_RECLAIM, 1)
 #define create_singlethread_workqueue(name)			\
 	alloc_workqueue((name), WQ_UNBOUND | WQ_MEM_RECLAIM, 1)
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7b5db6a8561e..701853042c28 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -326,7 +326,7 @@ EXPORT_SYMBOL_GPL(pm_wq);
 
 static int __init pm_start_workqueue(void)
 {
-	pm_wq = alloc_workqueue("pm", WQ_FREEZEABLE, 0);
+	pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0);
 
 	return pm_wq ? 0 : -ENOMEM;
 }
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d6d2a10320e0..0cf3a27a6c9d 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,7 +22,7 @@
  */
 #define TIMEOUT	(20 * HZ)
 
-static inline int freezeable(struct task_struct * p)
+static inline int freezable(struct task_struct * p)
 {
 	if ((p == current) ||
 	    (p->flags & PF_NOFREEZE) ||
@@ -53,7 +53,7 @@ static int try_to_freeze_tasks(bool sig_only)
 		todo = 0;
 		read_lock(&tasklist_lock);
 		do_each_thread(g, p) {
-			if (frozen(p) || !freezeable(p))
+			if (frozen(p) || !freezable(p))
 				continue;
 
 			if (!freeze_task(p, sig_only))
@@ -167,7 +167,7 @@ static void thaw_tasks(bool nosig_only)
 
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
-		if (!freezeable(p))
+		if (!freezable(p))
 			continue;
 
 		if (nosig_only && should_send_signal(p))
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 90a17ca2ad0b..88a3e34f51f6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2965,7 +2965,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
 	 */
 	spin_lock(&workqueue_lock);
 
-	if (workqueue_freezing && wq->flags & WQ_FREEZEABLE)
+	if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
 		for_each_cwq_cpu(cpu, wq)
 			get_cwq(cpu, wq)->max_active = 0;
 
@@ -3077,7 +3077,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
 
 		spin_lock_irq(&gcwq->lock);
 
-		if (!(wq->flags & WQ_FREEZEABLE) ||
+		if (!(wq->flags & WQ_FREEZABLE) ||
 		    !(gcwq->flags & GCWQ_FREEZING))
 			get_cwq(gcwq->cpu, wq)->max_active = max_active;
 
@@ -3327,7 +3327,7 @@ static int __cpuinit trustee_thread(void *__gcwq)
 	 * want to get it over with ASAP - spam rescuers, wake up as
 	 * many idlers as necessary and create new ones till the
 	 * worklist is empty.  Note that if the gcwq is frozen, there
-	 * may be frozen works in freezeable cwqs.  Don't declare
+	 * may be frozen works in freezable cwqs.  Don't declare
 	 * completion while frozen.
 	 */
 	while (gcwq->nr_workers != gcwq->nr_idle ||
@@ -3585,9 +3585,9 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
 /**
  * freeze_workqueues_begin - begin freezing workqueues
  *
- * Start freezing workqueues.  After this function returns, all
- * freezeable workqueues will queue new works to their frozen_works
- * list instead of gcwq->worklist.
+ * Start freezing workqueues.  After this function returns, all freezable
+ * workqueues will queue new works to their frozen_works list instead of
+ * gcwq->worklist.
  *
  * CONTEXT:
  * Grabs and releases workqueue_lock and gcwq->lock's.
@@ -3613,7 +3613,7 @@ void freeze_workqueues_begin(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (cwq && wq->flags & WQ_FREEZEABLE)
+			if (cwq && wq->flags & WQ_FREEZABLE)
 				cwq->max_active = 0;
 		}
 
@@ -3624,7 +3624,7 @@ void freeze_workqueues_begin(void)
 }
 
 /**
- * freeze_workqueues_busy - are freezeable workqueues still busy?
+ * freeze_workqueues_busy - are freezable workqueues still busy?
  *
  * Check whether freezing is complete.  This function must be called
  * between freeze_workqueues_begin() and thaw_workqueues().
@@ -3633,8 +3633,8 @@ void freeze_workqueues_begin(void)
  * Grabs and releases workqueue_lock.
  *
  * RETURNS:
- * %true if some freezeable workqueues are still busy.  %false if
- * freezing is complete.
+ * %true if some freezable workqueues are still busy.  %false if freezing
+ * is complete.
  */
 bool freeze_workqueues_busy(void)
 {
@@ -3654,7 +3654,7 @@ bool freeze_workqueues_busy(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+			if (!cwq || !(wq->flags & WQ_FREEZABLE))
 				continue;
 
 			BUG_ON(cwq->nr_active < 0);
@@ -3699,7 +3699,7 @@ void thaw_workqueues(void)
 		list_for_each_entry(wq, &workqueues, list) {
 			struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
 
-			if (!cwq || !(wq->flags & WQ_FREEZEABLE))
+			if (!cwq || !(wq->flags & WQ_FREEZABLE))
 				continue;
 
 			/* restore max_active and repopulate worklist */
-- 
cgit v1.3-8-gc7d7


From 3233cdbd9fa347a6d6897a94cc6ed0302ae83c4f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Feb 2011 18:10:19 +0100
Subject: workqueue: make sure MAYDAY_INITIAL_TIMEOUT is at least 2 jiffies
 long

MAYDAY_INITIAL_TIMEOUT is defined as HZ / 100 and depending on
configuration may end up 0 or 1.  Even when it's 1, depending on when
the mayday timer is added in the current jiffy interval, it may expire
way before a jiffy has passed.

Make sure MAYDAY_INITIAL_TIMEOUT is at least two to guarantee that at
least a full jiffy has passed before calling rescuers.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Ray Jui <rjui@broadcom.com>
Cc: stable@kernel.org
---
 kernel/workqueue.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 88a3e34f51f6..ee6578b578ad 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -79,7 +79,9 @@ enum {
 	MAX_IDLE_WORKERS_RATIO	= 4,		/* 1/4 of busy can be idle */
 	IDLE_WORKER_TIMEOUT	= 300 * HZ,	/* keep idle ones for 5 mins */
 
-	MAYDAY_INITIAL_TIMEOUT	= HZ / 100,	/* call for help after 10ms */
+	MAYDAY_INITIAL_TIMEOUT  = HZ / 100 >= 2 ? HZ / 100 : 2,
+						/* call for help after 10ms
+						   (min two ticks) */
 	MAYDAY_INTERVAL		= HZ / 10,	/* and then every 100ms */
 	CREATE_COOLDOWN		= HZ,		/* time to breath after fail */
 	TRUSTEE_COOLDOWN	= HZ / 10,	/* for trustee draining */
-- 
cgit v1.3-8-gc7d7


From 2e725a065b0153f0c449318da1923a120477633d Mon Sep 17 00:00:00 2001
From: Stanislaw Gruszka <stf_xl@pop3.wp.pl>
Date: Sat, 12 Feb 2011 21:06:51 +0100
Subject: PM / Hibernate: Return error code when alloc_image_page() fails

Currently we return 0 in swsusp_alloc() when alloc_image_page() fails.
Fix that.  Also remove unneeded "error" variable since the only
useful value of error is -ENOMEM.

[rjw: Fixed up the changelog and changed subject.]

Signed-off-by: Stanislaw Gruszka <stf_xl@wp.pl>
Cc: stable@kernel.org
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/snapshot.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0dac75ea4456..64db648ff911 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1519,11 +1519,8 @@ static int
 swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 		unsigned int nr_pages, unsigned int nr_highmem)
 {
-	int error = 0;
-
 	if (nr_highmem > 0) {
-		error = get_highmem_buffer(PG_ANY);
-		if (error)
+		if (get_highmem_buffer(PG_ANY))
 			goto err_out;
 		if (nr_highmem > alloc_highmem) {
 			nr_highmem -= alloc_highmem;
@@ -1546,7 +1543,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
 
  err_out:
 	swsusp_free();
-	return error;
+	return -ENOMEM;
 }
 
 asmlinkage int swsusp_save(void)
-- 
cgit v1.3-8-gc7d7


From c1ee6264280e740a9d3ff3feef38642cf0a57013 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 17 Feb 2011 17:45:15 +0100
Subject: genirq: Prevent access beyond allocated_irqs bitmap

Lars-Peter Clausen pointed out:

   I stumbled upon this while looking through the existing archs using
   SPARSE_IRQ.  Even with SPARSE_IRQ the NR_IRQS is still the upper
   limit for the number of IRQs.

   Both PXA and MMP set NR_IRQS to IRQ_BOARD_START, with
   IRQ_BOARD_START being the number of IRQs used by the core.

   In various machine files the nr_irqs field of the ARM machine
   defintion struct is then set to "IRQ_BOARD_START + NR_BOARD_IRQS".

   As a result "nr_irqs" will greater then NR_IRQS which then again
   causes the "allocated_irqs" bitmap in the core irq code to be
   accessed beyond its size overwriting unrelated data.

The core code really misses a sanity check there.

This went unnoticed so far as by chance the compiler/linker places
data behind that bitmap which gets initialized later on those affected
platforms.

So the obvious fix would be to add a sanity check in early_irq_init()
and break all affected platforms. Though that check wants to be
backported to stable as well, which will require to fix all known
problematic platforms and probably some more yet not known ones as
well. Lots of churn.

A way simpler solution is to allocate a slightly larger bitmap and
avoid the whole churn w/o breaking anything. Add a few warnings when
an arch returns utter crap.

Reported-by: Lars-Peter Clausen <lars@metafoo.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org # .37
Cc: Haojian Zhuang <haojian.zhuang@marvell.com>
Cc: Eric Miao <eric.y.miao@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/irq/internals.h |  6 ++++++
 kernel/irq/irqdesc.c   | 11 ++++++++++-
 kernel/irq/resend.c    |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 4571ae7e085a..99c3bc8a6fb4 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -3,6 +3,12 @@
  */
 #include <linux/irqdesc.h>
 
+#ifdef CONFIG_SPARSE_IRQ
+# define IRQ_BITMAP_BITS	(NR_IRQS + 8196)
+#else
+# define IRQ_BITMAP_BITS	NR_IRQS
+#endif
+
 extern int noirqdebug;
 
 #define irq_data_to_desc(data)	container_of(data, struct irq_desc, irq_data)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 282f20230e67..2039bea31bdf 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -94,7 +94,7 @@ int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
 static DEFINE_MUTEX(sparse_irq_lock);
-static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
+static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS);
 
 #ifdef CONFIG_SPARSE_IRQ
 
@@ -217,6 +217,15 @@ int __init early_irq_init(void)
 	initcnt = arch_probe_nr_irqs();
 	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
 
+	if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
+		nr_irqs = IRQ_BITMAP_BITS;
+
+	if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
+		initcnt = IRQ_BITMAP_BITS;
+
+	if (initcnt > nr_irqs)
+		nr_irqs = initcnt;
+
 	for (i = 0; i < initcnt; i++) {
 		desc = alloc_desc(i, node);
 		set_bit(i, allocated_irqs);
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 891115a929aa..dc49358b73fa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -23,7 +23,7 @@
 #ifdef CONFIG_HARDIRQS_SW_RESEND
 
 /* Bitmap to handle software resend of interrupts: */
-static DECLARE_BITMAP(irqs_resend, NR_IRQS);
+static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS);
 
 /*
  * Run software resends of IRQ's
-- 
cgit v1.3-8-gc7d7


From 6d83f94db95cfe65d2a6359cccdf61cf087c2598 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 18 Feb 2011 23:27:23 +0100
Subject: genirq: Disable the SHIRQ_DEBUG call in request_threaded_irq for now

With CONFIG_SHIRQ_DEBUG=y we call a newly installed interrupt handler
in request_threaded_irq().

The original implementation (commit a304e1b8) called the handler
_BEFORE_ it was installed, but that caused problems with handlers
calling disable_irq_nosync(). See commit 377bf1e4.

It's braindead in the first place to call disable_irq_nosync in shared
handlers, but ....

Moving this call after we installed the handler looks innocent, but it
is very subtle broken on SMP.

Interrupt handlers rely on the fact, that the irq core prevents
reentrancy.

Now this debug call violates that promise because we run the handler
w/o the IRQ_INPROGRESS protection - which we cannot apply here because
that would result in a possibly forever masked interrupt line.

A concurrent real hardware interrupt on a different CPU results in
handler reentrancy and can lead to complete wreckage, which was
unfortunately observed in reality and took a fricking long time to
debug.

Leave the code here for now. We want this debug feature, but that's
not easy to fix. We really should get rid of those
disable_irq_nosync() abusers and remove that function completely.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Anton Vorontsov <avorontsov@ru.mvista.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: stable@kernel.org # .28 -> .37
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f747dd..9033c1c70828 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1100,7 +1100,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	if (retval)
 		kfree(action);
 
-#ifdef CONFIG_DEBUG_SHIRQ
+#ifdef CONFIG_DEBUG_SHIRQ_FIXME
 	if (!retval && (irqflags & IRQF_SHARED)) {
 		/*
 		 * It's a shared IRQ -- the driver ought to be prepared for it
-- 
cgit v1.3-8-gc7d7


From 3a142a0672b48a853f00af61f184c7341ac9c99d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 25 Feb 2011 22:34:23 +0100
Subject: clockevents: Prevent oneshot mode when broadcast device is periodic

When the per cpu timer is marked CLOCK_EVT_FEAT_C3STOP, then we only
can switch into oneshot mode, when the backup broadcast device
supports oneshot mode as well. Otherwise we would try to switch the
broadcast device into an unsupported mode unconditionally. This went
unnoticed so far as the current available broadcast devices support
oneshot mode. Seth unearthed this problem while debugging and working
around an hpet related BIOS wreckage.

Add the necessary check to tick_is_oneshot_available().

Reported-and-tested-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <alpine.LFD.2.00.1102252231200.2701@localhost6.localdomain6>
Cc: stable@kernel.org # .21 ->
---
 kernel/time/tick-broadcast.c | 10 ++++++++++
 kernel/time/tick-common.c    |  6 +++++-
 kernel/time/tick-internal.h  |  3 +++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 48b2761b5668..a3b5aff62606 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -600,4 +600,14 @@ int tick_broadcast_oneshot_active(void)
 	return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
 }
 
+/*
+ * Check whether the broadcast device supports oneshot.
+ */
+bool tick_broadcast_oneshot_available(void)
+{
+	struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+	return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
+}
+
 #endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 051bc80a0c43..ed228ef6f6b8 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -51,7 +51,11 @@ int tick_is_oneshot_available(void)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
-	return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
+	if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+		return 0;
+	if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+		return 1;
+	return tick_broadcast_oneshot_available();
 }
 
 /*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..f65d3a723a64 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,6 +36,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
 extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
 extern int tick_broadcast_oneshot_active(void);
 extern void tick_check_oneshot_broadcast(int cpu);
+bool tick_broadcast_oneshot_available(void);
 # else /* BROADCAST */
 static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
@@ -46,6 +47,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
 static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
 static inline int tick_broadcast_oneshot_active(void) { return 0; }
 static inline void tick_check_oneshot_broadcast(int cpu) { }
+static inline bool tick_broadcast_oneshot_available(void) { return true; }
 # endif /* !BROADCAST */
 
 #else /* !ONESHOT */
@@ -76,6 +78,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 	return 0;
 }
 static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline bool tick_broadcast_oneshot_available(void) { return false; }
 #endif /* !TICK_ONESHOT */
 
 /*
-- 
cgit v1.3-8-gc7d7


From 2d3a8497f8cc5aca14b722cd37d51f6c15ff9f74 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Thu, 3 Mar 2011 10:53:20 -0500
Subject: blktrace: Remove blk_fill_rwbs_rq.

If we enable trace events to trace block actions, We use
blk_fill_rwbs_rq to analyze the corresponding actions
in request's cmd_flags, but we only choose the minor 2 bits
from it, so most of other flags(e.g, REQ_SYNC) are missing.
For example, with a sync write we get:
write_test-2409  [001]   160.013869: block_rq_insert: 3,64 W 0 () 258135 + =
8 [write_test]

Since now we have integrated the flags of both bio and request,
it is safe to pass rq->cmd_flags directly to blk_fill_rwbs and
blk_fill_rwbs_rq isn't needed any more.

With this patch, after a sync write we get:
write_test-2417  [000]   226.603878: block_rq_insert: 3,64 WS 0 () 258135 +=
 8 [write_test]

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blktrace_api.h |  1 -
 include/trace/events/block.h |  6 +++---
 kernel/trace/blktrace.c      | 16 ----------------
 3 files changed, 3 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 3395cf7130f5..b22fb0d3db0f 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -245,7 +245,6 @@ static inline int blk_cmd_buf_len(struct request *rq)
 
 extern void blk_dump_cmd(char *buf, struct request *rq);
 extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
-extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
 
 #endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
 
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index aba421d68f6f..78f18adb49c8 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -31,7 +31,7 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
 					0 : blk_rq_sectors(rq);
 		__entry->errors    = rq->errors;
 
-		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 		blk_dump_cmd(__get_str(cmd), rq);
 	),
 
@@ -118,7 +118,7 @@ DECLARE_EVENT_CLASS(block_rq,
 		__entry->bytes     = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
 					blk_rq_bytes(rq) : 0;
 
-		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 		blk_dump_cmd(__get_str(cmd), rq);
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 	),
@@ -563,7 +563,7 @@ TRACE_EVENT(block_rq_remap,
 		__entry->nr_sector	= blk_rq_sectors(rq);
 		__entry->old_dev	= dev;
 		__entry->old_sector	= from;
-		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
 	),
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d95721f33702..cbafed7d4f38 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1827,21 +1827,5 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 	rwbs[i] = '\0';
 }
 
-void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
-{
-	int rw = rq->cmd_flags & 0x03;
-	int bytes;
-
-	if (rq->cmd_flags & REQ_DISCARD)
-		rw |= REQ_DISCARD;
-
-	if (rq->cmd_flags & REQ_SECURE)
-		rw |= REQ_SECURE;
-
-	bytes = blk_rq_bytes(rq);
-
-	blk_fill_rwbs(rwbs, rw, bytes);
-}
-
 #endif /* CONFIG_EVENT_TRACING */
 
-- 
cgit v1.3-8-gc7d7


From e3e89cc535223433a619d0969db3fa05cdd946b8 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 4 Mar 2011 09:23:30 -0800
Subject: Mark ptrace_{traceme,attach,detach} static

They are only used inside kernel/ptrace.c, and have been for a long
time.  We don't want to go back to the bad-old-days when architectures
did things on their own, so make them static and private.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 3 ---
 kernel/ptrace.c        | 6 +++---
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 092a04f874a8..a1147e5dd245 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -102,11 +102,8 @@
 
 extern long arch_ptrace(struct task_struct *child, long request,
 			unsigned long addr, unsigned long data);
-extern int ptrace_traceme(void);
 extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
-extern int ptrace_attach(struct task_struct *tsk);
-extern int ptrace_detach(struct task_struct *, unsigned int);
 extern void ptrace_disable(struct task_struct *);
 extern int ptrace_check_attach(struct task_struct *task, int kill);
 extern int ptrace_request(struct task_struct *child, long request,
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1708b1e2972d..e2302e40b360 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -163,7 +163,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	return !err;
 }
 
-int ptrace_attach(struct task_struct *task)
+static int ptrace_attach(struct task_struct *task)
 {
 	int retval;
 
@@ -219,7 +219,7 @@ out:
  * Performs checks and sets PT_PTRACED.
  * Should be used by all ptrace implementations for PTRACE_TRACEME.
  */
-int ptrace_traceme(void)
+static int ptrace_traceme(void)
 {
 	int ret = -EPERM;
 
@@ -293,7 +293,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
 	return false;
 }
 
-int ptrace_detach(struct task_struct *child, unsigned int data)
+static int ptrace_detach(struct task_struct *child, unsigned int data)
 {
 	bool dead = false;
 
-- 
cgit v1.3-8-gc7d7


From b75f38d659e6fc747eda64cb72f3920e29dd44a4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 4 Mar 2011 17:36:21 -0800
Subject: cpuset: add a missing unlock in cpuset_write_resmask()

Don't forget to release cgroup_mutex if alloc_trial_cpuset() fails.

[akpm@linux-foundation.org: avoid multiple return points]
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4349935c2ad8..e92e98189032 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1575,8 +1575,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 		return -ENODEV;
 
 	trialcs = alloc_trial_cpuset(cs);
-	if (!trialcs)
-		return -ENOMEM;
+	if (!trialcs) {
+		retval = -ENOMEM;
+		goto out;
+	}
 
 	switch (cft->private) {
 	case FILE_CPULIST:
@@ -1591,6 +1593,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	free_trial_cpuset(trialcs);
+out:
 	cgroup_unlock();
 	return retval;
 }
-- 
cgit v1.3-8-gc7d7