From b595076a180a56d1bb170e6eceda6eb9d76f4cd3 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Mon, 1 Nov 2010 15:38:34 -0400 Subject: tree-wide: fix comment/printk typos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "gadget", "through", "command", "maintain", "maintain", "controller", "address", "between", "initiali[zs]e", "instead", "function", "select", "already", "equal", "access", "management", "hierarchy", "registration", "interest", "relative", "memory", "offset", "already", Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- kernel/debug/kdb/kdb_main.c | 2 +- kernel/kexec.c | 2 +- kernel/power/swap.c | 2 +- kernel/sched.c | 2 +- kernel/sysctl_binary.c | 2 +- kernel/time/clocksource.c | 2 +- kernel/trace/trace_entries.h | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 37755d621924..7242cc71bb7e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2913,7 +2913,7 @@ static void __init kdb_cmd_init(void) } } -/* Intialize kdb_printf, breakpoint tables and kdb state */ +/* Initialize kdb_printf, breakpoint tables and kdb state */ void __init kdb_init(int lvl) { static int kdb_init_lvl = KDB_NOT_INITIALIZED; diff --git a/kernel/kexec.c b/kernel/kexec.c index b55045bc7563..ec19b92c7ebd 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -163,7 +163,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure - * the destination addreses are page aligned. Too many + * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0e4a86ccf94..cd09c22de03d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -865,7 +865,7 @@ out_finish: /** * swsusp_read - read the hibernation image. * @flags_p: flags passed by the "frozen" kernel in the image header should - * be written into this memeory location + * be written into this memory location */ int swsusp_read(unsigned int *flags_p) diff --git a/kernel/sched.c b/kernel/sched.c index aa14a56f9d03..554c0d6c489e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2568,7 +2568,7 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not alredy there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. this_rq() stays locked over invocation. */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 1357c5786064..d9c5fe4ff1b2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1193,7 +1193,7 @@ static ssize_t bin_dn_node_address(struct file *file, buf[result] = '\0'; - /* Convert the decnet addresss to binary */ + /* Convert the decnet address to binary */ result = -EIO; nodep = strchr(buf, '.') + 1; if (!nodep) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4b..ddbbaf2950ef 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -678,7 +678,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { - /* Intialize mult/shift and max_idle_ns */ + /* Initialize mult/shift and max_idle_ns */ __clocksource_updatefreq_scale(cs, scale, freq); /* Add clocksource to the clcoksource list */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e3dfecaf13e6..6cf223764be8 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -53,7 +53,7 @@ */ /* - * Function trace entry - function address and parent function addres: + * Function trace entry - function address and parent function address: */ FTRACE_ENTRY(function, ftrace_entry, -- cgit v1.3-8-gc7d7 From 9db3b9bcc7f53487da8766b32e2d790ad03c53b9 Mon Sep 17 00:00:00 2001 From: Ross Kirk Date: Fri, 22 Oct 2010 16:43:17 +0100 Subject: audit: error message typo correction Fixes a typo in the error message raised by audit when auditd has died. Signed-off-by: Ross Kirk -- Signed-off-by: Jiri Kosina --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 77770a034d59..e4956244ae50 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -400,7 +400,7 @@ static void kauditd_send_skb(struct sk_buff *skb) if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); - audit_log_lost("auditd dissapeared\n"); + audit_log_lost("auditd disappeared\n"); audit_pid = 0; /* we might get lucky and get this in the next auditd */ audit_hold_skb(skb); -- cgit v1.3-8-gc7d7 From e525fd89d380c4a94c0d63913a1dd1a593ed25e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:17 +0100 Subject: block: make blkdev_get/put() handle exclusive access Over time, block layer has accumulated a set of APIs dealing with bdev open, close, claim and release. * blkdev_get/put() are the primary open and close functions. * bd_claim/release() deal with exclusive open. * open/close_bdev_exclusive() are combination of open and claim and the other way around, respectively. * bd_link/unlink_disk_holder() to create and remove holder/slave symlinks. * open_by_devnum() wraps bdget() + blkdev_get(). The interface is a bit confusing and the decoupling of open and claim makes it impossible to properly guarantee exclusive access as in-kernel open + claim sequence can disturb the existing exclusive open even before the block layer knows the current open if for another exclusive access. Reorganize the interface such that, * blkdev_get() is extended to include exclusive access management. @holder argument is added and, if is @FMODE_EXCL specified, it will gain exclusive access atomically w.r.t. other exclusive accesses. * blkdev_put() is similarly extended. It now takes @mode argument and if @FMODE_EXCL is set, it releases an exclusive access. Also, when the last exclusive claim is released, the holder/slave symlinks are removed automatically. * bd_claim/release() and close_bdev_exclusive() are no longer necessary and either made static or removed. * bd_link_disk_holder() remains the same but bd_unlink_disk_holder() is no longer necessary and removed. * open_bdev_exclusive() becomes a simple wrapper around lookup_bdev() and blkdev_get(). It also has an unexpected extra bdev_read_only() test which probably should be moved into blkdev_get(). * open_by_devnum() is modified to take @holder argument and pass it to blkdev_get(). Most of bdev open/close operations are unified into blkdev_get/put() and most exclusive accesses are tested atomically at the open time (as it should). This cleans up code and removes some, both valid and invalid, but unnecessary all the same, corner cases. open_bdev_exclusive() and open_by_devnum() can use further cleanup - rename to blkdev_get_by_path() and blkdev_get_by_devt() and drop special features. Well, let's leave them for another day. Most conversions are straight-forward. drbd conversion is a bit more involved as there was some reordering, but the logic should stay the same. Signed-off-by: Tejun Heo Acked-by: Neil Brown Acked-by: Ryusuke Konishi Acked-by: Mike Snitzer Acked-by: Philipp Reisner Cc: Peter Osterlund Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Jan Kara Cc: Andrew Morton Cc: Andreas Dilger Cc: "Theodore Ts'o" Cc: Mark Fasheh Cc: Joel Becker Cc: Alex Elder Cc: Christoph Hellwig Cc: dm-devel@redhat.com Cc: drbd-dev@lists.linbit.com Cc: Leo Chen Cc: Scott Branden Cc: Chris Mason Cc: Steven Whitehouse Cc: Dave Kleikamp Cc: Joern Engel Cc: reiserfs-devel@vger.kernel.org Cc: Alexander Viro --- block/ioctl.c | 5 +- drivers/block/drbd/drbd_int.h | 2 - drivers/block/drbd/drbd_main.c | 7 +- drivers/block/drbd/drbd_nl.c | 103 ++++++++++----------------- drivers/block/pktcdvd.c | 22 +++--- drivers/char/raw.c | 14 ++-- drivers/md/dm-table.c | 15 +--- drivers/md/md.c | 14 +--- drivers/mtd/devices/block2mtd.c | 17 ++--- drivers/s390/block/dasd_genhd.c | 2 +- fs/block_dev.c | 149 ++++++++++++++-------------------------- fs/btrfs/volumes.c | 14 ++-- fs/ext3/super.c | 12 +--- fs/ext4/super.c | 12 +--- fs/gfs2/ops_fstype.c | 4 +- fs/jfs/jfs_logmgr.c | 17 ++--- fs/logfs/dev_bdev.c | 4 +- fs/nilfs2/super.c | 4 +- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/partitions/check.c | 2 +- fs/reiserfs/journal.c | 17 ++--- fs/super.c | 14 ++-- fs/xfs/linux-2.6/xfs_super.c | 2 +- include/linux/fs.h | 14 ++-- kernel/power/swap.c | 5 +- mm/swapfile.c | 7 +- 26 files changed, 162 insertions(+), 318 deletions(-) (limited to 'kernel') diff --git a/block/ioctl.c b/block/ioctl.c index d724ceb1d465..cc46d499fd27 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -294,11 +294,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EINVAL; if (get_user(n, (int __user *) arg)) return -EFAULT; - if (!(mode & FMODE_EXCL) && bd_claim(bdev, &bdev) < 0) + if (!(mode & FMODE_EXCL) && + blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) return -EBUSY; ret = set_blocksize(bdev, n); if (!(mode & FMODE_EXCL)) - bd_release(bdev); + blkdev_put(bdev, mode | FMODE_EXCL); return ret; case BLKPG: ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 9bdcf4393c0a..0590b9f67ec6 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -923,8 +923,6 @@ struct drbd_md { struct drbd_backing_dev { struct block_device *backing_bdev; struct block_device *md_bdev; - struct file *lo_file; - struct file *md_file; struct drbd_md md; struct disk_conf dc; /* The user provided config... */ sector_t known_size; /* last known size of that backing device */ diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 25c7a73c5062..7ec1a82064a9 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -3361,11 +3361,8 @@ void drbd_free_bc(struct drbd_backing_dev *ldev) if (ldev == NULL) return; - bd_release(ldev->backing_bdev); - bd_release(ldev->md_bdev); - - fput(ldev->lo_file); - fput(ldev->md_file); + blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(ldev); } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 87925e97e613..fd0346090289 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -855,7 +855,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp sector_t max_possible_sectors; sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ - struct inode *inode, *inode2; + struct block_device *bdev; struct lru_cache *resync_lru = NULL; union drbd_state ns, os; unsigned int max_seg_s; @@ -902,46 +902,40 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } } - nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); - if (IS_ERR(nbc->lo_file)) { + bdev = open_bdev_exclusive(nbc->dc.backing_dev, + FMODE_READ | FMODE_WRITE, mdev); + if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, - PTR_ERR(nbc->lo_file)); - nbc->lo_file = NULL; + PTR_ERR(bdev)); retcode = ERR_OPEN_DISK; goto fail; } + nbc->backing_bdev = bdev; - inode = nbc->lo_file->f_dentry->d_inode; - - if (!S_ISBLK(inode->i_mode)) { - retcode = ERR_DISK_NOT_BDEV; - goto fail; - } - - nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); - if (IS_ERR(nbc->md_file)) { + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! (if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = open_bdev_exclusive(nbc->dc.meta_dev, + FMODE_READ | FMODE_WRITE, + (nbc->dc.meta_dev_idx < 0) ? + (void *)mdev : (void *)drbd_m_holder); + if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, - PTR_ERR(nbc->md_file)); - nbc->md_file = NULL; + PTR_ERR(bdev)); retcode = ERR_OPEN_MD_DISK; goto fail; } + nbc->md_bdev = bdev; - inode2 = nbc->md_file->f_dentry->d_inode; - - if (!S_ISBLK(inode2->i_mode)) { - retcode = ERR_MD_NOT_BDEV; - goto fail; - } - - nbc->backing_bdev = inode->i_bdev; - if (bd_claim(nbc->backing_bdev, mdev)) { - printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", - nbc->backing_bdev, mdev, - nbc->backing_bdev->bd_holder, - nbc->backing_bdev->bd_contains->bd_holder, - nbc->backing_bdev->bd_holders); - retcode = ERR_BDCLAIM_DISK; + if ((nbc->backing_bdev == nbc->md_bdev) != + (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + retcode = ERR_MD_IDX_INVALID; goto fail; } @@ -950,28 +944,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp offsetof(struct bm_extent, lce)); if (!resync_lru) { retcode = ERR_NOMEM; - goto release_bdev_fail; - } - - /* meta_dev_idx >= 0: external fixed size, - * possibly multiple drbd sharing one meta device. - * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is - * not yet used by some other drbd minor! - * (if you use drbd.conf + drbdadm, - * that should check it for you already; but if you don't, or someone - * fooled it, we need to double check here) */ - nbc->md_bdev = inode2->i_bdev; - if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev - : (void *) drbd_m_holder)) { - retcode = ERR_BDCLAIM_MD_DISK; - goto release_bdev_fail; - } - - if ((nbc->backing_bdev == nbc->md_bdev) != - (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || - nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { - retcode = ERR_MD_IDX_INVALID; - goto release_bdev2_fail; + goto fail; } /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ @@ -982,7 +955,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); retcode = ERR_DISK_TO_SMALL; - goto release_bdev2_fail; + goto fail; } if (nbc->dc.meta_dev_idx < 0) { @@ -999,7 +972,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); - goto release_bdev2_fail; + goto fail; } /* Make sure the new disk is big enough @@ -1007,7 +980,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { retcode = ERR_DISK_TO_SMALL; - goto release_bdev2_fail; + goto fail; } nbc->known_size = drbd_get_capacity(nbc->backing_bdev); @@ -1030,7 +1003,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); drbd_resume_io(mdev); if (retcode < SS_SUCCESS) - goto release_bdev2_fail; + goto fail; if (!get_ldev_if_state(mdev, D_ATTACHING)) goto force_diskless; @@ -1264,18 +1237,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp force_diskless: drbd_force_state(mdev, NS(disk, D_DISKLESS)); drbd_md_sync(mdev); - release_bdev2_fail: - if (nbc) - bd_release(nbc->md_bdev); - release_bdev_fail: - if (nbc) - bd_release(nbc->backing_bdev); fail: if (nbc) { - if (nbc->lo_file) - fput(nbc->lo_file); - if (nbc->md_file) - fput(nbc->md_file); + if (nbc->backing_bdev) + blkdev_put(nbc->backing_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); + if (nbc->md_bdev) + blkdev_put(nbc->md_bdev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL); kfree(nbc); } lc_destroy(resync_lru); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 19b3568e9326..77d70eebb6b2 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2296,15 +2296,12 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) * so bdget() can't fail. */ bdget(pd->bdev->bd_dev); - if ((ret = blkdev_get(pd->bdev, FMODE_READ))) + if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd))) goto out; - if ((ret = bd_claim(pd->bdev, pd))) - goto out_putdev; - if ((ret = pkt_get_last_written(pd, &lba))) { printk(DRIVER_NAME": pkt_get_last_written failed\n"); - goto out_unclaim; + goto out_putdev; } set_capacity(pd->disk, lba << 2); @@ -2314,7 +2311,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) q = bdev_get_queue(pd->bdev); if (write) { if ((ret = pkt_open_write(pd))) - goto out_unclaim; + goto out_putdev; /* * Some CDRW drives can not handle writes larger than one packet, * even if the size is a multiple of the packet size. @@ -2329,23 +2326,21 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) } if ((ret = pkt_set_segment_merging(pd, q))) - goto out_unclaim; + goto out_putdev; if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { printk(DRIVER_NAME": not enough memory for buffers\n"); ret = -ENOMEM; - goto out_unclaim; + goto out_putdev; } printk(DRIVER_NAME": %lukB available on disc\n", lba << 1); } return 0; -out_unclaim: - bd_release(pd->bdev); out_putdev: - blkdev_put(pd->bdev, FMODE_READ); + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); out: return ret; } @@ -2362,8 +2357,7 @@ static void pkt_release_dev(struct pktcdvd_device *pd, int flush) pkt_lock_door(pd, 0); pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); - bd_release(pd->bdev); - blkdev_put(pd->bdev, FMODE_READ); + blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL); pkt_shrink_pktlist(pd); } @@ -2733,7 +2727,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) bdev = bdget(dev); if (!bdev) return -ENOMEM; - ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY); + ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); if (ret) return ret; diff --git a/drivers/char/raw.c b/drivers/char/raw.c index bfe25ea9766b..b4b9d5a47885 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -65,15 +65,12 @@ static int raw_open(struct inode *inode, struct file *filp) if (!bdev) goto out; igrab(bdev->bd_inode); - err = blkdev_get(bdev, filp->f_mode); + err = blkdev_get(bdev, filp->f_mode | FMODE_EXCL, raw_open); if (err) goto out; - err = bd_claim(bdev, raw_open); - if (err) - goto out1; err = set_blocksize(bdev, bdev_logical_block_size(bdev)); if (err) - goto out2; + goto out1; filp->f_flags |= O_DIRECT; filp->f_mapping = bdev->bd_inode->i_mapping; if (++raw_devices[minor].inuse == 1) @@ -83,10 +80,8 @@ static int raw_open(struct inode *inode, struct file *filp) mutex_unlock(&raw_mutex); return 0; -out2: - bd_release(bdev); out1: - blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, filp->f_mode | FMODE_EXCL); out: mutex_unlock(&raw_mutex); return err; @@ -110,8 +105,7 @@ static int raw_release(struct inode *inode, struct file *filp) } mutex_unlock(&raw_mutex); - bd_release(bdev); - blkdev_put(bdev, filp->f_mode); + blkdev_put(bdev, filp->f_mode | FMODE_EXCL); return 0; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2c876ffc63df..9e88ca0c55e9 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -325,20 +325,13 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev, BUG_ON(d->dm_dev.bdev); - bdev = open_by_devnum(dev, d->dm_dev.mode); + bdev = open_by_devnum(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); if (IS_ERR(bdev)) return PTR_ERR(bdev); - r = bd_claim(bdev, _claim_ptr); - if (r) { - blkdev_put(bdev, d->dm_dev.mode); - return r; - } - r = bd_link_disk_holder(bdev, dm_disk(md)); if (r) { - bd_release(bdev); - blkdev_put(bdev, d->dm_dev.mode); + blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL); return r; } @@ -354,9 +347,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) if (!d->dm_dev.bdev) return; - bd_unlink_disk_holder(d->dm_dev.bdev); - bd_release(d->dm_dev.bdev); - blkdev_put(d->dm_dev.bdev, d->dm_dev.mode); + blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL); d->dm_dev.bdev = NULL; } diff --git a/drivers/md/md.c b/drivers/md/md.c index c47644fca1a1..6af951ffe0bb 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1907,7 +1907,6 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) MD_BUG(); return; } - bd_unlink_disk_holder(rdev->bdev); list_del_rcu(&rdev->same_set); printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); rdev->mddev = NULL; @@ -1935,19 +1934,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? (mdk_rdev_t *)lock_rdev : rdev); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); return PTR_ERR(bdev); } - err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); - if (err) { - printk(KERN_ERR "md: could not bd_claim %s.\n", - bdevname(bdev, b)); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return err; - } if (!shared) set_bit(AllReserved, &rdev->flags); rdev->bdev = bdev; @@ -1960,8 +1953,7 @@ static void unlock_rdev(mdk_rdev_t *rdev) rdev->bdev = NULL; if (!bdev) MD_BUG(); - bd_release(bdev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } void md_autodetect_dev(dev_t dev); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index a9e2d3b38aeb..aa557beb8f51 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -224,7 +224,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) if (dev->blkdev) { invalidate_mapping_pages(dev->blkdev->bd_inode->i_mapping, 0, -1); - close_bdev_exclusive(dev->blkdev, FMODE_READ|FMODE_WRITE); + blkdev_put(dev->blkdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } kfree(dev); @@ -234,7 +234,7 @@ static void block2mtd_free_device(struct block2mtd_dev *dev) /* FIXME: ensure that mtd->size % erase_size == 0 */ static struct block2mtd_dev *add_device(char *devname, int erase_size) { - const fmode_t mode = FMODE_READ | FMODE_WRITE; + const fmode_t mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; struct block_device *bdev; struct block2mtd_dev *dev; char *name; @@ -255,17 +255,8 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) to resolve the device name by other means. */ dev_t devt = name_to_dev_t(devname); - if (devt) { - bdev = open_by_devnum(devt, mode); - if (!IS_ERR(bdev)) { - int ret; - ret = bd_claim(bdev, dev); - if (ret) { - blkdev_put(bdev, mode); - bdev = ERR_PTR(ret); - } - } - } + if (devt) + bdev = open_by_devnum(devt, mode, dev); } #endif diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 30a1ca3d08b7..5505bc07e1e7 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -103,7 +103,7 @@ int dasd_scan_partitions(struct dasd_block *block) struct block_device *bdev; bdev = bdget_disk(block->gdp, 0); - if (!bdev || blkdev_get(bdev, FMODE_READ) < 0) + if (!bdev || blkdev_get(bdev, FMODE_READ, NULL) < 0) return -ENODEV; /* * See fs/partition/check.c:register_disk,rescan_partitions diff --git a/fs/block_dev.c b/fs/block_dev.c index 9329068684d2..fc48912354d1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -660,7 +660,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, else if (bdev->bd_contains == bdev) return true; /* is a whole device which isn't held */ - else if (whole->bd_holder == bd_claim) + else if (whole->bd_holder == bd_may_claim) return true; /* is a partition of a device that is being partitioned */ else if (whole->bd_holder != NULL) return false; /* is a partition of a held device */ @@ -807,10 +807,10 @@ static void __bd_claim(struct block_device *bdev, struct block_device *whole, { /* note that for a whole device bd_holders * will be incremented twice, and bd_holder will - * be set to bd_claim before being set to holder + * be set to bd_may_claim before being set to holder */ whole->bd_holders++; - whole->bd_holder = bd_claim; + whole->bd_holder = bd_may_claim; bdev->bd_holders++; bdev->bd_holder = holder; } @@ -835,37 +835,7 @@ static void bd_finish_claiming(struct block_device *bdev, __bd_abort_claiming(whole, holder); /* not actually an abort */ } -/** - * bd_claim - claim a block device - * @bdev: block device to claim - * @holder: holder trying to claim @bdev - * - * Try to claim @bdev which must have been opened successfully. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 if successful, -EBUSY if @bdev is already claimed. - */ -int bd_claim(struct block_device *bdev, void *holder) -{ - struct block_device *whole = bdev->bd_contains; - int res; - - might_sleep(); - - spin_lock(&bdev_lock); - res = bd_prepare_to_claim(bdev, whole, holder); - if (res == 0) - __bd_claim(bdev, whole, holder); - spin_unlock(&bdev_lock); - - return res; -} -EXPORT_SYMBOL(bd_claim); - -void bd_release(struct block_device *bdev) +static void bd_release(struct block_device *bdev) { spin_lock(&bdev_lock); if (!--bdev->bd_contains->bd_holders) @@ -875,8 +845,6 @@ void bd_release(struct block_device *bdev) spin_unlock(&bdev_lock); } -EXPORT_SYMBOL(bd_release); - #ifdef CONFIG_SYSFS static int add_symlink(struct kobject *from, struct kobject *to) { @@ -943,7 +911,7 @@ out_unlock: } EXPORT_SYMBOL_GPL(bd_link_disk_holder); -void bd_unlink_disk_holder(struct block_device *bdev) +static void bd_unlink_disk_holder(struct block_device *bdev) { struct gendisk *disk = bdev->bd_holder_disk; @@ -954,7 +922,9 @@ void bd_unlink_disk_holder(struct block_device *bdev) del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); } -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); +#else +static inline void bd_unlink_disk_holder(struct block_device *bdev) +{ } #endif /* @@ -964,12 +934,12 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); * to be used for internal purposes. If you ever need it - reconsider * your API. */ -struct block_device *open_by_devnum(dev_t dev, fmode_t mode) +struct block_device *open_by_devnum(dev_t dev, fmode_t mode, void *holder) { struct block_device *bdev = bdget(dev); int err = -ENOMEM; if (bdev) - err = blkdev_get(bdev, mode); + err = blkdev_get(bdev, mode, holder); return err ? ERR_PTR(err) : bdev; } @@ -1235,17 +1205,37 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } -int blkdev_get(struct block_device *bdev, fmode_t mode) +int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { - return __blkdev_get(bdev, mode, 0); + struct block_device *whole = NULL; + int res; + + WARN_ON_ONCE((mode & FMODE_EXCL) && !holder); + + if ((mode & FMODE_EXCL) && holder) { + whole = bd_start_claiming(bdev, holder); + if (IS_ERR(whole)) { + bdput(bdev); + return PTR_ERR(whole); + } + } + + res = __blkdev_get(bdev, mode, 0); + + if (whole) { + if (res == 0) + bd_finish_claiming(bdev, whole, holder); + else + bd_abort_claiming(whole, holder); + } + + return res; } EXPORT_SYMBOL(blkdev_get); static int blkdev_open(struct inode * inode, struct file * filp) { - struct block_device *whole = NULL; struct block_device *bdev; - int res; /* * Preserve backwards compatibility and allow large file access @@ -1266,26 +1256,9 @@ static int blkdev_open(struct inode * inode, struct file * filp) if (bdev == NULL) return -ENOMEM; - if (filp->f_mode & FMODE_EXCL) { - whole = bd_start_claiming(bdev, filp); - if (IS_ERR(whole)) { - bdput(bdev); - return PTR_ERR(whole); - } - } - filp->f_mapping = bdev->bd_inode->i_mapping; - res = blkdev_get(bdev, filp->f_mode); - - if (whole) { - if (res == 0) - bd_finish_claiming(bdev, whole, filp); - else - bd_abort_claiming(whole, filp); - } - - return res; + return blkdev_get(bdev, filp->f_mode, filp); } static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) @@ -1329,6 +1302,13 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) int blkdev_put(struct block_device *bdev, fmode_t mode) { + if (mode & FMODE_EXCL) { + mutex_lock(&bdev->bd_mutex); + bd_release(bdev); + if (!bdev->bd_holders) + bd_unlink_disk_holder(bdev); + mutex_unlock(&bdev->bd_mutex); + } return __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); @@ -1336,8 +1316,7 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); - if (bdev->bd_holder == filp) - bd_release(bdev); + return blkdev_put(bdev, filp->f_mode); } @@ -1494,55 +1473,27 @@ EXPORT_SYMBOL(lookup_bdev); */ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) { - struct block_device *bdev, *whole; + struct block_device *bdev; int error; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; - whole = bd_start_claiming(bdev, holder); - if (IS_ERR(whole)) { - bdput(bdev); - return whole; - } - - error = blkdev_get(bdev, mode); + error = blkdev_get(bdev, mode | FMODE_EXCL, holder); if (error) - goto out_abort_claiming; + return ERR_PTR(error); - error = -EACCES; - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) - goto out_blkdev_put; + if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, mode); + return ERR_PTR(-EACCES); + } - bd_finish_claiming(bdev, whole, holder); return bdev; - -out_blkdev_put: - blkdev_put(bdev, mode); -out_abort_claiming: - bd_abort_claiming(whole, holder); - return ERR_PTR(error); } EXPORT_SYMBOL(open_bdev_exclusive); -/** - * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() - * - * @bdev: blockdevice to close - * @mode: mode, must match that used to open. - * - * This is the counterpart to open_bdev_exclusive(). - */ -void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) -{ - bd_release(bdev); - blkdev_put(bdev, mode); -} - -EXPORT_SYMBOL(close_bdev_exclusive); - int __invalidate_device(struct block_device *bdev) { struct super_block *sb = get_super(bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d39596224d21..f1b729d3b883 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -489,7 +489,7 @@ again: continue; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode | FMODE_EXCL); device->bdev = NULL; fs_devices->open_devices--; } @@ -523,7 +523,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode | FMODE_EXCL); fs_devices->open_devices--; } if (device->writeable) { @@ -638,7 +638,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); error_close: - close_bdev_exclusive(bdev, flags); + blkdev_put(bdev, flags | FMODE_EXCL); error: continue; } @@ -716,7 +716,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - close_bdev_exclusive(bdev, flags); + blkdev_put(bdev, flags | FMODE_EXCL); error: mutex_unlock(&uuid_mutex); return ret; @@ -1244,7 +1244,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - close_bdev_exclusive(device->bdev, device->mode); + blkdev_put(device->bdev, device->mode | FMODE_EXCL); device->bdev = NULL; device->fs_devices->open_devices--; } @@ -1287,7 +1287,7 @@ error_brelse: brelse(bh); error_close: if (bdev) - close_bdev_exclusive(bdev, FMODE_READ); + blkdev_put(bdev, FMODE_READ | FMODE_EXCL); out: mutex_unlock(&root->fs_info->volume_mutex); mutex_unlock(&uuid_mutex); @@ -1565,7 +1565,7 @@ out: mutex_unlock(&root->fs_info->volume_mutex); return ret; error: - close_bdev_exclusive(bdev, 0); + blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 2fedaf8b5012..23e7513dba9c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -347,7 +347,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -364,8 +364,7 @@ fail: */ static int ext3_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext3_blkdev_remove(struct ext3_sb_info *sbi) @@ -2136,13 +2135,6 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext3_msg(sb, KERN_ERR, - "error: failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 61182fe6254e..5dd0b3e76fa8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -647,7 +647,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; @@ -663,8 +663,7 @@ fail: */ static int ext4_blkdev_put(struct block_device *bdev) { - bd_release(bdev); - return blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + return blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int ext4_blkdev_remove(struct ext4_sb_info *sbi) @@ -3758,13 +3757,6 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (bdev == NULL) return NULL; - if (bd_claim(bdev, sb)) { - ext4_msg(sb, KERN_ERR, - "failed to claim external journal device"); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); - return NULL; - } - blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 3eb1393f7b81..c1f0763a022b 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, goto error_bdev; if (s->s_root) - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; @@ -1342,7 +1342,7 @@ error_super: deactivate_locked_super(s); return ERR_PTR(error); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); return ERR_PTR(error); } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index e1b8493b9aaa..5a290f22dcc3 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1120,16 +1120,13 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE); + bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); if (IS_ERR(bdev)) { rc = -PTR_ERR(bdev); goto free; } - if ((rc = bd_claim(bdev, log))) { - goto close; - } - log->bdev = bdev; memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); @@ -1137,7 +1134,7 @@ int lmLogOpen(struct super_block *sb) * initialize log: */ if ((rc = lmLogInit(log))) - goto unclaim; + goto close; list_add(&log->journal_list, &jfs_external_logs); @@ -1163,11 +1160,8 @@ journal_found: list_del(&log->journal_list); lbmLogShutdown(log); - unclaim: - bd_release(bdev); - close: /* close external log device */ - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1512,8 +1506,7 @@ int lmLogClose(struct super_block *sb) bdev = log->bdev; rc = lmLogShutdown(log); - bd_release(bdev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); kfree(log); diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 92ca6fbe09bd..734b9025858e 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -300,7 +300,7 @@ static int bdev_write_sb(struct super_block *sb, struct page *page) static void bdev_put_device(struct logfs_super *s) { - close_bdev_exclusive(s->s_bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } static int bdev_can_write_buf(struct super_block *sb, u64 ofs) @@ -331,7 +331,7 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { int mtdnr = MINOR(bdev->bd_dev); - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); return logfs_get_sb_mtd(p, mtdnr); } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f804d41ec9d3..756a6798d7c8 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1233,7 +1233,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode | FMODE_EXCL); return root_dentry; @@ -1242,7 +1242,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - close_bdev_exclusive(sd.bdev, mode); + blkdev_put(sd.bdev, mode | FMODE_EXCL); return ERR_PTR(err); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 52c7557f3e25..d0a2721eaceb 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1674,7 +1674,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, goto out; reg->hr_bdev = I_BDEV(filp->f_mapping->host); - ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ); + ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL); if (ret) { reg->hr_bdev = NULL; goto out; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 0a8b0ad0c7e2..2e6501d034ab 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -549,7 +549,7 @@ void register_disk(struct gendisk *disk) goto exit; bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ); + err = blkdev_get(bdev, FMODE_READ, NULL); if (err < 0) goto exit; blkdev_put(bdev, FMODE_READ); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 076c8b194682..b488136f5ace 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2552,8 +2552,6 @@ static int release_journal_dev(struct super_block *super, result = 0; if (journal->j_dev_bd != NULL) { - if (journal->j_dev_bd->bd_dev != super->s_dev) - bd_release(journal->j_dev_bd); result = blkdev_put(journal->j_dev_bd, journal->j_dev_mode); journal->j_dev_bd = NULL; } @@ -2571,7 +2569,7 @@ static int journal_init_dev(struct super_block *super, { int result; dev_t jdev; - fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE; + fmode_t blkdev_mode = FMODE_READ | FMODE_WRITE | FMODE_EXCL; char b[BDEVNAME_SIZE]; result = 0; @@ -2585,7 +2583,9 @@ static int journal_init_dev(struct super_block *super, /* there is no "jdev" option and journal is on separate device */ if ((!jdev_name || !jdev_name[0])) { - journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode); + if (jdev == super->s_dev) + blkdev_mode &= ~FMODE_EXCL; + journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode, journal); journal->j_dev_mode = blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); @@ -2594,15 +2594,8 @@ static int journal_init_dev(struct super_block *super, "cannot init journal device '%s': %i", __bdevname(jdev, b), result); return result; - } else if (jdev != super->s_dev) { - result = bd_claim(journal->j_dev_bd, journal); - if (result) { - blkdev_put(journal->j_dev_bd, blkdev_mode); - return result; - } - + } else if (jdev != super->s_dev) set_blocksize(journal->j_dev_bd, super->s_blocksize); - } return 0; } diff --git a/fs/super.c b/fs/super.c index ca696155cd9a..22374bf0ba87 100644 --- a/fs/super.c +++ b/fs/super.c @@ -801,13 +801,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, /* * s_umount nests inside bd_mutex during - * __invalidate_device(). close_bdev_exclusive() - * acquires bd_mutex and can't be called under - * s_umount. Drop s_umount temporarily. This is safe - * as we're holding an active reference. + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. */ up_write(&s->s_umount); - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; @@ -831,7 +831,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); error: return ERR_PTR(error); } @@ -862,7 +862,7 @@ void kill_block_super(struct super_block *sb) bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode | FMODE_EXCL); } EXPORT_SYMBOL(kill_block_super); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9f3a78fe6ae4..a1a6e5ceea67 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -623,7 +623,7 @@ xfs_blkdev_put( struct block_device *bdev) { if (bdev) - close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE); + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); } /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 66b7f2c5d7e9..1a033e8ebe4c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2006,7 +2006,8 @@ extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); -extern struct block_device *open_by_devnum(dev_t, fmode_t); +extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode, + void *holder); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); @@ -2037,22 +2038,16 @@ extern const struct file_operations def_fifo_fops; extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); -extern int blkdev_get(struct block_device *, fmode_t); -extern int blkdev_put(struct block_device *, fmode_t); -extern int bd_claim(struct block_device *, void *); -extern void bd_release(struct block_device *); +extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); +extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); -extern void bd_unlink_disk_holder(struct block_device *bdev); #else static inline int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { return 0; } -static inline void bd_unlink_disk_holder(struct block_device *bdev) -{ -} #endif #endif @@ -2089,7 +2084,6 @@ extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); -extern void close_bdev_exclusive(struct block_device *, fmode_t); extern void blkdev_show(struct seq_file *,off_t); #else diff --git a/kernel/power/swap.c b/kernel/power/swap.c index a0e4a86ccf94..513a77f1a0b3 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -223,7 +223,7 @@ static int swsusp_swap_check(void) return res; root_swap = res; - res = blkdev_get(hib_resume_bdev, FMODE_WRITE); + res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL); if (res) return res; @@ -907,7 +907,8 @@ int swsusp_check(void) { int error; - hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); + hib_resume_bdev = open_by_devnum(swsusp_resume_device, + FMODE_READ, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); diff --git a/mm/swapfile.c b/mm/swapfile.c index 67ddaaf98c74..b6adcfbf6f48 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1677,7 +1677,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { mutex_lock(&inode->i_mutex); inode->i_flags &= ~S_SWAPFILE; @@ -1939,7 +1939,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { bdev = I_BDEV(inode); - error = bd_claim(bdev, sys_swapon); + error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, + sys_swapon); if (error < 0) { bdev = NULL; error = -EINVAL; @@ -2136,7 +2137,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) bad_swap: if (bdev) { set_blocksize(bdev, p->old_block_size); - bd_release(bdev); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } destroy_swap_extents(p); swap_cgroup_swapoff(type); -- cgit v1.3-8-gc7d7 From d4d77629953eabd3c14f6fa5746f6b28babfc55f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 13 Nov 2010 11:55:18 +0100 Subject: block: clean up blkdev_get() wrappers and their users After recent blkdev_get() modifications, open_by_devnum() and open_bdev_exclusive() are simple wrappers around blkdev_get(). Replace them with blkdev_get_by_dev() and blkdev_get_by_path(). blkdev_get_by_dev() is identical to open_by_devnum(). blkdev_get_by_path() is slightly different in that it doesn't automatically add %FMODE_EXCL to @mode. All users are converted. Most conversions are mechanical and don't introduce any behavior difference. There are several exceptions. * btrfs now sets FMODE_EXCL in btrfs_device->mode, so there's no reason to OR it explicitly on blkdev_put(). * gfs2, nilfs2 and the generic mount_bdev() now set FMODE_EXCL in sb->s_mode. * With the above changes, sb->s_mode now always should contain FMODE_EXCL. WARN_ON_ONCE() added to kill_block_super() to detect errors. The new blkdev_get_*() functions are with proper docbook comments. While at it, add function description to blkdev_get() too. Signed-off-by: Tejun Heo Cc: Philipp Reisner Cc: Neil Brown Cc: Mike Snitzer Cc: Joern Engel Cc: Chris Mason Cc: Jan Kara Cc: "Theodore Ts'o" Cc: KONISHI Ryusuke Cc: reiserfs-devel@vger.kernel.org Cc: xfs-masters@oss.sgi.com Cc: Alexander Viro --- drivers/block/drbd/drbd_nl.c | 12 ++-- drivers/md/dm-table.c | 2 +- drivers/md/md.c | 4 +- drivers/mtd/devices/block2mtd.c | 4 +- fs/block_dev.c | 139 +++++++++++++++++++++++++++------------- fs/btrfs/volumes.c | 24 ++++--- fs/btrfs/volumes.h | 2 +- fs/ext3/super.c | 2 +- fs/ext4/super.c | 2 +- fs/gfs2/ops_fstype.c | 8 +-- fs/jfs/jfs_logmgr.c | 4 +- fs/logfs/dev_bdev.c | 3 +- fs/nilfs2/super.c | 8 +-- fs/reiserfs/journal.c | 6 +- fs/super.c | 9 +-- fs/xfs/linux-2.6/xfs_super.c | 3 +- include/linux/fs.h | 7 +- kernel/power/swap.c | 4 +- 18 files changed, 149 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index fd0346090289..650e43ba4f7c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -902,8 +902,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } } - bdev = open_bdev_exclusive(nbc->dc.backing_dev, - FMODE_READ | FMODE_WRITE, mdev); + bdev = blkdev_get_by_path(nbc->dc.backing_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev); if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, PTR_ERR(bdev)); @@ -920,10 +920,10 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * should check it for you already; but if you don't, or * someone fooled it, we need to double check here) */ - bdev = open_bdev_exclusive(nbc->dc.meta_dev, - FMODE_READ | FMODE_WRITE, - (nbc->dc.meta_dev_idx < 0) ? - (void *)mdev : (void *)drbd_m_holder); + bdev = blkdev_get_by_path(nbc->dc.meta_dev, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, + (nbc->dc.meta_dev_idx < 0) ? + (void *)mdev : (void *)drbd_m_holder); if (IS_ERR(bdev)) { dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, PTR_ERR(bdev)); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 9e88ca0c55e9..67150c32986c 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -325,7 +325,7 @@ static int open_dev(struct dm_dev_internal *d, dev_t dev, BUG_ON(d->dm_dev.bdev); - bdev = open_by_devnum(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); + bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 6af951ffe0bb..5aaa6bfbe638 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1934,8 +1934,8 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - shared ? (mdk_rdev_t *)lock_rdev : rdev); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + shared ? (mdk_rdev_t *)lock_rdev : rdev); if (IS_ERR(bdev)) { printk(KERN_ERR "md: could not open %s.\n", __bdevname(dev, b)); diff --git a/drivers/mtd/devices/block2mtd.c b/drivers/mtd/devices/block2mtd.c index aa557beb8f51..f29a6f9df6e7 100644 --- a/drivers/mtd/devices/block2mtd.c +++ b/drivers/mtd/devices/block2mtd.c @@ -247,7 +247,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) return NULL; /* Get a handle on the device */ - bdev = open_bdev_exclusive(devname, mode, dev); + bdev = blkdev_get_by_path(devname, mode, dev); #ifndef MODULE if (IS_ERR(bdev)) { @@ -256,7 +256,7 @@ static struct block2mtd_dev *add_device(char *devname, int erase_size) dev_t devt = name_to_dev_t(devname); if (devt) - bdev = open_by_devnum(devt, mode, dev); + bdev = blkdev_get_by_dev(devt, mode, dev); } #endif diff --git a/fs/block_dev.c b/fs/block_dev.c index 606a5259f87f..c1c1b8c3fb99 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -854,24 +854,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev) { } #endif -/* - * Tries to open block device by device number. Use it ONLY if you - * really do not have anything better - i.e. when you are behind a - * truly sucky interface and all you are given is a device number. _Never_ - * to be used for internal purposes. If you ever need it - reconsider - * your API. - */ -struct block_device *open_by_devnum(dev_t dev, fmode_t mode, void *holder) -{ - struct block_device *bdev = bdget(dev); - int err = -ENOMEM; - if (bdev) - err = blkdev_get(bdev, mode, holder); - return err ? ERR_PTR(err) : bdev; -} - -EXPORT_SYMBOL(open_by_devnum); - /** * flush_disk - invalidates all buffer-cache entries on a disk * @@ -1132,6 +1114,25 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) return ret; } +/** + * blkdev_get - open a block device + * @bdev: block_device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is + * open with exclusive access. Specifying %FMODE_EXCL with %NULL + * @holder is invalid. Exclusive opens may nest for the same @holder. + * + * On success, the reference count of @bdev is unchanged. On failure, + * @bdev is put. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { struct block_device *whole = NULL; @@ -1186,6 +1187,80 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) } EXPORT_SYMBOL(blkdev_get); +/** + * blkdev_get_by_path - open a block device by name + * @path: path to the block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by the device file at @path. @mode + * and @holder are identical to blkdev_get(). + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder) +{ + struct block_device *bdev; + int err; + + bdev = lookup_bdev(path); + if (IS_ERR(bdev)) + return bdev; + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_path); + +/** + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: FMODE_* mask + * @holder: exclusive holder identifier + * + * Open the blockdevice described by device number @dev. @mode and + * @holder are identical to blkdev_get(). + * + * Use it ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a + * device number. _Never_ to be used for internal purposes. If you + * ever need it - reconsider your API. + * + * On success, the returned block_device has reference count of one. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Pointer to block_device on success, ERR_PTR(-errno) on failure. + */ +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +{ + struct block_device *bdev; + int err; + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); + + err = blkdev_get(bdev, mode, holder); + if (err) + return ERR_PTR(err); + + return bdev; +} +EXPORT_SYMBOL(blkdev_get_by_dev); + static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; @@ -1436,34 +1511,6 @@ fail: } EXPORT_SYMBOL(lookup_bdev); -/** - * open_bdev_exclusive - open a block device by name and set it up for use - * - * @path: special file representing the block device - * @mode: FMODE_... combination to pass be used - * @holder: owner for exclusion - * - * Open the blockdevice described by the special file at @path, claim it - * for the @holder. - */ -struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) -{ - struct block_device *bdev; - int error; - - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; - - error = blkdev_get(bdev, mode | FMODE_EXCL, holder); - if (error) - return ERR_PTR(error); - - return bdev; -} - -EXPORT_SYMBOL(open_bdev_exclusive); - int __invalidate_device(struct block_device *bdev) { struct super_block *sb = get_super(bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f1b729d3b883..95324e9f9280 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -489,7 +489,7 @@ again: continue; if (device->bdev) { - blkdev_put(device->bdev, device->mode | FMODE_EXCL); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; fs_devices->open_devices--; } @@ -523,7 +523,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev) { - blkdev_put(device->bdev, device->mode | FMODE_EXCL); + blkdev_put(device->bdev, device->mode); fs_devices->open_devices--; } if (device->writeable) { @@ -580,13 +580,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int seeding = 1; int ret = 0; + flags |= FMODE_EXCL; + list_for_each_entry(device, head, dev_list) { if (device->bdev) continue; if (!device->name) continue; - bdev = open_bdev_exclusive(device->name, flags, holder); + bdev = blkdev_get_by_path(device->name, flags, holder); if (IS_ERR(bdev)) { printk(KERN_INFO "open %s failed\n", device->name); goto error; @@ -638,7 +640,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, error_brelse: brelse(bh); error_close: - blkdev_put(bdev, flags | FMODE_EXCL); + blkdev_put(bdev, flags); error: continue; } @@ -684,7 +686,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, mutex_lock(&uuid_mutex); - bdev = open_bdev_exclusive(path, flags, holder); + flags |= FMODE_EXCL; + bdev = blkdev_get_by_path(path, flags, holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); @@ -716,7 +719,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, brelse(bh); error_close: - blkdev_put(bdev, flags | FMODE_EXCL); + blkdev_put(bdev, flags); error: mutex_unlock(&uuid_mutex); return ret; @@ -1179,8 +1182,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) goto out; } } else { - bdev = open_bdev_exclusive(device_path, FMODE_READ, - root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) { ret = PTR_ERR(bdev); goto out; @@ -1244,7 +1247,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->latest_bdev = next_device->bdev; if (device->bdev) { - blkdev_put(device->bdev, device->mode | FMODE_EXCL); + blkdev_put(device->bdev, device->mode); device->bdev = NULL; device->fs_devices->open_devices--; } @@ -1439,7 +1442,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) return -EINVAL; - bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + bdev = blkdev_get_by_path(device_path, FMODE_EXCL, + root->fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2b638b6e4eea..856e75770304 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -49,7 +49,7 @@ struct btrfs_device { struct block_device *bdev; - /* the mode sent to open_bdev_exclusive */ + /* the mode sent to blkdev_get */ fmode_t mode; char *name; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 23e7513dba9c..123720ba786d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -347,7 +347,7 @@ static struct block_device *ext3_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5dd0b3e76fa8..bd63e6927219 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -647,7 +647,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; char b[BDEVNAME_SIZE]; - bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); + bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, sb); if (IS_ERR(bdev)) goto fail; return bdev; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c1f0763a022b..bc56ccf98ffd 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1268,7 +1268,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error; struct gfs2_args args; struct gfs2_sbd *sdp; @@ -1276,7 +1276,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -1298,7 +1298,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, goto error_bdev; if (s->s_root) - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode); memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; @@ -1342,7 +1342,7 @@ error_super: deactivate_locked_super(s); return ERR_PTR(error); error_bdev: - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode); return ERR_PTR(error); } diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 5a290f22dcc3..278e3fb40b71 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1120,8 +1120,8 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = open_by_devnum(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - log); + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); if (IS_ERR(bdev)) { rc = -PTR_ERR(bdev); goto free; diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 734b9025858e..723bc5bca09a 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -325,7 +325,8 @@ int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, { struct block_device *bdev; - bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type); + bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + type); if (IS_ERR(bdev)) return PTR_ERR(bdev); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 756a6798d7c8..0030640e2d72 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1147,14 +1147,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags, { struct nilfs_super_data sd; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; struct dentry *root_dentry; int err, s_new = false; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - sd.bdev = open_bdev_exclusive(dev_name, mode, fs_type); + sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(sd.bdev)) return ERR_CAST(sd.bdev); @@ -1233,7 +1233,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s_new) - blkdev_put(sd.bdev, mode | FMODE_EXCL); + blkdev_put(sd.bdev, mode); return root_dentry; @@ -1242,7 +1242,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, failed: if (!s_new) - blkdev_put(sd.bdev, mode | FMODE_EXCL); + blkdev_put(sd.bdev, mode); return ERR_PTR(err); } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index b488136f5ace..e2fce519c0f2 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2585,7 +2585,8 @@ static int journal_init_dev(struct super_block *super, if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) blkdev_mode &= ~FMODE_EXCL; - journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, + journal); journal->j_dev_mode = blkdev_mode; if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); @@ -2601,8 +2602,7 @@ static int journal_init_dev(struct super_block *super, } journal->j_dev_mode = blkdev_mode; - journal->j_dev_bd = open_bdev_exclusive(jdev_name, - blkdev_mode, journal); + journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, journal); if (IS_ERR(journal->j_dev_bd)) { result = PTR_ERR(journal->j_dev_bd); journal->j_dev_bd = NULL; diff --git a/fs/super.c b/fs/super.c index 22374bf0ba87..5d9a4497849a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -766,13 +766,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -807,7 +807,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, * holding an active reference. */ up_write(&s->s_umount); - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode); down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; @@ -831,7 +831,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, mode); error: return ERR_PTR(error); } @@ -862,6 +862,7 @@ void kill_block_super(struct super_block *sb) bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); blkdev_put(bdev, mode | FMODE_EXCL); } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a1a6e5ceea67..9209cd199c47 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -609,7 +609,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = open_bdev_exclusive(name, FMODE_READ|FMODE_WRITE, mp); + *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); printk("XFS: Invalid device [%s], error=%d\n", name, error); diff --git a/include/linux/fs.h b/include/linux/fs.h index 1a033e8ebe4c..f48501563917 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2006,8 +2006,6 @@ extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); -extern struct block_device *open_by_devnum(dev_t dev, fmode_t mode, - void *holder); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); @@ -2039,6 +2037,10 @@ extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); extern int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); extern long compat_blkdev_ioctl(struct file *, unsigned, unsigned long); extern int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder); +extern struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, + void *holder); +extern struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, + void *holder); extern int blkdev_put(struct block_device *bdev, fmode_t mode); #ifdef CONFIG_SYSFS extern int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk); @@ -2083,7 +2085,6 @@ static inline void unregister_chrdev(unsigned int major, const char *name) extern const char *__bdevname(dev_t, char *buffer); extern const char *bdevname(struct block_device *bdev, char *buffer); extern struct block_device *lookup_bdev(const char *); -extern struct block_device *open_bdev_exclusive(const char *, fmode_t, void *); extern void blkdev_show(struct seq_file *,off_t); #else diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 513a77f1a0b3..b019609d1b45 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -907,8 +907,8 @@ int swsusp_check(void) { int error; - hib_resume_bdev = open_by_devnum(swsusp_resume_device, - FMODE_READ, NULL); + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + FMODE_READ, NULL); if (!IS_ERR(hib_resume_bdev)) { set_blocksize(hib_resume_bdev, PAGE_SIZE); clear_page(swsusp_header); -- cgit v1.3-8-gc7d7 From d07335e51df0c6dec202d315fc4f1f7e100eec4e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 16 Nov 2010 12:52:38 +0100 Subject: block: Rename "block_remap" tracepoint to "block_bio_remap" to clarify the event. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Signed-off-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 10 +++++----- drivers/md/dm.c | 4 ++-- include/trace/events/block.h | 6 +++--- kernel/trace/blktrace.c | 12 ++++++------ 4 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 4ce953f1b390..151070541e21 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -33,7 +33,7 @@ #include "blk.h" -EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); @@ -1329,9 +1329,9 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_sector += p->start_sect; bio->bi_bdev = bdev->bd_contains; - trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, - bio->bi_sector - p->start_sect); + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, + bdev->bd_dev, + bio->bi_sector - p->start_sect); } } @@ -1500,7 +1500,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, old_sector); + trace_block_bio_remap(q, bio, old_dev, old_sector); old_sector = bio->bi_sector; old_dev = bio->bi_bdev->bd_dev; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7cb1352f7e7a..0a2b5516bc21 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -990,8 +990,8 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, if (r == DM_MAPIO_REMAPPED) { /* the bio has been remapped so dispatch it */ - trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, sector); + trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, + tio->io->bio->bi_bdev->bd_dev, sector); generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d8ce278515c3..b56c65dc105d 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -486,16 +486,16 @@ TRACE_EVENT(block_split, ); /** - * block_remap - map request for a partition to the raw device + * block_bio_remap - map request for a logical device to the raw device * @q: queue holding the operation * @bio: revised operation * @dev: device for the operation * @from: original sector for the operation * - * An operation for a partition on a block device has been mapped to the + * An operation for a logical device has been mapped to the * raw block device. */ -TRACE_EVENT(block_remap, +TRACE_EVENT(block_bio_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, sector_t from), diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7b8ec0281548..2b8e2ee7c0ef 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -887,7 +887,7 @@ static void blk_add_trace_split(void *ignore, } /** - * blk_add_trace_remap - Add a trace for a remap operation + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio @@ -899,9 +899,9 @@ static void blk_add_trace_split(void *ignore, * it spans a stripe (or similar). Add a trace for that action. * **/ -static void blk_add_trace_remap(void *ignore, - struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_bio_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -1016,7 +1016,7 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap, NULL); + ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); WARN_ON(ret); ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); @@ -1025,7 +1025,7 @@ static void blk_register_tracepoints(void) static void blk_unregister_tracepoints(void) { unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); - unregister_trace_block_remap(blk_add_trace_remap, NULL); + unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); -- cgit v1.3-8-gc7d7 From 073ef1f6e508688392580e4f35dcad9aabd1e100 Mon Sep 17 00:00:00 2001 From: Lionel Debroux Date: Tue, 9 Nov 2010 21:48:49 +0100 Subject: hibernation: constify platform_hibernation_ops Patch against mainline. Changes since v1: added one hunk; no longer adding "const" qualifier to pointers in platform_hibernation_ops after seeing b4144e4f6e3b448d322095ca08af393682a69e33. Signed-off-by: Jiri Kosina --- drivers/acpi/sleep.c | 4 ++-- include/linux/suspend.h | 4 ++-- kernel/power/hibernate.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 721d93b3ceee..5149c9bf7015 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -498,7 +498,7 @@ static void acpi_pm_thaw(void) acpi_enable_all_runtime_gpes(); } -static struct platform_hibernation_ops acpi_hibernation_ops = { +static const struct platform_hibernation_ops acpi_hibernation_ops = { .begin = acpi_hibernation_begin, .end = acpi_pm_end, .pre_snapshot = acpi_pm_prepare, @@ -541,7 +541,7 @@ static int acpi_hibernation_begin_old(void) * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has * been requested. */ -static struct platform_hibernation_ops acpi_hibernation_ops_old = { +static const struct platform_hibernation_ops acpi_hibernation_ops_old = { .begin = acpi_hibernation_begin_old, .end = acpi_pm_end, .pre_snapshot = acpi_pm_pre_suspend, diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 26697514c5ec..40ead943fd6a 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -245,7 +245,7 @@ extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); extern unsigned long get_safe_page(gfp_t gfp_mask); -extern void hibernation_set_ops(struct platform_hibernation_ops *ops); +extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); extern bool system_entering_hibernation(void); #else /* CONFIG_HIBERNATION */ @@ -253,7 +253,7 @@ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} static inline void swsusp_unset_page_free(struct page *p) {} -static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {} +static inline void hibernation_set_ops(const struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 657272e91d0a..491b81a27111 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -51,14 +51,14 @@ enum { static int hibernation_mode = HIBERNATION_SHUTDOWN; -static struct platform_hibernation_ops *hibernation_ops; +static const struct platform_hibernation_ops *hibernation_ops; /** * hibernation_set_ops - set the global hibernate operations * @ops: the hibernation operations to use in subsequent hibernation transitions */ -void hibernation_set_ops(struct platform_hibernation_ops *ops) +void hibernation_set_ops(const struct platform_hibernation_ops *ops) { if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore -- cgit v1.3-8-gc7d7 From 2f55ac072f5344519348c0c94b3d2f4cca46847b Mon Sep 17 00:00:00 2001 From: Lionel Debroux Date: Tue, 16 Nov 2010 14:14:02 +0100 Subject: suspend: constify platform_suspend_ops While at it, fix two checkpatch errors. Several non-const struct instances constified by this patch were added after the introduction of platform_suspend_ops in checkpatch.pl's list of "should be const" structs (79404849e90a41ea2109bd0e2f7c7164b0c4ce73). Patch against mainline. Inspired by hunks of the grsecurity patch, updated for newer kernels. Signed-off-by: Lionel Debroux Acked-by: Ingo Molnar Signed-off-by: Jiri Kosina --- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-davinci/pm.c | 2 +- arch/arm/mach-imx/pm-imx27.c | 2 +- arch/arm/mach-lpc32xx/pm.c | 2 +- arch/arm/mach-omap1/pm.c | 2 +- arch/arm/mach-omap2/pm24xx.c | 2 +- arch/arm/mach-omap2/pm34xx.c | 2 +- arch/arm/mach-omap2/pm44xx.c | 2 +- arch/arm/mach-pnx4008/pm.c | 2 +- arch/arm/mach-pxa/pm.c | 2 +- arch/arm/mach-pxa/sharpsl_pm.c | 2 +- arch/arm/mach-sa1100/pm.c | 2 +- arch/arm/plat-samsung/pm.c | 2 +- arch/avr32/mach-at32ap/pm.c | 2 +- arch/blackfin/mach-common/pm.c | 2 +- arch/mips/alchemy/devboards/pm.c | 2 +- arch/mips/jz4740/pm.c | 2 +- arch/mips/loongson/common/pm.c | 2 +- arch/powerpc/platforms/52xx/lite5200_pm.c | 2 +- arch/powerpc/platforms/52xx/mpc52xx_pm.c | 2 +- arch/powerpc/platforms/83xx/suspend.c | 2 +- arch/powerpc/platforms/pseries/suspend.c | 2 +- arch/powerpc/sysdev/fsl_pmc.c | 2 +- arch/sh/boards/mach-hp6xx/pm.c | 2 +- arch/sh/kernel/cpu/shmobile/pm.c | 2 +- drivers/acpi/sleep.c | 4 ++-- drivers/macintosh/via-pmu.c | 2 +- include/linux/suspend.h | 4 ++-- kernel/power/suspend.c | 4 ++-- 29 files changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index dafbacc25eb1..ea53f4d9b283 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -301,7 +301,7 @@ static void at91_pm_end(void) } -static struct platform_suspend_ops at91_pm_ops ={ +static const struct platform_suspend_ops at91_pm_ops = { .valid = at91_pm_valid_state, .begin = at91_pm_begin, .enter = at91_pm_enter, diff --git a/arch/arm/mach-davinci/pm.c b/arch/arm/mach-davinci/pm.c index fab953b43dea..1bd73a04be20 100644 --- a/arch/arm/mach-davinci/pm.c +++ b/arch/arm/mach-davinci/pm.c @@ -110,7 +110,7 @@ static int davinci_pm_enter(suspend_state_t state) return ret; } -static struct platform_suspend_ops davinci_pm_ops = { +static const struct platform_suspend_ops davinci_pm_ops = { .enter = davinci_pm_enter, .valid = suspend_valid_only_mem, }; diff --git a/arch/arm/mach-imx/pm-imx27.c b/arch/arm/mach-imx/pm-imx27.c index afc17ce0bb54..2153ca71bb82 100644 --- a/arch/arm/mach-imx/pm-imx27.c +++ b/arch/arm/mach-imx/pm-imx27.c @@ -32,7 +32,7 @@ static int mx27_suspend_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops mx27_suspend_ops = { +static const struct platform_suspend_ops mx27_suspend_ops = { .enter = mx27_suspend_enter, .valid = suspend_valid_only_mem, }; diff --git a/arch/arm/mach-lpc32xx/pm.c b/arch/arm/mach-lpc32xx/pm.c index a6e2aed9a49f..e76d41bb7056 100644 --- a/arch/arm/mach-lpc32xx/pm.c +++ b/arch/arm/mach-lpc32xx/pm.c @@ -123,7 +123,7 @@ static int lpc32xx_pm_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops lpc32xx_pm_ops = { +static const struct platform_suspend_ops lpc32xx_pm_ops = { .valid = suspend_valid_only_mem, .enter = lpc32xx_pm_enter, }; diff --git a/arch/arm/mach-omap1/pm.c b/arch/arm/mach-omap1/pm.c index b1d3f9fade23..4cf3b67dd998 100644 --- a/arch/arm/mach-omap1/pm.c +++ b/arch/arm/mach-omap1/pm.c @@ -647,7 +647,7 @@ static struct irqaction omap_wakeup_irq = { -static struct platform_suspend_ops omap_pm_ops ={ +static const struct platform_suspend_ops omap_pm_ops = { .prepare = omap_pm_prepare, .enter = omap_pm_enter, .finish = omap_pm_finish, diff --git a/arch/arm/mach-omap2/pm24xx.c b/arch/arm/mach-omap2/pm24xx.c index a40457d81927..aa9764eeb941 100644 --- a/arch/arm/mach-omap2/pm24xx.c +++ b/arch/arm/mach-omap2/pm24xx.c @@ -326,7 +326,7 @@ static void omap2_pm_finish(void) enable_hlt(); } -static struct platform_suspend_ops omap_pm_ops = { +static const struct platform_suspend_ops omap_pm_ops = { .prepare = omap2_pm_prepare, .enter = omap2_pm_enter, .finish = omap2_pm_finish, diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c index 75c0cd13ad8e..4000c3c8bbd5 100644 --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -594,7 +594,7 @@ static void omap3_pm_end(void) return; } -static struct platform_suspend_ops omap_pm_ops = { +static const struct platform_suspend_ops omap_pm_ops = { .begin = omap3_pm_begin, .end = omap3_pm_end, .prepare = omap3_pm_prepare, diff --git a/arch/arm/mach-omap2/pm44xx.c b/arch/arm/mach-omap2/pm44xx.c index 54544b4fc76b..dc8b1ef9f84c 100644 --- a/arch/arm/mach-omap2/pm44xx.c +++ b/arch/arm/mach-omap2/pm44xx.c @@ -75,7 +75,7 @@ static void omap4_pm_end(void) return; } -static struct platform_suspend_ops omap_pm_ops = { +static const struct platform_suspend_ops omap_pm_ops = { .begin = omap4_pm_begin, .end = omap4_pm_end, .prepare = omap4_pm_prepare, diff --git a/arch/arm/mach-pnx4008/pm.c b/arch/arm/mach-pnx4008/pm.c index ee3c29c57ae3..f3e60a049f98 100644 --- a/arch/arm/mach-pnx4008/pm.c +++ b/arch/arm/mach-pnx4008/pm.c @@ -119,7 +119,7 @@ static int pnx4008_pm_valid(suspend_state_t state) (state == PM_SUSPEND_MEM); } -static struct platform_suspend_ops pnx4008_pm_ops = { +static const struct platform_suspend_ops pnx4008_pm_ops = { .enter = pnx4008_pm_enter, .valid = pnx4008_pm_valid, }; diff --git a/arch/arm/mach-pxa/pm.c b/arch/arm/mach-pxa/pm.c index 166c15f62916..978e1b289544 100644 --- a/arch/arm/mach-pxa/pm.c +++ b/arch/arm/mach-pxa/pm.c @@ -96,7 +96,7 @@ void pxa_pm_finish(void) pxa_cpu_pm_fns->finish(); } -static struct platform_suspend_ops pxa_pm_ops = { +static const struct platform_suspend_ops pxa_pm_ops = { .valid = pxa_pm_valid, .enter = pxa_pm_enter, .prepare = pxa_pm_prepare, diff --git a/arch/arm/mach-pxa/sharpsl_pm.c b/arch/arm/mach-pxa/sharpsl_pm.c index 8fed027b12dc..02874e96ec72 100644 --- a/arch/arm/mach-pxa/sharpsl_pm.c +++ b/arch/arm/mach-pxa/sharpsl_pm.c @@ -868,7 +868,7 @@ static void sharpsl_apm_get_power_status(struct apm_power_info *info) } #ifdef CONFIG_PM -static struct platform_suspend_ops sharpsl_pm_ops = { +static const struct platform_suspend_ops sharpsl_pm_ops = { .prepare = pxa_pm_prepare, .finish = pxa_pm_finish, .enter = corgi_pxa_pm_enter, diff --git a/arch/arm/mach-sa1100/pm.c b/arch/arm/mach-sa1100/pm.c index c83fdc80edfd..ab9fc4470d36 100644 --- a/arch/arm/mach-sa1100/pm.c +++ b/arch/arm/mach-sa1100/pm.c @@ -120,7 +120,7 @@ unsigned long sleep_phys_sp(void *sp) return virt_to_phys(sp); } -static struct platform_suspend_ops sa11x0_pm_ops = { +static const struct platform_suspend_ops sa11x0_pm_ops = { .enter = sa11x0_pm_enter, .valid = suspend_valid_only_mem, }; diff --git a/arch/arm/plat-samsung/pm.c b/arch/arm/plat-samsung/pm.c index 27cfca597699..5bf3f2f09e74 100644 --- a/arch/arm/plat-samsung/pm.c +++ b/arch/arm/plat-samsung/pm.c @@ -355,7 +355,7 @@ static void s3c_pm_finish(void) s3c_pm_check_cleanup(); } -static struct platform_suspend_ops s3c_pm_ops = { +static const struct platform_suspend_ops s3c_pm_ops = { .enter = s3c_pm_enter, .prepare = s3c_pm_prepare, .finish = s3c_pm_finish, diff --git a/arch/avr32/mach-at32ap/pm.c b/arch/avr32/mach-at32ap/pm.c index f021edfeaab0..32d680eb6f48 100644 --- a/arch/avr32/mach-at32ap/pm.c +++ b/arch/avr32/mach-at32ap/pm.c @@ -176,7 +176,7 @@ out: return 0; } -static struct platform_suspend_ops avr32_pm_ops = { +static const struct platform_suspend_ops avr32_pm_ops = { .valid = avr32_pm_valid_state, .enter = avr32_pm_enter, }; diff --git a/arch/blackfin/mach-common/pm.c b/arch/blackfin/mach-common/pm.c index 80884b136a0c..745af2d96553 100644 --- a/arch/blackfin/mach-common/pm.c +++ b/arch/blackfin/mach-common/pm.c @@ -233,7 +233,7 @@ static int bfin_pm_enter(suspend_state_t state) return 0; } -struct platform_suspend_ops bfin_pm_ops = { +static const struct platform_suspend_ops bfin_pm_ops = { .enter = bfin_pm_enter, .valid = bfin_pm_valid, }; diff --git a/arch/mips/alchemy/devboards/pm.c b/arch/mips/alchemy/devboards/pm.c index 4bbd3133e451..acaf91b5e461 100644 --- a/arch/mips/alchemy/devboards/pm.c +++ b/arch/mips/alchemy/devboards/pm.c @@ -110,7 +110,7 @@ static void db1x_pm_end(void) } -static struct platform_suspend_ops db1x_pm_ops = { +static const struct platform_suspend_ops db1x_pm_ops = { .valid = suspend_valid_only_mem, .begin = db1x_pm_begin, .enter = db1x_pm_enter, diff --git a/arch/mips/jz4740/pm.c b/arch/mips/jz4740/pm.c index a9994585424d..902d5b50124c 100644 --- a/arch/mips/jz4740/pm.c +++ b/arch/mips/jz4740/pm.c @@ -42,7 +42,7 @@ static int jz4740_pm_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops jz4740_pm_ops = { +static const struct platform_suspend_ops jz4740_pm_ops = { .valid = suspend_valid_only_mem, .enter = jz4740_pm_enter, }; diff --git a/arch/mips/loongson/common/pm.c b/arch/mips/loongson/common/pm.c index 6c1fd9001712..f55e07aee071 100644 --- a/arch/mips/loongson/common/pm.c +++ b/arch/mips/loongson/common/pm.c @@ -147,7 +147,7 @@ static int loongson_pm_valid_state(suspend_state_t state) } } -static struct platform_suspend_ops loongson_pm_ops = { +static const struct platform_suspend_ops loongson_pm_ops = { .valid = loongson_pm_valid_state, .enter = loongson_pm_enter, }; diff --git a/arch/powerpc/platforms/52xx/lite5200_pm.c b/arch/powerpc/platforms/52xx/lite5200_pm.c index 80234e5921f5..eda0fc2a3914 100644 --- a/arch/powerpc/platforms/52xx/lite5200_pm.c +++ b/arch/powerpc/platforms/52xx/lite5200_pm.c @@ -232,7 +232,7 @@ static void lite5200_pm_end(void) lite5200_pm_target_state = PM_SUSPEND_ON; } -static struct platform_suspend_ops lite5200_pm_ops = { +static const struct platform_suspend_ops lite5200_pm_ops = { .valid = lite5200_pm_valid, .begin = lite5200_pm_begin, .prepare = lite5200_pm_prepare, diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pm.c b/arch/powerpc/platforms/52xx/mpc52xx_pm.c index 568cef636275..8310e8b5b57f 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pm.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pm.c @@ -186,7 +186,7 @@ void mpc52xx_pm_finish(void) iounmap(mbar); } -static struct platform_suspend_ops mpc52xx_pm_ops = { +static const struct platform_suspend_ops mpc52xx_pm_ops = { .valid = mpc52xx_pm_valid, .prepare = mpc52xx_pm_prepare, .enter = mpc52xx_pm_enter, diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c index 75ae77f1af6a..fd4f2f2f19e6 100644 --- a/arch/powerpc/platforms/83xx/suspend.c +++ b/arch/powerpc/platforms/83xx/suspend.c @@ -311,7 +311,7 @@ static int mpc83xx_is_pci_agent(void) return ret; } -static struct platform_suspend_ops mpc83xx_suspend_ops = { +static const struct platform_suspend_ops mpc83xx_suspend_ops = { .valid = mpc83xx_suspend_valid, .begin = mpc83xx_suspend_begin, .enter = mpc83xx_suspend_enter, diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c index ed72098bb4e3..a8ca289ff267 100644 --- a/arch/powerpc/platforms/pseries/suspend.c +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -153,7 +153,7 @@ static struct sysdev_class suspend_sysdev_class = { .name = "power", }; -static struct platform_suspend_ops pseries_suspend_ops = { +static const struct platform_suspend_ops pseries_suspend_ops = { .valid = suspend_valid_only_mem, .begin = pseries_suspend_begin, .prepare_late = pseries_prepare_late, diff --git a/arch/powerpc/sysdev/fsl_pmc.c b/arch/powerpc/sysdev/fsl_pmc.c index 44de8559c975..e9381bfefb21 100644 --- a/arch/powerpc/sysdev/fsl_pmc.c +++ b/arch/powerpc/sysdev/fsl_pmc.c @@ -53,7 +53,7 @@ static int pmc_suspend_valid(suspend_state_t state) return 1; } -static struct platform_suspend_ops pmc_suspend_ops = { +static const struct platform_suspend_ops pmc_suspend_ops = { .valid = pmc_suspend_valid, .enter = pmc_suspend_enter, }; diff --git a/arch/sh/boards/mach-hp6xx/pm.c b/arch/sh/boards/mach-hp6xx/pm.c index 4499a3749d40..adc9b4bba828 100644 --- a/arch/sh/boards/mach-hp6xx/pm.c +++ b/arch/sh/boards/mach-hp6xx/pm.c @@ -143,7 +143,7 @@ static int hp6x0_pm_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops hp6x0_pm_ops = { +static const struct platform_suspend_ops hp6x0_pm_ops = { .enter = hp6x0_pm_enter, .valid = suspend_valid_only_mem, }; diff --git a/arch/sh/kernel/cpu/shmobile/pm.c b/arch/sh/kernel/cpu/shmobile/pm.c index e55968712706..a6f95ae4aae7 100644 --- a/arch/sh/kernel/cpu/shmobile/pm.c +++ b/arch/sh/kernel/cpu/shmobile/pm.c @@ -141,7 +141,7 @@ static int sh_pm_enter(suspend_state_t state) return 0; } -static struct platform_suspend_ops sh_pm_ops = { +static const struct platform_suspend_ops sh_pm_ops = { .enter = sh_pm_enter, .valid = suspend_valid_only_mem, }; diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 5149c9bf7015..ba1caf724a16 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -319,7 +319,7 @@ static int acpi_suspend_state_valid(suspend_state_t pm_state) } } -static struct platform_suspend_ops acpi_suspend_ops = { +static const struct platform_suspend_ops acpi_suspend_ops = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin, .prepare_late = acpi_pm_prepare, @@ -347,7 +347,7 @@ static int acpi_suspend_begin_old(suspend_state_t pm_state) * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has * been requested. */ -static struct platform_suspend_ops acpi_suspend_ops_old = { +static const struct platform_suspend_ops acpi_suspend_ops_old = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin_old, .prepare_late = acpi_pm_pre_suspend, diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index cd29c8248386..8b021eb0d48c 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -2257,7 +2257,7 @@ static int pmu_sleep_valid(suspend_state_t state) && (pmac_call_feature(PMAC_FTR_SLEEP_STATE, NULL, 0, -1) >= 0); } -static struct platform_suspend_ops pmu_pm_ops = { +static const struct platform_suspend_ops pmu_pm_ops = { .enter = powerbook_sleep, .valid = pmu_sleep_valid, }; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 40ead943fd6a..f45f3ccfdfa9 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -122,7 +122,7 @@ struct platform_suspend_ops { * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. */ -extern void suspend_set_ops(struct platform_suspend_ops *ops); +extern void suspend_set_ops(const struct platform_suspend_ops *ops); extern int suspend_valid_only_mem(suspend_state_t state); /** @@ -147,7 +147,7 @@ extern int pm_suspend(suspend_state_t state); #else /* !CONFIG_SUSPEND */ #define suspend_valid_only_mem NULL -static inline void suspend_set_ops(struct platform_suspend_ops *ops) {} +static inline void suspend_set_ops(const struct platform_suspend_ops *ops) {} static inline int pm_suspend(suspend_state_t state) { return -ENOSYS; } #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 7335952ee473..80051bdde6f1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -30,13 +30,13 @@ const char *const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_MEM] = "mem", }; -static struct platform_suspend_ops *suspend_ops; +static const struct platform_suspend_ops *suspend_ops; /** * suspend_set_ops - Set the global suspend method table. * @ops: Pointer to ops structure. */ -void suspend_set_ops(struct platform_suspend_ops *ops) +void suspend_set_ops(const struct platform_suspend_ops *ops) { mutex_lock(&pm_mutex); suspend_ops = ops; -- cgit v1.3-8-gc7d7 From fa9f90be745d3b600a9d97a063be404c5e5d9071 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 28 Nov 2010 21:39:34 +0100 Subject: Kill off a bunch of warning: ‘inline’ is not at beginning of declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These warnings are spewed during a build of a 'allnoconfig' kernel (especially the ones from u64_stats_sync.h show up a lot) when building with -Wextra (which I often do).. They are a) annoying b) easy to get rid of. This patch kills them off. include/linux/u64_stats_sync.h:70:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:77:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:84:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:96:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:115:1: warning: ‘inline’ is not at beginning of declaration include/linux/u64_stats_sync.h:127:1: warning: ‘inline’ is not at beginning of declaration kernel/time.c:241:1: warning: ‘inline’ is not at beginning of declaration kernel/time.c:257:1: warning: ‘inline’ is not at beginning of declaration kernel/perf_event.c:4513:1: warning: ‘inline’ is not at beginning of declaration mm/page_alloc.c:4012:1: warning: ‘inline’ is not at beginning of declaration Signed-off-by: Jesper Juhl Signed-off-by: Jiri Kosina --- include/linux/u64_stats_sync.h | 12 ++++++------ kernel/perf_event.c | 2 +- kernel/time.c | 4 ++-- mm/page_alloc.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index fa261a0da280..8da8c4e87da3 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -67,21 +67,21 @@ struct u64_stats_sync { #endif }; -static void inline u64_stats_update_begin(struct u64_stats_sync *syncp) +static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) write_seqcount_begin(&syncp->seq); #endif } -static void inline u64_stats_update_end(struct u64_stats_sync *syncp) +static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) write_seqcount_end(&syncp->seq); #endif } -static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) return read_seqcount_begin(&syncp->seq); @@ -93,7 +93,7 @@ static unsigned int inline u64_stats_fetch_begin(const struct u64_stats_sync *sy #endif } -static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) @@ -112,7 +112,7 @@ static bool inline u64_stats_fetch_retry(const struct u64_stats_sync *syncp, * - UP 32bit must disable BH. * - 64bit have no problem atomically reading u64 values, irq safe. */ -static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp) +static inline unsigned int u64_stats_fetch_begin_bh(const struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) return read_seqcount_begin(&syncp->seq); @@ -124,7 +124,7 @@ static unsigned int inline u64_stats_fetch_begin_bh(const struct u64_stats_sync #endif } -static bool inline u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp, +static inline bool u64_stats_fetch_retry_bh(const struct u64_stats_sync *syncp, unsigned int start) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 517d827f4982..06682e7b12e2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4454,7 +4454,7 @@ int perf_swevent_get_recursion_context(void) } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); -void inline perf_swevent_put_recursion_context(int rctx) +inline void perf_swevent_put_recursion_context(int rctx) { struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); diff --git a/kernel/time.c b/kernel/time.c index ba9b338d1835..32174359576f 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -238,7 +238,7 @@ EXPORT_SYMBOL(current_fs_time); * Avoid unnecessary multiplications/divisions in the * two most common HZ cases: */ -unsigned int inline jiffies_to_msecs(const unsigned long j) +inline unsigned int jiffies_to_msecs(const unsigned long j) { #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) return (MSEC_PER_SEC / HZ) * j; @@ -254,7 +254,7 @@ unsigned int inline jiffies_to_msecs(const unsigned long j) } EXPORT_SYMBOL(jiffies_to_msecs); -unsigned int inline jiffies_to_usecs(const unsigned long j) +inline unsigned int jiffies_to_usecs(const unsigned long j) { #if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) return (USEC_PER_SEC / HZ) * j; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a654486f75..f823ee192e94 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4013,7 +4013,7 @@ static void __init setup_usemap(struct pglist_data *pgdat, zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); } #else -static void inline setup_usemap(struct pglist_data *pgdat, +static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zonesize) {} #endif /* CONFIG_SPARSEMEM */ -- cgit v1.3-8-gc7d7 From 41263fc6716dea402125c95f38ed83ebf59d5172 Mon Sep 17 00:00:00 2001 From: Ben Gardiner Date: Tue, 14 Dec 2010 11:39:44 -0500 Subject: kbuild: fix interaction of CONFIG_IKCONFIG and KCONFIG_CONFIG If you try to build a kernel with KCONFIG_CONFIG set (to a value not equal to .config) and that config sets CONFIG_IKCONFIG then the build will fail with: make[1]: *** No rule to make target `.config', needed by \ `kernel/config_data.gz'. Stop. because the kernel/Makefile contains a direct reference to .config. This issue has been present since the introduction of KCONFIG_CONFIG in 14cdd3c402bf7c66f0bcd76e290f0770a54a4b21. Signed-off-by: Ben Gardiner CC: Roman Zippel CC: Michal Marek Reviewed-by: Michal Marek Signed-off-by: Michal Marek --- Makefile | 1 + kernel/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Makefile b/Makefile index 6619720f50dd..ee77a3d88e0a 100644 --- a/Makefile +++ b/Makefile @@ -224,6 +224,7 @@ ifeq ($(ARCH),m68knommu) endif KCONFIG_CONFIG ?= .config +export KCONFIG_CONFIG # SHELL used by kbuild CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ diff --git a/kernel/Makefile b/kernel/Makefile index 0b5ff083fa22..33e0a39cf359 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -121,7 +121,7 @@ $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. # Info from config_data can be extracted from /proc/config* targets += config_data.gz -$(obj)/config_data.gz: .config FORCE +$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) quiet_cmd_ikconfiggz = IKCFG $@ -- cgit v1.3-8-gc7d7 From 43b210139a3cb09d49a18f0dc9bed3674c55f235 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 22 Dec 2010 19:01:47 +0100 Subject: hrtimer: fix a typo in comment Signed-off-by: Namhyung Kim Signed-off-by: Jiri Kosina --- kernel/hrtimer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6cf..66163dfe810b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1774,7 +1774,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, } /* - * A NULL parameter means "inifinte" + * A NULL parameter means "infinite" */ if (!expires) { schedule(); -- cgit v1.3-8-gc7d7 From 133f1128b2bf178a1976b17c54bd14ce6feb90bf Mon Sep 17 00:00:00 2001 From: Tracey Dent Date: Thu, 25 Nov 2010 23:41:29 +0100 Subject: PM: Use proper ccflag flag in kernel/power/Makefile Use the ccflags-$ flag instead of EXTRA_CFLAGS because EXTRA_CFLAGS is deprecated and should now be switched. According to (documentation/kbuild/makefiles.txt). Signed-off-by: Tracey Dent Signed-off-by: Rafael J. Wysocki --- kernel/power/Makefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185d..b75597235d85 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,4 @@ - -ifeq ($(CONFIG_PM_DEBUG),y) -EXTRA_CFLAGS += -DDEBUG -endif +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG obj-$(CONFIG_PM) += main.o obj-$(CONFIG_PM_SLEEP) += console.o -- cgit v1.3-8-gc7d7 From 8cfe400ca54fd1ed96f962bea5f7e20b09b6d69f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 26 Nov 2010 23:07:27 +0100 Subject: Freezer: Fix a race during freezing of TASK_STOPPED tasks After calling freeze_task(), try_to_freeze_tasks() see whether the task is stopped or traced and if so, considers it to be frozen; however, nothing guarantees that either the task being frozen sees TIF_FREEZE or the freezer sees TASK_STOPPED -> TASK_RUNNING transition. The task being frozen may wake up and not see TIF_FREEZE while the freezer fails to notice the transition and believes the task is still stopped. This patch fixes the race by making freeze_task() always go through fake_signal_wake_up() for applicable tasks. The function goes through the target task's scheduler lock and thus guarantees that either the target sees TIF_FREEZE or try_to_freeze_task() sees TASK_RUNNING. Signed-off-by: Tejun Heo Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 9 +++++++-- kernel/power/process.c | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/freezer.c b/kernel/freezer.c index bd1d42b17cb2..66ecd2ead215 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -104,8 +104,13 @@ bool freeze_task(struct task_struct *p, bool sig_only) } if (should_send_signal(p)) { - if (!signal_pending(p)) - fake_signal_wake_up(p); + fake_signal_wake_up(p); + /* + * fake_signal_wake_up() goes through p's scheduler + * lock and guarantees that TASK_STOPPED/TRACED -> + * TASK_RUNNING transition can't race with task state + * testing in try_to_freeze_tasks(). + */ } else if (sig_only) { return false; } else { diff --git a/kernel/power/process.c b/kernel/power/process.c index e50b4c1b2a0f..eb2c88a9e562 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -64,6 +64,12 @@ static int try_to_freeze_tasks(bool sig_only) * perturb a task in TASK_STOPPED or TASK_TRACED. * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. + * + * Because freeze_task() goes through p's + * scheduler lock after setting TIF_FREEZE, it's + * guaranteed that either we see TASK_RUNNING or + * try_to_stop() after schedule() in ptrace/signal + * stop sees TIF_FREEZE. */ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) -- cgit v1.3-8-gc7d7 From 5729c63a51f0f8a351e0f1dc7b3250ebac12c309 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Fri, 26 Nov 2010 23:07:48 +0100 Subject: PM / Hibernate: hibernation_ops->leave should be checked too Because hibernate calls hibernation_ops->leave() without checking whether hibernation_ops->leave is NULL or not, hiberantion_set_ops should WARN_ON if hibernation_ops->leave is NULL. This patch added one more condition to check hibernation_ops->leave. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 048d0b514831..ab2836c25038 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -62,7 +62,7 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops) { if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore - && ops->restore_cleanup)) { + && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } -- cgit v1.3-8-gc7d7 From 5262a47502adcfc3a64403120768f528418a3b79 Mon Sep 17 00:00:00 2001 From: MyungJoo Ham Date: Fri, 26 Nov 2010 23:07:56 +0100 Subject: PM / Hibernate: When failed, in_suspend should be reset When hibernation failed due to an error in swsusp_write() called by hibernate(), it skips calling "power_down()" and returns. When hibernate() is called again (probably after fixing up so that swsusp_write() wouldn't fail again), before "in_suspend = 1" of create_image is called, in_suspend should be 0. However, because hibernate() did not reset "in_suspend" after a failure, it's already 1. This patch fixes such inconsistency of "in_suspend" value. Signed-off-by: MyungJoo Ham Signed-off-by: Kyungmin Park Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index ab2836c25038..c9a98beffee4 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -647,6 +647,7 @@ int hibernate(void) swsusp_free(); if (!error) power_down(); + in_suspend = 0; pm_restore_gfp_mask(); } else { pr_debug("PM: Image restored successfully.\n"); -- cgit v1.3-8-gc7d7 From a2867e08c8e3bdbc00caf56bc3bdde19ccc058e3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Dec 2010 22:58:31 +0100 Subject: PM / Wakeup: Replace pm_check_wakeup_events() with pm_wakeup_pending() To avoid confusion with the meaning and return value of pm_check_wakeup_events() replace it with pm_wakeup_pending() that will work the other way around (ie. return true when system-wide power transition should be aborted). Signed-off-by: Rafael J. Wysocki --- drivers/base/power/main.c | 2 +- drivers/base/power/wakeup.c | 20 ++++++++++---------- include/linux/suspend.h | 4 ++-- kernel/power/hibernate.c | 4 ++-- kernel/power/process.c | 2 +- kernel/power/suspend.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 4747a1e8b44a..8a5258339ca2 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1056,7 +1056,7 @@ static int dpm_prepare(pm_message_t state) if (pm_runtime_barrier(dev) && device_may_wakeup(dev)) pm_wakeup_event(dev, 0); - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { pm_runtime_put_sync(dev); error = -EBUSY; } else { diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 71c5528e1c35..8ec406d8f548 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -542,26 +542,26 @@ static void pm_wakeup_update_hit_counts(void) } /** - * pm_check_wakeup_events - Check for new wakeup events. + * pm_wakeup_pending - Check if power transition in progress should be aborted. * * Compare the current number of registered wakeup events with its preserved - * value from the past to check if new wakeup events have been registered since - * the old value was stored. Check if the current number of wakeup events being - * processed is zero. + * value from the past and return true if new wakeup events have been registered + * since the old value was stored. Also return true if the current number of + * wakeup events being processed is different from zero. */ -bool pm_check_wakeup_events(void) +bool pm_wakeup_pending(void) { unsigned long flags; - bool ret = true; + bool ret = false; spin_lock_irqsave(&events_lock, flags); if (events_check_enabled) { - ret = ((unsigned int)atomic_read(&event_count) == saved_count) - && !atomic_read(&events_in_progress); - events_check_enabled = ret; + ret = ((unsigned int)atomic_read(&event_count) != saved_count) + || atomic_read(&events_in_progress); + events_check_enabled = !ret; } spin_unlock_irqrestore(&events_lock, flags); - if (!ret) + if (ret) pm_wakeup_update_hit_counts(); return ret; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 26697514c5ec..144b34be5c32 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -292,7 +292,7 @@ extern int unregister_pm_notifier(struct notifier_block *nb); /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; -extern bool pm_check_wakeup_events(void); +extern bool pm_wakeup_pending(void); extern bool pm_get_wakeup_count(unsigned int *count); extern bool pm_save_wakeup_count(unsigned int count); #else /* !CONFIG_PM_SLEEP */ @@ -309,7 +309,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) #define pm_notifier(fn, pri) do { (void)(fn); } while (0) -static inline bool pm_check_wakeup_events(void) { return true; } +static inline bool pm_wakeup_pending(void) { return false; } #endif /* !CONFIG_PM_SLEEP */ extern struct mutex pm_mutex; diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c9a98beffee4..870f72bc72ae 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -278,7 +278,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) + if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) goto Power_up; in_suspend = 1; @@ -516,7 +516,7 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { error = -EAGAIN; goto Power_up; } diff --git a/kernel/power/process.c b/kernel/power/process.c index eb2c88a9e562..d6d2a10320e0 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -85,7 +85,7 @@ static int try_to_freeze_tasks(bool sig_only) if (!todo || time_after(jiffies, end_time)) break; - if (!pm_check_wakeup_events()) { + if (pm_wakeup_pending()) { wakeup = true; break; } diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ecf770509d0d..5e644e3a6bf3 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -163,7 +163,7 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { + if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } -- cgit v1.3-8-gc7d7 From 26fcaf60fe3861409eb4c455c5c0d0f00f599b08 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 7 Jan 2011 01:42:31 +0100 Subject: PM: Fix oops in suspend/hibernate code related to failing ioremap() When ioremap() fails (which might happen for some reason), we nicely oops in suspend_nvs_save() due to NULL dereference by memcpy() in there. Fail gracefully instead. Signed-off-by: Jiri Slaby Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- drivers/acpi/sleep.c | 5 ++--- include/linux/suspend.h | 4 ++-- kernel/power/nvs.c | 8 +++++++- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index febb153b5a68..d8bca6c90719 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -124,8 +124,7 @@ static int acpi_pm_freeze(void) static int acpi_pm_pre_suspend(void) { acpi_pm_freeze(); - suspend_nvs_save(); - return 0; + return suspend_nvs_save(); } /** @@ -151,7 +150,7 @@ static int acpi_pm_prepare(void) { int error = __acpi_pm_prepare(); if (!error) - acpi_pm_pre_suspend(); + error = acpi_pm_pre_suspend(); return error; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 26697514c5ec..acb7d911bb0c 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -262,7 +262,7 @@ static inline bool system_entering_hibernation(void) { return false; } extern int suspend_nvs_register(unsigned long start, unsigned long size); extern int suspend_nvs_alloc(void); extern void suspend_nvs_free(void); -extern void suspend_nvs_save(void); +extern int suspend_nvs_save(void); extern void suspend_nvs_restore(void); #else /* CONFIG_SUSPEND_NVS */ static inline int suspend_nvs_register(unsigned long a, unsigned long b) @@ -271,7 +271,7 @@ static inline int suspend_nvs_register(unsigned long a, unsigned long b) } static inline int suspend_nvs_alloc(void) { return 0; } static inline void suspend_nvs_free(void) {} -static inline void suspend_nvs_save(void) {} +static inline int suspend_nvs_save(void) {} static inline void suspend_nvs_restore(void) {} #endif /* CONFIG_SUSPEND_NVS */ diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c index 1836db60bbb6..57c6fabbb6b6 100644 --- a/kernel/power/nvs.c +++ b/kernel/power/nvs.c @@ -105,7 +105,7 @@ int suspend_nvs_alloc(void) /** * suspend_nvs_save - save NVS memory regions */ -void suspend_nvs_save(void) +int suspend_nvs_save(void) { struct nvs_page *entry; @@ -114,8 +114,14 @@ void suspend_nvs_save(void) list_for_each_entry(entry, &nvs_list, node) if (entry->data) { entry->kaddr = ioremap(entry->phys_start, entry->size); + if (!entry->kaddr) { + suspend_nvs_free(); + return -ENOMEM; + } memcpy(entry->data, entry->kaddr, entry->size); } + + return 0; } /** -- cgit v1.3-8-gc7d7 From 976513dbfc1547c7b1822566923058655f0c32fd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 7 Jan 2011 01:43:44 +0100 Subject: PM / ACPI: Move NVS saving and restoring code to drivers/acpi The saving of the ACPI NVS area during hibernation and suspend and restoring it during the subsequent resume is entirely specific to ACPI, so move it to drivers/acpi and drop the CONFIG_SUSPEND_NVS configuration option which is redundant. Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- arch/x86/kernel/e820.c | 1 + drivers/acpi/Makefile | 2 +- drivers/acpi/internal.h | 8 +++ drivers/acpi/nvs.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/acpi.h | 9 +++ include/linux/suspend.h | 17 ------ kernel/power/Kconfig | 5 -- kernel/power/Makefile | 1 - kernel/power/nvs.c | 142 ------------------------------------------------ 9 files changed, 161 insertions(+), 166 deletions(-) create mode 100644 drivers/acpi/nvs.c delete mode 100644 kernel/power/nvs.c (limited to 'kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0c2b7ef7a34d..294f26da0c0c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 3d031d02e54b..9cc9f2c4da79 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -24,7 +24,7 @@ acpi-y += atomicio.o # sleep related files acpi-y += wakeup.o acpi-y += sleep.o -acpi-$(CONFIG_ACPI_SLEEP) += proc.o +acpi-$(CONFIG_ACPI_SLEEP) += proc.o nvs.o # diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index a212bfeddf8c..7c23b76e8eca 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -82,8 +82,16 @@ extern int acpi_sleep_init(void); #ifdef CONFIG_ACPI_SLEEP int acpi_sleep_proc_init(void); +int suspend_nvs_alloc(void); +void suspend_nvs_free(void); +int suspend_nvs_save(void); +void suspend_nvs_restore(void); #else static inline int acpi_sleep_proc_init(void) { return 0; } +static inline int suspend_nvs_alloc(void) { return 0; } +static inline void suspend_nvs_free(void) {} +static inline int suspend_nvs_save(void) {} +static inline void suspend_nvs_restore(void) {} #endif #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/nvs.c b/drivers/acpi/nvs.c new file mode 100644 index 000000000000..57c6fabbb6b6 --- /dev/null +++ b/drivers/acpi/nvs.c @@ -0,0 +1,142 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * suspend and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * suspend_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int suspend_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * suspend_nvs_free - free data pages allocated for saving NVS regions + */ +void suspend_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int suspend_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + suspend_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * suspend_nvs_save - save NVS memory regions + */ +int suspend_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + if (!entry->kaddr) { + suspend_nvs_free(); + return -ENOMEM; + } + memcpy(entry->data, entry->kaddr, entry->size); + } + + return 0; +} + +/** + * suspend_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void suspend_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 67c91b4418b0..fa7ed6a983d0 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -254,6 +254,15 @@ void __init acpi_old_suspend_ordering(void); void __init acpi_nvs_nosave(void); #endif /* CONFIG_PM_SLEEP */ +#ifdef CONFIG_ACPI_SLEEP +int suspend_nvs_register(unsigned long start, unsigned long size); +#else +static inline int suspend_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +#endif + struct acpi_osc_context { char *uuid_str; /* uuid string */ int rev; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index acb7d911bb0c..0e288e3c37be 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -258,23 +258,6 @@ static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ -#ifdef CONFIG_SUSPEND_NVS -extern int suspend_nvs_register(unsigned long start, unsigned long size); -extern int suspend_nvs_alloc(void); -extern void suspend_nvs_free(void); -extern int suspend_nvs_save(void); -extern void suspend_nvs_restore(void); -#else /* CONFIG_SUSPEND_NVS */ -static inline int suspend_nvs_register(unsigned long a, unsigned long b) -{ - return 0; -} -static inline int suspend_nvs_alloc(void) { return 0; } -static inline void suspend_nvs_free(void) {} -static inline int suspend_nvs_save(void) {} -static inline void suspend_nvs_restore(void) {} -#endif /* CONFIG_SUSPEND_NVS */ - #ifdef CONFIG_PM_SLEEP void save_processor_state(void); void restore_processor_state(void); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a5aff3ebad38..265729966ece 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -100,13 +100,9 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n -config SUSPEND_NVS - bool - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE - select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -140,7 +136,6 @@ config HIBERNATION depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE select LZO_COMPRESS select LZO_DECOMPRESS - select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index f9063c6b185d..120a15823325 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,6 +10,5 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c deleted file mode 100644 index 57c6fabbb6b6..000000000000 --- a/kernel/power/nvs.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * suspend and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * suspend_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int suspend_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * suspend_nvs_free - free data pages allocated for saving NVS regions - */ -void suspend_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * suspend_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int suspend_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - suspend_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * suspend_nvs_save - save NVS memory regions - */ -int suspend_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - if (!entry->kaddr) { - suspend_nvs_free(); - return -ENOMEM; - } - memcpy(entry->data, entry->kaddr, entry->size); - } - - return 0; -} - -/** - * suspend_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void suspend_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} -- cgit v1.3-8-gc7d7 From 23036f1a340beec19cc451ba9719526c4ffb3a57 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 7 Jan 2011 08:53:58 +0100 Subject: blktrace: add missing probe argument to block_bio_complete blktrace.c block bio complete callback needs to gain a new argument to reflect the newly added "error" tracepoint argument. This is needed to match the new block_bio_complete TRACE_EVENT as of commit de983a7bfcb7c020901ca6e2314cf55a4207ab5a. Signed-off-by: Mathieu Desnoyers CC: Jeff Moyer CC: Steven Rostedt CC: Frederic Weisbecker CC: Ingo Molnar CC: Thomas Gleixner CC: Li Zefan Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2b8e2ee7c0ef..fab31aff9d97 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -782,7 +782,8 @@ static void blk_add_trace_bio_bounce(void *ignore, } static void blk_add_trace_bio_complete(void *ignore, - struct request_queue *q, struct bio *bio) + struct request_queue *q, struct bio *bio, + int error) { blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); } -- cgit v1.3-8-gc7d7 From 0b3fcf178deefd7b64154c2c0760a2c63df0b74f Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 3 Jan 2011 18:20:01 +0200 Subject: perf_events: Move code around to prepare for cgroup In particular this patch move perf_event_exit_task() before cgroup_exit() to allow for cgroup support. The cgroup_exit() function detaches the cgroups attached to a task. Other movements include hoisting some definitions and inlines at the top of perf_event.c Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d22058b.cdace30a.4657.ffff95b1@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/exit.c | 14 +++++++++----- kernel/perf_event.c | 28 +++++++++++++++++----------- 2 files changed, 26 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 676149a4ac5f..8cb89045ecf3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -994,6 +994,15 @@ NORET_TYPE void do_exit(long code) exit_fs(tsk); check_stack_usage(); exit_thread(); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + * + * because of cgroup mode, must be called before cgroup_exit() + */ + perf_event_exit_task(tsk); + cgroup_exit(tsk, 1); if (group_dead) @@ -1007,11 +1016,6 @@ NORET_TYPE void do_exit(long code) * FIXME: do that only when needed, using sched_exit tracepoint */ flush_ptrace_hw_breakpoint(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 11847bf1e8cc..2c14e3afdf0d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -38,6 +38,12 @@ #include +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + atomic_t perf_task_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -65,6 +71,12 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; static atomic64_t perf_event_id; +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -72,6 +84,11 @@ extern __weak const char *perf_pmu_name(void) return "pmu"; } +static inline u64 perf_clock(void) +{ + return local_clock(); +} + void perf_pmu_disable(struct pmu *pmu) { int *count = this_cpu_ptr(pmu->pmu_disable_count); @@ -240,11 +257,6 @@ static void perf_unpin_context(struct perf_event_context *ctx) put_ctx(ctx); } -static inline u64 perf_clock(void) -{ - return local_clock(); -} - /* * Update the record of the current time in a context. */ @@ -1193,12 +1205,6 @@ static int perf_event_refresh(struct perf_event *event, int refresh) return 0; } -enum event_type_t { - EVENT_FLEXIBLE = 0x1, - EVENT_PINNED = 0x2, - EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, -}; - static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) -- cgit v1.3-8-gc7d7 From 5632ab12e9e1fcd7e94058567e181d8f35e83798 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 3 Jan 2011 18:20:01 +0200 Subject: perf_events: Generalize use of event_filter_match() Replace all occurrences of: event->cpu != -1 && event->cpu == smp_processor_id() by a call to: event_filter_match(event) This makes the code more consistent and will make the cgroup patch smaller. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d220593.2308e30a.48c5.ffff8ae9@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2c14e3afdf0d..dcdb19ed83a6 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -949,7 +949,7 @@ static void __perf_install_in_context(void *info) add_event_to_ctx(event, ctx); - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) goto unlock; /* @@ -1094,7 +1094,7 @@ static void __perf_event_enable(void *info) goto unlock; __perf_event_mark_enabled(event, ctx); - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) goto unlock; /* @@ -1441,7 +1441,7 @@ ctx_pinned_sched_in(struct perf_event_context *ctx, list_for_each_entry(event, &ctx->pinned_groups, group_entry) { if (event->state <= PERF_EVENT_STATE_OFF) continue; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; if (group_can_go_on(event, cpuctx, 1)) @@ -1473,7 +1473,7 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, * Listen to the 'cpu' scheduling filter constraint * of events: */ - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; if (group_can_go_on(event, cpuctx, can_add_hw)) { @@ -1700,7 +1700,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) if (event->state != PERF_EVENT_STATE_ACTIVE) continue; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) continue; hwc = &event->hw; @@ -3899,7 +3899,7 @@ static int perf_event_task_match(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if (event->attr.comm || event->attr.mmap || @@ -4036,7 +4036,7 @@ static int perf_event_comm_match(struct perf_event *event) if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if (event->attr.comm) @@ -4184,7 +4184,7 @@ static int perf_event_mmap_match(struct perf_event *event, if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; - if (event->cpu != -1 && event->cpu != smp_processor_id()) + if (!event_filter_match(event)) return 0; if ((!executable && event->attr.mmap_data) || -- cgit v1.3-8-gc7d7 From 4158755d3136f4cb05c1a8a260e9c06f93baeb48 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 3 Jan 2011 18:20:01 +0200 Subject: perf_events: Add perf_event_time() Adds perf_event_time() to try and centralize access to event timing and in particular ctx->time. Prepares for cgroup support. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4d22059c.122ae30a.5e0e.ffff8b8b@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index dcdb19ed83a6..b782b7a79f00 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -268,6 +268,12 @@ static void update_context_time(struct perf_event_context *ctx) ctx->timestamp = now; } +static u64 perf_event_time(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + return ctx ? ctx->time : 0; +} + /* * Update the total_time_enabled and total_time_running fields for a event. */ @@ -281,7 +287,7 @@ static void update_event_times(struct perf_event *event) return; if (ctx->is_active) - run_end = ctx->time; + run_end = perf_event_time(event); else run_end = event->tstamp_stopped; @@ -290,7 +296,7 @@ static void update_event_times(struct perf_event *event) if (event->state == PERF_EVENT_STATE_INACTIVE) run_end = event->tstamp_stopped; else - run_end = ctx->time; + run_end = perf_event_time(event); event->total_time_running = run_end - event->tstamp_running; } @@ -546,6 +552,7 @@ event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); u64 delta; /* * An event which could not be activated because of @@ -557,7 +564,7 @@ event_sched_out(struct perf_event *event, && !event_filter_match(event)) { delta = ctx->time - event->tstamp_stopped; event->tstamp_running += delta; - event->tstamp_stopped = ctx->time; + event->tstamp_stopped = tstamp; } if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -568,7 +575,7 @@ event_sched_out(struct perf_event *event, event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = ctx->time; + event->tstamp_stopped = tstamp; event->pmu->del(event, 0); event->oncpu = -1; @@ -780,6 +787,8 @@ event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); + if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -796,9 +805,9 @@ event_sched_in(struct perf_event *event, return -EAGAIN; } - event->tstamp_running += ctx->time - event->tstamp_stopped; + event->tstamp_running += tstamp - event->tstamp_stopped; - event->shadow_ctx_time = ctx->time - ctx->timestamp; + event->shadow_ctx_time = tstamp - ctx->timestamp; if (!is_software_event(event)) cpuctx->active_oncpu++; @@ -910,11 +919,13 @@ static int group_can_go_on(struct perf_event *event, static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { + u64 tstamp = perf_event_time(event); + list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = ctx->time; - event->tstamp_running = ctx->time; - event->tstamp_stopped = ctx->time; + event->tstamp_enabled = tstamp; + event->tstamp_running = tstamp; + event->tstamp_stopped = tstamp; } /* @@ -1054,14 +1065,13 @@ static void __perf_event_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { struct perf_event *sub; + u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = ctx->time - event->total_time_enabled; + event->tstamp_enabled = tstamp - event->total_time_enabled; list_for_each_entry(sub, &event->sibling_list, group_entry) { - if (sub->state >= PERF_EVENT_STATE_INACTIVE) { - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; - } + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = tstamp - sub->total_time_enabled; } } -- cgit v1.3-8-gc7d7 From 1dbd1951f39e13da579ffe879cce19586d0462de Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 9 Dec 2010 15:47:56 +0800 Subject: tracing: Fix preempt count leak While running my ftrace stress test, this showed up: BUG: sleeping function called from invalid context at mm/mmap.c:233 ... note: cat[3293] exited with preempt_count 1 The bug was introduced by commit 91e86e560d0b3ce4c5fc64fd2bbb99f856a30a4e ("tracing: Fix recursive user stack trace") Cc: Signed-off-by: Li Zefan LKML-Reference: <4D0089AC.1020802@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f8cf959bad45..dc53ecb80589 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1313,12 +1313,10 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) __this_cpu_inc(user_stack_count); - - event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) - return; + goto out_drop_count; entry = ring_buffer_event_data(event); entry->tgid = current->tgid; @@ -1333,8 +1331,8 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!filter_check_discard(call, entry, buffer, event)) ring_buffer_unlock_commit(buffer, event); + out_drop_count: __this_cpu_dec(user_stack_count); - out: preempt_enable(); } -- cgit v1.3-8-gc7d7 From 870915e047a2da695be118d32dd5a900f0c3e933 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 28 Oct 2010 11:31:17 -0400 Subject: tracing: Fix TRACE_EVENT power tracepoint creation DEFINE_TRACE should also exist when CONFIG_EVENT_TRACING=n. Otherwise, setting only TRACEPOINTS=y is broken. Acked-by: Arjan van de Ven Signed-off-by: Mathieu Desnoyers LKML-Reference: <20101028153117.GA4051@Krystal> Signed-off-by: Steven Rostedt --- kernel/Makefile | 1 + kernel/trace/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0b5ff083fa22..e0f2831634b4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 53f338190b26..761c510a06c5 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o -obj-$(CONFIG_EVENT_TRACING) += power-traces.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif -- cgit v1.3-8-gc7d7 From 797a455d2c682476c3797dbfecf5bf84c1e3b9d3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jan 2011 09:06:44 +0100 Subject: block: ensure that completion error gets properly traced We normally just use the BIO_UPTODATE flag to signal 0/-EIO. If we have more information available, we should pass that along to the trace output. Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fab31aff9d97..153562d0b93c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -758,54 +758,58 @@ static void blk_add_trace_rq_complete(void *ignore, * @q: queue the io is for * @bio: the source bio * @what: the action + * @error: error, if any * * Description: * Records an action against a bio. Will log the bio offset + size. * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what) + u32 what, int error) { struct blk_trace *bt = q->blk_trace; if (likely(!bt)) return; + if (!error && !bio_flagged(bio, BIO_UPTODATE)) + error = EIO; + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, - !bio_flagged(bio, BIO_UPTODATE), 0, NULL); + error, 0, NULL); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } static void blk_add_trace_bio_backmerge(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, @@ -813,7 +817,7 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -828,7 +832,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { struct blk_trace *bt = q->blk_trace; -- cgit v1.3-8-gc7d7 From f123c98e7f168e949b283690693695f988332c3d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 6 Jan 2011 15:08:29 -0500 Subject: rtmutex: Fix comment about why new_owner can be NULL in wake_futex_pi() The comment about why rt_mutex_next_owner() can return NULL in wake_futex_pi() is not the normal case. Tracing the cause of why this occurs is more likely that waiter simply timedout. But because it originally caused contention with the futex, the owner will go into the kernel when it unlocks the lock. Then it will hit this code path and rt_mutex_next_owner() will return NULL. Cc: Thomas Gleixner Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/futex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92e6917..5696d38cc71d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -791,10 +791,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* - * This happens when we have stolen the lock and the original - * pending owner did not enqueue itself back on the rt_mutex. - * Thats not a tragedy. We know that way, that a lock waiter - * is on the fly. We make the futex_q waiter the pending owner. + * It is possible that the next waiter (the one that brought + * this owner to the kernel) timed out and is no longer + * waiting on the lock. */ if (!new_owner) new_owner = this->task; -- cgit v1.3-8-gc7d7 From e159489baa717dbae70f9903770a6a4990865887 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 9 Jan 2011 23:32:15 +0100 Subject: workqueue: relax lockdep annotation on flush_work() Currently, the lockdep annotation in flush_work() requires exclusive access on the workqueue the target work is queued on and triggers warning if a work is trying to flush another work on the same workqueue; however, this is no longer true as workqueues can now execute multiple works concurrently. This patch adds lock_map_acquire_read() and make process_one_work() hold read access to the workqueue while executing a work and start_flush_work() check for write access if concurrnecy level is one or the workqueue has a rescuer (as only one execution resource - the rescuer - is guaranteed to be available under memory pressure), and read access if higher. This better represents what's going on and removes spurious lockdep warnings which are triggered by fake dependency chain created through flush_work(). * Peter pointed out that flushing another work from a WQ_MEM_RECLAIM wq breaks forward progress guarantee under memory pressure. Condition check accordingly updated. Signed-off-by: Tejun Heo Reported-by: "Rafael J. Wysocki" Tested-by: "Rafael J. Wysocki" Cc: Peter Zijlstra Cc: stable@kernel.org --- include/linux/lockdep.h | 3 +++ kernel/workqueue.c | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 71c09b26c759..9f19430c7d07 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -522,12 +522,15 @@ static inline void print_irqtrace_events(struct task_struct *curr) #ifdef CONFIG_DEBUG_LOCK_ALLOC # ifdef CONFIG_PROVE_LOCKING # define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 2, NULL, _THIS_IP_) +# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 2, NULL, _THIS_IP_) # else # define lock_map_acquire(l) lock_acquire(l, 0, 0, 0, 1, NULL, _THIS_IP_) +# define lock_map_acquire_read(l) lock_acquire(l, 0, 0, 2, 1, NULL, _THIS_IP_) # endif # define lock_map_release(l) lock_release(l, 1, _THIS_IP_) #else # define lock_map_acquire(l) do { } while (0) +# define lock_map_acquire_read(l) do { } while (0) # define lock_map_release(l) do { } while (0) #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8ee6ec82f88a..930c2390b77e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1840,7 +1840,7 @@ __acquires(&gcwq->lock) spin_unlock_irq(&gcwq->lock); work_clear_pending(work); - lock_map_acquire(&cwq->wq->lockdep_map); + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); trace_workqueue_execute_start(work); f(work); @@ -2384,8 +2384,18 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, insert_wq_barrier(cwq, barr, work, worker); spin_unlock_irq(&gcwq->lock); - lock_map_acquire(&cwq->wq->lockdep_map); + /* + * If @max_active is 1 or rescuer is in use, flushing another work + * item on the same workqueue may lead to deadlock. Make sure the + * flusher is not running on the same workqueue by verifying write + * access. + */ + if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER) + lock_map_acquire(&cwq->wq->lockdep_map); + else + lock_map_acquire_read(&cwq->wq->lockdep_map); lock_map_release(&cwq->wq->lockdep_map); + return true; already_gone: spin_unlock_irq(&gcwq->lock); -- cgit v1.3-8-gc7d7 From 42c025f3de9042d9c9abd9a6f6205d1a0f4bcadf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Jan 2011 15:58:49 +0100 Subject: workqueue: note the nested NOT_RUNNING test in worker_clr_flags() isn't a noop The nested NOT_RUNNING test in worker_clr_flags() is slightly misleading in that if NOT_RUNNING were a single flag the nested test would be always %true and thus noop. Add a comment noting that the test isn't a noop. Signed-off-by: Tejun Heo Cc: Hillf Danton Cc: Andrew Morton --- kernel/workqueue.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 930c2390b77e..11869faa6819 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -768,7 +768,11 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) worker->flags &= ~flags; - /* if transitioning out of NOT_RUNNING, increment nr_running */ + /* + * If transitioning out of NOT_RUNNING, increment nr_running. Note + * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask + * of multiple flags, not a single flag. + */ if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(get_gcwq_nr_running(gcwq->cpu)); -- cgit v1.3-8-gc7d7 From 81e88fdc432a1552401d6e91a984dcccce72b8dc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 12 Jan 2011 14:44:55 +0800 Subject: ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Generic Hardware Error Source provides a way to report platform hardware errors (such as that from chipset). It works in so called "Firmware First" mode, that is, hardware errors are reported to firmware firstly, then reported to Linux by firmware. This way, some non-standard hardware error registers or non-standard hardware link can be checked by firmware to produce more valuable hardware error information for Linux. This patch adds POLL/IRQ/NMI notification types support. Because the memory area used to transfer hardware error information from BIOS to Linux can be determined only in NMI, IRQ or timer handler, but general ioremap can not be used in atomic context, so a special version of atomic ioremap is implemented for that. Known issue: - Error information can not be printed for recoverable errors notified via NMI, because printk is not NMI-safe. Will fix this via delay printing to IRQ context via irq_work or make printk NMI-safe. v2: - adjust printk format per comments. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 1 + arch/x86/kernel/dumpstack.c | 1 + drivers/acpi/apei/ghes.c | 406 +++++++++++++++++++++++++++++++++++--------- kernel/panic.c | 1 + lib/ioremap.c | 2 + mm/vmalloc.c | 1 + 6 files changed, 328 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 71232b941b6c..c2d0baa48d8c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -504,6 +504,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) return 0; } +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6e8752c1bd52..d34cf80ec402 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -240,6 +240,7 @@ unsigned __kprobes long oops_begin(void) bust_spinlocks(1); return flags; } +EXPORT_SYMBOL_GPL(oops_begin); void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 51905d07a4d9..d1d484d4a06a 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -12,10 +12,6 @@ * For more information about Generic Hardware Error Source, please * refer to ACPI Specification version 4.0, section 17.3.2.6 * - * Now, only SCI notification type and memory errors are - * supported. More notification type and hardware error type will be - * added later. - * * Copyright 2010 Intel Corp. * Author: Huang Ying * @@ -39,15 +35,18 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include +#include #include "apei-internal.h" @@ -56,42 +55,131 @@ #define GHES_ESTATUS_MAX_SIZE 65536 /* - * One struct ghes is created for each generic hardware error - * source. - * + * One struct ghes is created for each generic hardware error source. * It provides the context for APEI hardware error timer/IRQ/SCI/NMI - * handler. Handler for one generic hardware error source is only - * triggered after the previous one is done. So handler can uses - * struct ghes without locking. + * handler. * * estatus: memory buffer for error status block, allocated during * HEST parsing. */ #define GHES_TO_CLEAR 0x0001 +#define GHES_EXITING 0x0002 struct ghes { struct acpi_hest_generic *generic; struct acpi_hest_generic_status *estatus; - struct list_head list; u64 buffer_paddr; unsigned long flags; + union { + struct list_head list; + struct timer_list timer; + unsigned int irq; + }; }; +static int ghes_panic_timeout __read_mostly = 30; + /* - * Error source lists, one list for each notification method. The - * members in lists are struct ghes. + * All error sources notified with SCI shares one notifier function, + * so they need to be linked and checked one by one. This is applied + * to NMI too. * - * The list members are only added in HEST parsing and deleted during - * module_exit, that is, single-threaded. So no lock is needed for - * that. - * - * But the mutual exclusion is needed between members adding/deleting - * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is - * used for that. + * RCU is used for these lists, so ghes_list_mutex is only used for + * list changing, not for traversing. */ static LIST_HEAD(ghes_sci); +static LIST_HEAD(ghes_nmi); static DEFINE_MUTEX(ghes_list_mutex); +/* + * NMI may be triggered on any CPU, so ghes_nmi_lock is used for + * mutual exclusion. + */ +static DEFINE_RAW_SPINLOCK(ghes_nmi_lock); + +/* + * Because the memory area used to transfer hardware error information + * from BIOS to Linux can be determined only in NMI, IRQ or timer + * handler, but general ioremap can not be used in atomic context, so + * a special version of atomic ioremap is implemented for that. + */ + +/* + * Two virtual pages are used, one for NMI context, the other for + * IRQ/PROCESS context + */ +#define GHES_IOREMAP_PAGES 2 +#define GHES_IOREMAP_NMI_PAGE(base) (base) +#define GHES_IOREMAP_IRQ_PAGE(base) ((base) + PAGE_SIZE) + +/* virtual memory area for atomic ioremap */ +static struct vm_struct *ghes_ioremap_area; +/* + * These 2 spinlock is used to prevent atomic ioremap virtual memory + * area from being mapped simultaneously. + */ +static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); +static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); + +static int ghes_ioremap_init(void) +{ + ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, + VM_IOREMAP, VMALLOC_START, VMALLOC_END); + if (!ghes_ioremap_area) { + pr_err(GHES_PFX "Failed to allocate virtual memory area for atomic ioremap.\n"); + return -ENOMEM; + } + + return 0; +} + +static void ghes_ioremap_exit(void) +{ + free_vm_area(ghes_ioremap_area); +} + +static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) +{ + unsigned long vaddr; + + vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr); + ioremap_page_range(vaddr, vaddr + PAGE_SIZE, + pfn << PAGE_SHIFT, PAGE_KERNEL); + + return (void __iomem *)vaddr; +} + +static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) +{ + unsigned long vaddr; + + vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); + ioremap_page_range(vaddr, vaddr + PAGE_SIZE, + pfn << PAGE_SHIFT, PAGE_KERNEL); + + return (void __iomem *)vaddr; +} + +static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) +{ + unsigned long vaddr = (unsigned long __force)vaddr_ptr; + void *base = ghes_ioremap_area->addr; + + BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base)); + unmap_kernel_range_noflush(vaddr, PAGE_SIZE); + __flush_tlb_one(vaddr); +} + +static void ghes_iounmap_irq(void __iomem *vaddr_ptr) +{ + unsigned long vaddr = (unsigned long __force)vaddr_ptr; + void *base = ghes_ioremap_area->addr; + + BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); + unmap_kernel_range_noflush(vaddr, PAGE_SIZE); + __flush_tlb_one(vaddr); +} + static struct ghes *ghes_new(struct acpi_hest_generic *generic) { struct ghes *ghes; @@ -102,7 +190,6 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) if (!ghes) return ERR_PTR(-ENOMEM); ghes->generic = generic; - INIT_LIST_HEAD(&ghes->list); rc = acpi_pre_map_gar(&generic->error_status_address); if (rc) goto err_free; @@ -159,22 +246,41 @@ static inline int ghes_severity(int severity) } } -/* SCI handler run in work queue, so ioremap can be used here */ -static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, - int from_phys) +static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, + int from_phys) { - void *vaddr; - - vaddr = ioremap_cache(paddr, len); - if (!vaddr) - return -ENOMEM; - if (from_phys) - memcpy(buffer, vaddr, len); - else - memcpy(vaddr, buffer, len); - iounmap(vaddr); - - return 0; + void __iomem *vaddr; + unsigned long flags = 0; + int in_nmi = in_nmi(); + u64 offset; + u32 trunk; + + while (len > 0) { + offset = paddr - (paddr & PAGE_MASK); + if (in_nmi) { + raw_spin_lock(&ghes_ioremap_lock_nmi); + vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT); + } else { + spin_lock_irqsave(&ghes_ioremap_lock_irq, flags); + vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT); + } + trunk = PAGE_SIZE - offset; + trunk = min(trunk, len); + if (from_phys) + memcpy_fromio(buffer, vaddr + offset, trunk); + else + memcpy_toio(vaddr + offset, buffer, trunk); + len -= trunk; + paddr += trunk; + buffer += trunk; + if (in_nmi) { + ghes_iounmap_nmi(vaddr); + raw_spin_unlock(&ghes_ioremap_lock_nmi); + } else { + ghes_iounmap_irq(vaddr); + spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags); + } + } } static int ghes_read_estatus(struct ghes *ghes, int silent) @@ -195,10 +301,8 @@ static int ghes_read_estatus(struct ghes *ghes, int silent) if (!buf_paddr) return -ENOENT; - rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, - sizeof(*ghes->estatus), 1); - if (rc) - return rc; + ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, + sizeof(*ghes->estatus), 1); if (!ghes->estatus->block_status) return -ENOENT; @@ -213,17 +317,15 @@ static int ghes_read_estatus(struct ghes *ghes, int silent) goto err_read_block; if (apei_estatus_check_header(ghes->estatus)) goto err_read_block; - rc = ghes_copy_tofrom_phys(ghes->estatus + 1, - buf_paddr + sizeof(*ghes->estatus), - len - sizeof(*ghes->estatus), 1); - if (rc) - return rc; + ghes_copy_tofrom_phys(ghes->estatus + 1, + buf_paddr + sizeof(*ghes->estatus), + len - sizeof(*ghes->estatus), 1); if (apei_estatus_check(ghes->estatus)) goto err_read_block; rc = 0; err_read_block: - if (rc && !silent) + if (rc && !silent && printk_ratelimit()) pr_warning(FW_WARN GHES_PFX "Failed to read error status block!\n"); return rc; @@ -293,6 +395,42 @@ out: return 0; } +static void ghes_add_timer(struct ghes *ghes) +{ + struct acpi_hest_generic *g = ghes->generic; + unsigned long expire; + + if (!g->notify.poll_interval) { + pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for generic hardware error source: %d, disabled.\n", + g->header.source_id); + return; + } + expire = jiffies + msecs_to_jiffies(g->notify.poll_interval); + ghes->timer.expires = round_jiffies_relative(expire); + add_timer(&ghes->timer); +} + +static void ghes_poll_func(unsigned long data) +{ + struct ghes *ghes = (void *)data; + + ghes_proc(ghes); + if (!(ghes->flags & GHES_EXITING)) + ghes_add_timer(ghes); +} + +static irqreturn_t ghes_irq_func(int irq, void *data) +{ + struct ghes *ghes = data; + int rc; + + rc = ghes_proc(ghes); + if (rc) + return IRQ_NONE; + + return IRQ_HANDLED; +} + static int ghes_notify_sci(struct notifier_block *this, unsigned long event, void *data) { @@ -309,10 +447,63 @@ static int ghes_notify_sci(struct notifier_block *this, return ret; } +static int ghes_notify_nmi(struct notifier_block *this, + unsigned long cmd, void *data) +{ + struct ghes *ghes, *ghes_global = NULL; + int sev, sev_global = -1; + int ret = NOTIFY_DONE; + + if (cmd != DIE_NMI) + return ret; + + raw_spin_lock(&ghes_nmi_lock); + list_for_each_entry_rcu(ghes, &ghes_nmi, list) { + if (ghes_read_estatus(ghes, 1)) { + ghes_clear_estatus(ghes); + continue; + } + sev = ghes_severity(ghes->estatus->error_severity); + if (sev > sev_global) { + sev_global = sev; + ghes_global = ghes; + } + ret = NOTIFY_STOP; + } + + if (ret == NOTIFY_DONE) + goto out; + + if (sev_global >= GHES_SEV_PANIC) { + oops_begin(); + ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); + /* reboot to log the error! */ + if (panic_timeout == 0) + panic_timeout = ghes_panic_timeout; + panic("Fatal hardware error!"); + } + + list_for_each_entry_rcu(ghes, &ghes_nmi, list) { + if (!(ghes->flags & GHES_TO_CLEAR)) + continue; + /* Do not print estatus because printk is not NMI safe */ + ghes_do_proc(ghes); + ghes_clear_estatus(ghes); + } + +out: + raw_spin_unlock(&ghes_nmi_lock); + return ret; +} + static struct notifier_block ghes_notifier_sci = { .notifier_call = ghes_notify_sci, }; +static struct notifier_block ghes_notifier_nmi = { + .notifier_call = ghes_notify_nmi, +}; + static int __devinit ghes_probe(struct platform_device *ghes_dev) { struct acpi_hest_generic *generic; @@ -323,18 +514,27 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev) if (!generic->enabled) return -ENODEV; - if (generic->error_block_length < - sizeof(struct acpi_hest_generic_status)) { - pr_warning(FW_BUG GHES_PFX -"Invalid error block length: %u for generic hardware error source: %d\n", - generic->error_block_length, + switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + case ACPI_HEST_NOTIFY_EXTERNAL: + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_NMI: + break; + case ACPI_HEST_NOTIFY_LOCAL: + pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n", generic->header.source_id); goto err; + default: + pr_warning(FW_WARN GHES_PFX "Unknown notification type: %u for generic hardware error source: %d\n", + generic->notify.type, generic->header.source_id); + goto err; } - if (generic->records_to_preallocate == 0) { - pr_warning(FW_BUG GHES_PFX -"Invalid records to preallocate: %u for generic hardware error source: %d\n", - generic->records_to_preallocate, + + rc = -EIO; + if (generic->error_block_length < + sizeof(struct acpi_hest_generic_status)) { + pr_warning(FW_BUG GHES_PFX "Invalid error block length: %u for generic hardware error source: %d\n", + generic->error_block_length, generic->header.source_id); goto err; } @@ -344,38 +544,43 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev) ghes = NULL; goto err; } - if (generic->notify.type == ACPI_HEST_NOTIFY_SCI) { + switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + ghes->timer.function = ghes_poll_func; + ghes->timer.data = (unsigned long)ghes; + init_timer_deferrable(&ghes->timer); + ghes_add_timer(ghes); + break; + case ACPI_HEST_NOTIFY_EXTERNAL: + /* External interrupt vector is GSI */ + if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) { + pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n", + generic->header.source_id); + goto err; + } + if (request_irq(ghes->irq, ghes_irq_func, + 0, "GHES IRQ", ghes)) { + pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n", + generic->header.source_id); + goto err; + } + break; + case ACPI_HEST_NOTIFY_SCI: mutex_lock(&ghes_list_mutex); if (list_empty(&ghes_sci)) register_acpi_hed_notifier(&ghes_notifier_sci); list_add_rcu(&ghes->list, &ghes_sci); mutex_unlock(&ghes_list_mutex); - } else { - unsigned char *notify = NULL; - - switch (generic->notify.type) { - case ACPI_HEST_NOTIFY_POLLED: - notify = "POLL"; - break; - case ACPI_HEST_NOTIFY_EXTERNAL: - case ACPI_HEST_NOTIFY_LOCAL: - notify = "IRQ"; - break; - case ACPI_HEST_NOTIFY_NMI: - notify = "NMI"; - break; - } - if (notify) { - pr_warning(GHES_PFX -"Generic hardware error source: %d notified via %s is not supported!\n", - generic->header.source_id, notify); - } else { - pr_warning(FW_WARN GHES_PFX -"Unknown notification type: %u for generic hardware error source: %d\n", - generic->notify.type, generic->header.source_id); - } - rc = -ENODEV; - goto err; + break; + case ACPI_HEST_NOTIFY_NMI: + mutex_lock(&ghes_list_mutex); + if (list_empty(&ghes_nmi)) + register_die_notifier(&ghes_notifier_nmi); + list_add_rcu(&ghes->list, &ghes_nmi); + mutex_unlock(&ghes_list_mutex); + break; + default: + BUG(); } platform_set_drvdata(ghes_dev, ghes); @@ -396,7 +601,14 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) ghes = platform_get_drvdata(ghes_dev); generic = ghes->generic; + ghes->flags |= GHES_EXITING; switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + del_timer_sync(&ghes->timer); + break; + case ACPI_HEST_NOTIFY_EXTERNAL: + free_irq(ghes->irq, ghes); + break; case ACPI_HEST_NOTIFY_SCI: mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); @@ -404,12 +616,23 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) unregister_acpi_hed_notifier(&ghes_notifier_sci); mutex_unlock(&ghes_list_mutex); break; + case ACPI_HEST_NOTIFY_NMI: + mutex_lock(&ghes_list_mutex); + list_del_rcu(&ghes->list); + if (list_empty(&ghes_nmi)) + unregister_die_notifier(&ghes_notifier_nmi); + mutex_unlock(&ghes_list_mutex); + /* + * To synchronize with NMI handler, ghes can only be + * freed after NMI handler finishes. + */ + synchronize_rcu(); + break; default: BUG(); break; } - synchronize_rcu(); ghes_fini(ghes); kfree(ghes); @@ -429,6 +652,8 @@ static struct platform_driver ghes_platform_driver = { static int __init ghes_init(void) { + int rc; + if (acpi_disabled) return -ENODEV; @@ -437,12 +662,25 @@ static int __init ghes_init(void) return -EINVAL; } - return platform_driver_register(&ghes_platform_driver); + rc = ghes_ioremap_init(); + if (rc) + goto err; + + rc = platform_driver_register(&ghes_platform_driver); + if (rc) + goto err_ioremap_exit; + + return 0; +err_ioremap_exit: + ghes_ioremap_exit(); +err: + return rc; } static void __exit ghes_exit(void) { platform_driver_unregister(&ghes_platform_driver); + ghes_ioremap_exit(); } module_init(ghes_init); diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a88ebb..991bb87a1704 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; +EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/lib/ioremap.c b/lib/ioremap.c index 5730ecd3eb66..da4e2ad74b68 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -90,3 +91,4 @@ int ioremap_page_range(unsigned long addr, return err; } +EXPORT_SYMBOL_GPL(ioremap_page_range); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index eb5cc7d00c5a..816f074fb4e1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1175,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { vunmap_page_range(addr, addr + size); } +EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB -- cgit v1.3-8-gc7d7 From 5fdade95f26372154459347dfb9f60721d22cfc7 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 11 Jan 2011 12:18:12 -0500 Subject: time: Rename misnamed minsec argument of clocks_calc_mult_shift() The minsec argument to clocks_calc_mult_shift() is misnamed. It is used to clamp the magnitude of the mult factor so that a multiplication with any value in the given range won't overflow a 64 bit result. Let's rename it to match the actual usage. Signed-off-by: Nicolas Pitre Acked-by: John Stultz LKML-Reference: Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c18d7efa1b4b..8588abcac07b 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * @shift: pointer to shift variable * @from: frequency to convert from * @to: frequency to convert to - * @minsec: guaranteed runtime conversion range in seconds + * @maxsec: guaranteed runtime conversion range in seconds * * The function evaluates the shift/mult pair for the scaled math * operations of clocksources and clockevents. @@ -122,7 +122,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock * event @to is the counter frequency and @from is NSEC_PER_SEC. * - * The @minsec conversion range argument controls the time frame in + * The @maxsec conversion range argument controls the time frame in * seconds which must be covered by the runtime conversion with the * calculated mult and shift factors. This guarantees that no 64bit * overflow happens when the input value of the conversion is @@ -131,7 +131,7 @@ EXPORT_SYMBOL_GPL(timecounter_cyc2time); * factors. */ void -clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) +clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec) { u64 tmp; u32 sft, sftacc= 32; @@ -140,7 +140,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec) * Calculate the shift factor which is limiting the conversion * range: */ - tmp = ((u64)minsec * from) >> 32; + tmp = ((u64)maxsec * from) >> 32; while (tmp) { tmp >>=1; sftacc--; -- cgit v1.3-8-gc7d7 From afa14e7c553ebe45844d76208f66017a43abd0e2 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 11 Jan 2011 17:59:38 -0600 Subject: timekeeping: Make local variables static Signed-off-by: H Hartley Sweeten Cc: John Stultz LKML-Reference: <0D753D10438DA54287A00B027084269764CE0E54B7@AUSP01VMBX24.collaborationhost.net> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5bb86da82003..eef7452bd8a9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -49,7 +49,7 @@ struct timekeeper { u32 mult; }; -struct timekeeper timekeeper; +static struct timekeeper timekeeper; /** * timekeeper_setup_internals - Set up internals to use clocksource clock. @@ -164,7 +164,7 @@ static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ -struct timespec raw_time; +static struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -- cgit v1.3-8-gc7d7 From 0df6a63f8735a7c8a877878bc215d4312e41ef81 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Dec 2010 13:29:29 -0500 Subject: switch cgroup switching it to s_d_op allows to kill the cgroup_lookup() kludge. Signed-off-by: Al Viro --- kernel/cgroup.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 51cddc11cd85..5c5f4cc2e99a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -763,8 +763,6 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); * -> cgroup_mkdir. */ -static struct dentry *cgroup_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd); static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); @@ -1451,6 +1449,10 @@ static int cgroup_set_super(struct super_block *sb, void *data) static int cgroup_get_rootdir(struct super_block *sb) { + static const struct dentry_operations cgroup_dops = { + .d_iput = cgroup_diput, + }; + struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); struct dentry *dentry; @@ -1468,6 +1470,8 @@ static int cgroup_get_rootdir(struct super_block *sb) return -ENOMEM; } sb->s_root = dentry; + /* for everything else we want ->d_op set */ + sb->s_d_op = &cgroup_dops; return 0; } @@ -2191,7 +2195,7 @@ static const struct file_operations cgroup_file_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = cgroup_lookup, + .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, @@ -2207,26 +2211,6 @@ static inline struct cftype *__file_cft(struct file *file) return __d_cft(file->f_dentry); } -static int cgroup_delete_dentry(const struct dentry *dentry) -{ - return 1; -} - -static struct dentry *cgroup_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - static const struct dentry_operations cgroup_dentry_operations = { - .d_delete = cgroup_delete_dentry, - .d_iput = cgroup_diput, - }; - - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_set_d_op(dentry, &cgroup_dentry_operations); - d_add(dentry, NULL); - return NULL; -} - static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { -- cgit v1.3-8-gc7d7 From 04c6862c055fb687c90d9652f32c11a063df15cf Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Wed, 12 Jan 2011 16:59:30 -0800 Subject: kmsg_dump: add kmsg_dump() calls to the reboot, halt, poweroff and emergency_restart paths We need to know the reason why system rebooted in support service. However, we can't inform our customers of the reason because final messages are lost on current Linux kernel. This patch improves the situation above because the final messages are saved by adding kmsg_dump() to reboot, halt, poweroff and emergency_restart path. Signed-off-by: Seiji Aguchi Cc: David Woodhouse Cc: Marco Stornelli Reviewed-by: Artem Bityutskiy Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmsg_dump.h | 4 ++++ kernel/printk.c | 4 ++++ kernel/sys.c | 6 ++++++ 3 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 24b44145a886..2a0d7d651dc3 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -18,6 +18,10 @@ enum kmsg_dump_reason { KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, KMSG_DUMP_KEXEC, + KMSG_DUMP_RESTART, + KMSG_DUMP_HALT, + KMSG_DUMP_POWEROFF, + KMSG_DUMP_EMERG, }; /** diff --git a/kernel/printk.c b/kernel/printk.c index f64b8997fc76..0b0c9aa71e89 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1539,6 +1539,10 @@ static const char * const kmsg_reasons[] = { [KMSG_DUMP_OOPS] = "oops", [KMSG_DUMP_PANIC] = "panic", [KMSG_DUMP_KEXEC] = "kexec", + [KMSG_DUMP_RESTART] = "restart", + [KMSG_DUMP_HALT] = "halt", + [KMSG_DUMP_POWEROFF] = "poweroff", + [KMSG_DUMP_EMERG] = "emergency_restart", }; static const char *kmsg_to_str(enum kmsg_dump_reason reason) diff --git a/kernel/sys.c b/kernel/sys.c index 2745dcdb6c6c..31b71a276b40 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -43,6 +43,8 @@ #include #include +#include + #include #include #include @@ -285,6 +287,7 @@ out_unlock: */ void emergency_restart(void) { + kmsg_dump(KMSG_DUMP_EMERG); machine_emergency_restart(); } EXPORT_SYMBOL_GPL(emergency_restart); @@ -312,6 +315,7 @@ void kernel_restart(char *cmd) printk(KERN_EMERG "Restarting system.\n"); else printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); machine_restart(cmd); } EXPORT_SYMBOL_GPL(kernel_restart); @@ -333,6 +337,7 @@ void kernel_halt(void) kernel_shutdown_prepare(SYSTEM_HALT); sysdev_shutdown(); printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); machine_halt(); } @@ -351,6 +356,7 @@ void kernel_power_off(void) disable_nonboot_cpus(); sysdev_shutdown(); printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); machine_power_off(); } EXPORT_SYMBOL_GPL(kernel_power_off); -- cgit v1.3-8-gc7d7 From 351f8f8e6499ae4fff40f5e3a8fe16d9e1903646 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Wed, 12 Jan 2011 16:59:39 -0800 Subject: kernel: clean up USE_GENERIC_SMP_HELPERS For arch which needs USE_GENERIC_SMP_HELPERS, it has to select USE_GENERIC_SMP_HELPERS, rather than leaving a choice to user, since they don't provide their own implementions. Also, move on_each_cpu() to kernel/smp.c, it is strange to put it in kernel/softirq.c. For arch which doesn't use USE_GENERIC_SMP_HELPERS, e.g. blackfin, only on_each_cpu() is compiled. Signed-off-by: Amerigo Wang Cc: David Howells Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Yinghai Lu Cc: Peter Zijlstra Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mn10300/Kconfig | 6 +----- arch/x86/Kconfig | 5 +---- kernel/Makefile | 2 +- kernel/smp.c | 19 +++++++++++++++++++ kernel/softirq.c | 19 ------------------- 5 files changed, 22 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 41ba38513c89..8ed41cf2b08d 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -203,6 +203,7 @@ endmenu config SMP bool "Symmetric multi-processing support" default y + select USE_GENERIC_SMP_HELPERS depends on MN10300_PROC_MN2WS0038 || MN10300_PROC_MN2WS0050 ---help--- This enables support for systems with more than one CPU. If you have @@ -226,11 +227,6 @@ config NR_CPUS depends on SMP default "2" -config USE_GENERIC_SMP_HELPERS - bool - depends on SMP - default y - source "kernel/Kconfig.preempt" config MN10300_CURRENT_IN_E2 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b6fccb07123e..8734db3c5830 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -65,6 +65,7 @@ config X86 select HAVE_SPARSE_IRQ select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP + select USE_GENERIC_SMP_HELPERS if SMP config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -203,10 +204,6 @@ config HAVE_INTEL_TXT def_bool y depends on EXPERIMENTAL && DMAR && ACPI -config USE_GENERIC_SMP_HELPERS - def_bool y - depends on SMP - config X86_32_SMP def_bool y depends on X86_32 && SMP diff --git a/kernel/Makefile b/kernel/Makefile index 5669f71dfdd5..353d3fe8ba33 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -43,7 +43,7 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o -obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o +obj-$(CONFIG_SMP) += smp.o ifneq ($(CONFIG_SMP),y) obj-y += up.o endif diff --git a/kernel/smp.c b/kernel/smp.c index 12ed8b013e2d..4ec30e069987 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -13,6 +13,7 @@ #include #include +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS static struct { struct list_head queue; raw_spinlock_t lock; @@ -529,3 +530,21 @@ void ipi_call_unlock_irq(void) { raw_spin_unlock_irq(&call_function.lock); } +#endif /* USE_GENERIC_SMP_HELPERS */ + +/* + * Call a function on all processors + */ +int on_each_cpu(void (*func) (void *info), void *info, int wait) +{ + int ret = 0; + + preempt_disable(); + ret = smp_call_function(func, info, wait); + local_irq_disable(); + func(info); + local_irq_enable(); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c index 0823778f87fc..68eb5efec388 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -885,25 +885,6 @@ static __init int spawn_ksoftirqd(void) } early_initcall(spawn_ksoftirqd); -#ifdef CONFIG_SMP -/* - * Call a function on all processors - */ -int on_each_cpu(void (*func) (void *info), void *info, int wait) -{ - int ret = 0; - - preempt_disable(); - ret = smp_call_function(func, info, wait); - local_irq_disable(); - func(info); - local_irq_enable(); - preempt_enable(); - return ret; -} -EXPORT_SYMBOL(on_each_cpu); -#endif - /* * [ These __weak aliases are kept in a separate compilation unit, so that * GCC does not inline them incorrectly. ] -- cgit v1.3-8-gc7d7 From 455cd5ab305c90ffc422dd2e0fb634730942b257 Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Wed, 12 Jan 2011 16:59:41 -0800 Subject: kptr_restrict for hiding kernel pointers from unprivileged users Add the %pK printk format specifier and the /proc/sys/kernel/kptr_restrict sysctl. The %pK format specifier is designed to hide exposed kernel pointers, specifically via /proc interfaces. Exposing these pointers provides an easy target for kernel write vulnerabilities, since they reveal the locations of writable structures containing easily triggerable function pointers. The behavior of %pK depends on the kptr_restrict sysctl. If kptr_restrict is set to 0, no deviation from the standard %p behavior occurs. If kptr_restrict is set to 1, the default, if the current user (intended to be a reader via seq_printf(), etc.) does not have CAP_SYSLOG (currently in the LSM tree), kernel pointers using %pK are printed as 0's. If kptr_restrict is set to 2, kernel pointers using %pK are printed as 0's regardless of privileges. Replacing with 0's was chosen over the default "(null)", which cannot be parsed by userland %p, which expects "(nil)". [akpm@linux-foundation.org: check for IRQ context when !kptr_restrict, save an indent level, s/WARN/WARN_ONCE/] [akpm@linux-foundation.org: coding-style fixup] [randy.dunlap@oracle.com: fix kernel/sysctl.c warning] Signed-off-by: Dan Rosenberg Signed-off-by: Randy Dunlap Cc: James Morris Cc: Eric Dumazet Cc: Thomas Graf Cc: Eugene Teo Cc: Kees Cook Cc: Ingo Molnar Cc: David S. Miller Cc: Peter Zijlstra Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 14 ++++++++++++++ include/linux/printk.h | 1 + kernel/sysctl.c | 10 ++++++++++ lib/vsprintf.c | 22 ++++++++++++++++++++++ 4 files changed, 47 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 574067194f38..11d5ceda5bb0 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -34,6 +34,7 @@ show up in /proc/sys/kernel: - hotplug - java-appletviewer [ binfmt_java, obsolete ] - java-interpreter [ binfmt_java, obsolete ] +- kptr_restrict - kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] - modprobe ==> Documentation/debugging-modules.txt @@ -261,6 +262,19 @@ This flag controls the L2 cache of G3 processor boards. If ============================================================== +kptr_restrict: + +This toggle indicates whether restrictions are placed on +exposing kernel addresses via /proc and other interfaces. When +kptr_restrict is set to (0), there are no restrictions. When +kptr_restrict is set to (1), the default, kernel pointers +printed using the %pK format specifier will be replaced with 0's +unless the user has CAP_SYSLOG. When kptr_restrict is set to +(2), kernel pointers printed using %pK will be replaced with 0's +regardless of privileges. + +============================================================== + kstack_depth_to_print: (X86 only) Controls the number of words to print when dumping the raw diff --git a/include/linux/printk.h b/include/linux/printk.h index b772ca5fbdf0..9adfba6ec28a 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -83,6 +83,7 @@ extern bool printk_timed_ratelimit(unsigned long *caller_jiffies, extern int printk_delay_msec; extern int dmesg_restrict; +extern int kptr_restrict; /* * Print a one-time message (analogous to WARN_ONCE() et al): diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ae5cbb1e3ced..c6811ee2092b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -710,6 +711,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, #endif { .procname = "ngroups_max", diff --git a/lib/vsprintf.c b/lib/vsprintf.c index c150d3dafff4..6ff38524ec16 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -936,6 +936,8 @@ char *uuid_string(char *buf, char *end, const u8 *addr, return string(buf, end, uuid, spec); } +int kptr_restrict = 1; + /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format @@ -979,6 +981,7 @@ char *uuid_string(char *buf, char *end, const u8 *addr, * Implements a "recursive vsnprintf". * Do not use this feature without some mechanism to verify the * correctness of the format string and va_list arguments. + * - 'K' For a kernel pointer that should be hidden from unprivileged users * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a @@ -1035,6 +1038,25 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return buf + vsnprintf(buf, end - buf, ((struct va_format *)ptr)->fmt, *(((struct va_format *)ptr)->va)); + case 'K': + /* + * %pK cannot be used in IRQ context because its test + * for CAP_SYSLOG would be meaningless. + */ + if (in_irq() || in_serving_softirq() || in_nmi()) { + if (spec.field_width == -1) + spec.field_width = 2 * sizeof(void *); + return string(buf, end, "pK-error", spec); + } else if ((kptr_restrict == 0) || + (kptr_restrict == 1 && + has_capability_noaudit(current, CAP_SYSLOG))) + break; + + if (spec.field_width == -1) { + spec.field_width = 2 * sizeof(void *); + spec.flags |= ZEROPAD; + } + return number(buf, end, 0, spec); } spec.flags |= SMALL; if (spec.field_width == -1) { -- cgit v1.3-8-gc7d7 From fb842b00c5eab66ec361b31550aa8a922745ce9e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 12 Jan 2011 16:59:43 -0800 Subject: printk: use RCU to prevent potential lock contention in kmsg_dump dump_list_lock is used to protect dump_list in kmsg_dumper implementation, kmsg_dump() uses it to traverse dump_list too. But if there is contention on the lock, kmsg_dump() will fail, and the valuable kernel message may be lost. This patch solves this issue with RCU. Because kmsg_dump() only read the list, no lock is needed in kmsg_dump(). So that kmsg_dump() will never fail because of lock contention. Signed-off-by: Huang Ying Cc: "Paul E. McKenney" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 34 +++++++--------------------------- 1 file changed, 7 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 0b0c9aa71e89..53d9a9ec88e6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -1502,7 +1503,7 @@ int kmsg_dump_register(struct kmsg_dumper *dumper) /* Don't allow registering multiple times */ if (!dumper->registered) { dumper->registered = 1; - list_add_tail(&dumper->list, &dump_list); + list_add_tail_rcu(&dumper->list, &dump_list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); @@ -1526,33 +1527,16 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper) spin_lock_irqsave(&dump_list_lock, flags); if (dumper->registered) { dumper->registered = 0; - list_del(&dumper->list); + list_del_rcu(&dumper->list); err = 0; } spin_unlock_irqrestore(&dump_list_lock, flags); + synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(kmsg_dump_unregister); -static const char * const kmsg_reasons[] = { - [KMSG_DUMP_OOPS] = "oops", - [KMSG_DUMP_PANIC] = "panic", - [KMSG_DUMP_KEXEC] = "kexec", - [KMSG_DUMP_RESTART] = "restart", - [KMSG_DUMP_HALT] = "halt", - [KMSG_DUMP_POWEROFF] = "poweroff", - [KMSG_DUMP_EMERG] = "emergency_restart", -}; - -static const char *kmsg_to_str(enum kmsg_dump_reason reason) -{ - if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0) - return "unknown"; - - return kmsg_reasons[reason]; -} - /** * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping @@ -1591,13 +1575,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) l2 = chars; } - if (!spin_trylock_irqsave(&dump_list_lock, flags)) { - printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n", - kmsg_to_str(reason)); - return; - } - list_for_each_entry(dumper, &dump_list, list) + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) dumper->dump(dumper, reason, s1, l1, s2, l2); - spin_unlock_irqrestore(&dump_list_lock, flags); + rcu_read_unlock(); } #endif -- cgit v1.3-8-gc7d7 From 34e49d4f635d6a800c4089c40fd254e12e451449 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Jan 2011 17:00:30 -0800 Subject: fs/proc/base.c, kernel/latencytop.c: convert sprintf_symbol() to %ps Use temporary lr for struct latency_record for improved readability and fewer columns used. Removed trailing space from output. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 22 ++++++++-------------- kernel/latencytop.c | 23 +++++++++-------------- 2 files changed, 17 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index b20962c71a52..336b79803e82 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -373,24 +373,18 @@ static int lstats_show_proc(struct seq_file *m, void *v) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < 32; i++) { - if (task->latency_record[i].backtrace[0]) { + struct latency_record *lr = &task->latency_record[i]; + if (lr->backtrace[0]) { int q; - seq_printf(m, "%i %li %li ", - task->latency_record[i].count, - task->latency_record[i].time, - task->latency_record[i].max); + seq_printf(m, "%i %li %li", + lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_SYMBOL_LEN]; - char *c; - if (!task->latency_record[i].backtrace[q]) + unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (task->latency_record[i].backtrace[q] == ULONG_MAX) + if (bt == ULONG_MAX) break; - sprint_symbol(sym, task->latency_record[i].backtrace[q]); - c = strchr(sym, '+'); - if (c) - *c = 0; - seq_printf(m, "%s ", sym); + seq_printf(m, " %ps", (void *)bt); } seq_printf(m, "\n"); } diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 17110a4a4fc2..ee74b35e528d 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -241,24 +241,19 @@ static int lstats_show(struct seq_file *m, void *v) seq_puts(m, "Latency Top version : v0.1\n"); for (i = 0; i < MAXLR; i++) { - if (latency_record[i].backtrace[0]) { + struct latency_record *lr = &latency_record[i]; + + if (lr->backtrace[0]) { int q; - seq_printf(m, "%i %lu %lu ", - latency_record[i].count, - latency_record[i].time, - latency_record[i].max); + seq_printf(m, "%i %lu %lu", + lr->count, lr->time, lr->max); for (q = 0; q < LT_BACKTRACEDEPTH; q++) { - char sym[KSYM_SYMBOL_LEN]; - char *c; - if (!latency_record[i].backtrace[q]) + unsigned long bt = lr->backtrace[q]; + if (!bt) break; - if (latency_record[i].backtrace[q] == ULONG_MAX) + if (bt == ULONG_MAX) break; - sprint_symbol(sym, latency_record[i].backtrace[q]); - c = strchr(sym, '+'); - if (c) - *c = 0; - seq_printf(m, "%s ", sym); + seq_printf(m, " %ps", (void *)bt); } seq_printf(m, "\n"); } -- cgit v1.3-8-gc7d7 From 556105000334cb440636ef61b862d22b03c24f70 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 12 Jan 2011 17:00:45 -0800 Subject: sysctl: fix #ifdef guard comment Signed-off-by: Jovi Zhang Acked-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c6811ee2092b..a05605a4c369 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2909,7 +2909,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } } -#else /* CONFIG_PROC_FS */ +#else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2961,7 +2961,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, } -#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_PROC_SYSCTL */ /* * No sense putting this after each symbol definition, twice, -- cgit v1.3-8-gc7d7 From e020e742e5dbd8c44d31706995dc13ddc732e274 Mon Sep 17 00:00:00 2001 From: Jovi Zhang Date: Wed, 12 Jan 2011 17:00:45 -0800 Subject: sysctl: remove obsolete comments ctl_unnumbered.txt have been removed in Documentation directory so just also remove this invalid comments [akpm@linux-foundation.org: fix Documentation/sysctl/00-INDEX, per Dave] Signed-off-by: Jovi Zhang Cc: Dave Young Acked-by: WANG Cong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/00-INDEX | 2 -- kernel/sysctl.c | 17 ----------------- 2 files changed, 19 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/00-INDEX b/Documentation/sysctl/00-INDEX index 1286f455992f..8cf5d493fd03 100644 --- a/Documentation/sysctl/00-INDEX +++ b/Documentation/sysctl/00-INDEX @@ -4,8 +4,6 @@ README - general information about /proc/sys/ sysctl files. abi.txt - documentation for /proc/sys/abi/*. -ctl_unnumbered.txt - - explanation of why one should not add new binary sysctl numbers. fs.txt - documentation for /proc/sys/fs/*. kernel.txt diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a05605a4c369..bc86bb32e126 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -246,10 +246,6 @@ static struct ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -972,10 +968,6 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -1336,11 +1328,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif - -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; @@ -1496,10 +1483,6 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, -/* - * NOTE: do not add new entries to this table unless you have read - * Documentation/sysctl/ctl_unnumbered.txt - */ { } }; -- cgit v1.3-8-gc7d7 From 6164281ab7a4d3bd42588d6b25984e960a2e032f Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 12 Jan 2011 17:00:46 -0800 Subject: user_ns: improve the user_ns on-the-slab packaging Currently on 64-bit arch the user_namespace is 2096 and when being kmalloc-ed it resides on a 4k slab wasting 2003 bytes. If we allocate a separate cache for it and reduce the hash size from 128 to 64 chains the packaging becomes *much* better - the struct is 1072 bytes and the hole between is 98 bytes. [akpm@linux-foundation.org: s/__initcall/module_init/] Signed-off-by: Pavel Emelyanov Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/user_namespace.h | 2 +- kernel/user_namespace.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8178156711f9..faf467944baf 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -6,7 +6,7 @@ #include #include -#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 8) +#define UIDHASH_BITS (CONFIG_BASE_SMALL ? 3 : 7) #define UIDHASH_SZ (1 << UIDHASH_BITS) struct user_namespace { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 25915832291a..9da289c34f22 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -12,6 +12,8 @@ #include #include +static struct kmem_cache *user_ns_cachep __read_mostly; + /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the @@ -26,7 +28,7 @@ int create_user_ns(struct cred *new) struct user_struct *root_user; int n; - ns = kmalloc(sizeof(struct user_namespace), GFP_KERNEL); + ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); if (!ns) return -ENOMEM; @@ -38,7 +40,7 @@ int create_user_ns(struct cred *new) /* Alloc new root user. */ root_user = alloc_uid(ns, 0); if (!root_user) { - kfree(ns); + kmem_cache_free(user_ns_cachep, ns); return -ENOMEM; } @@ -71,7 +73,7 @@ static void free_user_ns_work(struct work_struct *work) struct user_namespace *ns = container_of(work, struct user_namespace, destroyer); free_uid(ns->creator); - kfree(ns); + kmem_cache_free(user_ns_cachep, ns); } void free_user_ns(struct kref *kref) @@ -126,3 +128,10 @@ gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t /* No useful relationship so no mapping */ return overflowgid; } + +static __init int user_namespaces_init(void) +{ + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + return 0; +} +module_init(user_namespaces_init); -- cgit v1.3-8-gc7d7 From 9ab020cf07e457a8b425bf5af17e704f90f86d8b Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 12 Jan 2011 17:00:48 -0800 Subject: taskstats: use better ifdef for alignment Commit 4be2c95d ("taskstats: pad taskstats netlink response for aligment issues on ia64") added a null field to align the taskstats structure but the discussion centered around ia64. The issue exists on other platforms with inefficient unaligned access and adding them piecemeal would be an unmaintainable mess. This patch uses Dave Miller's suggestion of using a combination of CONFIG_64BIT && !CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS to determine whether alignment is needed. Note that this will cause breakage on those platforms with applications like iotop which had hard-coded offsets into the packet to access the taskstats structure. The message seen on systems without the alignment fixes looks like: kernel unaligned access to 0xe000023879dca9bc, ip=0xa000000100133d10 The addresses may vary but resolve to locations inside __delayacct_add_tsk. iotop makes what I'd call unreasonable assumptions about the contents of a netlink genetlink packet containing generic attributes. They're typed and have headers that specify value lengths, so the client can (should) identify and skip the ones the client doesn't understand. The kernel, as of version 2.6.36, presented a packet like so: +--------------------------------+ | genlmsghdr - 4 bytes | +--------------------------------+ | NLA header - 4 bytes | /* Aggregate header */ +-+------------------------------+ | | NLA header - 4 bytes | /* PID header */ | +------------------------------+ | | pid/tgid - 4 bytes | | +------------------------------+ | | NLA header - 4 bytes | /* stats header */ | + -----------------------------+ <- oops. aligned on 4 byte boundary | | struct taskstats - 328 bytes | +-+------------------------------+ The iotop code expects that the kernel will behave as it did then, assuming that the packet format is set in stone. The format is set in stone, but the packet offsets are not. There's nothing in the packet format that guarantees that the packet will be sent in exactly the same way. The attribute contents are set (or versioned) and the aggregate contents are set but they can be anywhere in the packet. The issue here isn't that an unaligned structure gets passed to userspace, it's that the NLA infrastructure has something of a weakness: The 4 byte attribute header may force the payload to be unaligned. The taskstats structure is created at an unaligned location and then 64-bit values are operated on inside the kernel, so the unaligned access warnings gets spewed everywhere. It's possible to use the unaligned access API to operate on the structure in the kernel but it seems like a wasted effort to work around userspace code that isn't following the packet format. Any new additions would also need the be worked around. It's a maintenance nightmare. The conclusion of the earlier discussion seemed to be "ok fine, if we have to break it, don't break it on arches that don't have the problem." Dave pointed out that the unaligned access problem doesn't only exist on ia64, but also on other 64-bit arches that don't have efficient unaligned access and it should be fixed there as well. The committed version of the patch and this addition keep with the conclusion of that discussion not to break it unnecessarily, which the pid padding and the packet padding fixes did do. x86_64 and powerpc don't suffer this problem so they shouldn't suffer the solution. Other 64-bit architectures do and will, though. Signed-off-by: Jeff Mahoney Reported-by: David S. Miller Acked-by: David S. Miller Cc: Dan Carpenter Cc: Balbir Singh Cc: Florian Mickler Cc: Guillaume Chazarain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 69691eb4b715..3971c6b9d58d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -348,7 +348,7 @@ static int parse(struct nlattr *na, struct cpumask *mask) return ret; } -#ifdef CONFIG_IA64 +#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define TASKSTATS_NEEDS_PADDING 1 #endif -- cgit v1.3-8-gc7d7 From 025b40abe715d638e60516a657d354e8560c1a85 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:56 -0800 Subject: ntp: add hardpps implementation This commit adds hardpps() implementation based upon the original one from the NTPv4 reference kernel code from David Mills. However, it is highly optimized towards very fast syncronization and maximum stickness to PPS signal. The typical error is less then a microsecond. To make it sync faster I had to throw away exponential phase filter so that the full phase offset is corrected immediately. Then I also had to throw away median phase filter because it gives a bigger error itself if used without exponential filter. Maybe we will find an appropriate filtering scheme in the future but it's not necessary if the signal quality is ok. Signed-off-by: Alexander Gordeev Acked-by: John Stultz Cc: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/pps/Kconfig | 9 ++ include/linux/timex.h | 1 + kernel/time/ntp.c | 425 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 420 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/drivers/pps/Kconfig b/drivers/pps/Kconfig index 1afe4e03440f..0ad5ff38bfec 100644 --- a/drivers/pps/Kconfig +++ b/drivers/pps/Kconfig @@ -30,6 +30,15 @@ config PPS_DEBUG messages to the system log. Select this if you are having a problem with PPS support and want to see more of what is going on. +config NTP_PPS + bool "PPS kernel consumer support" + depends on PPS && !NO_HZ + help + This option adds support for direct in-kernel time + syncronization using an external PPS signal. + + It doesn't work on tickless systems at the moment. + source drivers/pps/clients/Kconfig endmenu diff --git a/include/linux/timex.h b/include/linux/timex.h index 32d852f8cbe4..d23999f9499d 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -268,6 +268,7 @@ extern u64 tick_length; extern void second_overflow(void); extern void update_ntp_one_tick(void); extern int do_adjtimex(struct timex *); +extern void hardpps(const struct timespec *, const struct timespec *); int read_current_timer(unsigned long *timer_val); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d2321891538f..5c00242fa921 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -14,6 +14,7 @@ #include #include #include +#include /* * NTP timekeeping variables: @@ -74,6 +75,162 @@ static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. + */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. + */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) + && !(time_status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((time_status & STA_PPSFREQ) + && (time_status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if (!(time_status & STA_NANO)) + txc->jitter /= NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + /* * NTP methods: */ @@ -185,6 +342,9 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + + /* Clear PPS state variables */ + pps_clear(); } /* @@ -250,16 +410,16 @@ void second_overflow(void) time_status |= STA_UNSYNC; } - /* - * Compute the phase adjustment for the next second. The offset is - * reduced by a fixed factor times the time constant. - */ + /* Compute the phase adjustment for the next second */ tick_length = tick_length_base; - delta = shift_right(time_offset, SHIFT_PLL + time_constant); + delta = ntp_offset_chunk(time_offset); time_offset -= delta; tick_length += delta; + /* Check PPS signal */ + pps_dec_valid(); + if (!time_adjust) return; @@ -369,6 +529,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + /* restart PPS frequency calibration */ + pps_reset_freq_interval(); } /* @@ -418,6 +580,8 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); + /* update pps_freq */ + pps_set_freq(time_freq); } if (txc->modes & ADJ_MAXERROR) @@ -508,7 +672,8 @@ int do_adjtimex(struct timex *txc) } result = time_state; /* mostly `TIME_OK' */ - if (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* check for errors */ + if (is_error_status(time_status)) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * @@ -522,15 +687,8 @@ int do_adjtimex(struct timex *txc) txc->tick = tick_usec; txc->tai = time_tai; - /* PPS is not implemented, so these are zero */ - txc->ppsfreq = 0; - txc->jitter = 0; - txc->shift = 0; - txc->stabil = 0; - txc->jitcnt = 0; - txc->calcnt = 0; - txc->errcnt = 0; - txc->stbcnt = 0; + /* fill PPS status fields */ + pps_fill_timex(txc); write_sequnlock_irq(&xtime_lock); @@ -544,6 +702,243 @@ int do_adjtimex(struct timex *txc) return result; } +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + __kernel_time_t sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. + */ +static inline void pps_dec_freq_interval(void) +{ + if (--pps_intcnt <= -PPS_INTCOUNT) { + pps_intcnt = -PPS_INTCOUNT; + if (pps_shift > PPS_INTMIN) { + pps_shift--; + pps_intcnt = 0; + } + } +} + +/* increase frequency calibration interval length. + * It is doubled after four consecutive stable intervals. + */ +static inline void pps_inc_freq_interval(void) +{ + if (++pps_intcnt >= PPS_INTCOUNT) { + pps_intcnt = PPS_INTCOUNT; + if (pps_shift < PPS_INTMAX) { + pps_shift++; + pps_intcnt = 0; + } + } +} + +/* update clock frequency based on MONOTONIC_RAW clock PPS signal + * timestamps + * + * At the end of the calibration interval the difference between the + * first and last MONOTONIC_RAW clock timestamps divided by the length + * of the interval becomes the frequency update. If the interval was + * too long, the data are discarded. + * Returns the difference between old and new frequency values. + */ +static long hardpps_update_freq(struct pps_normtime freq_norm) +{ + long delta, delta_mod; + s64 ftemp; + + /* check if the frequency interval was too long */ + if (freq_norm.sec > (2 << pps_shift)) { + time_status |= STA_PPSERROR; + pps_errcnt++; + pps_dec_freq_interval(); + pr_err("hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); + return 0; + } + + /* here the raw frequency offset and wander (stability) is + * calculated. If the wander is less than the wander threshold + * the interval is increased; otherwise it is decreased. + */ + ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, + freq_norm.sec); + delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); + pps_freq = ftemp; + if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { + pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); + time_status |= STA_PPSWANDER; + pps_stbcnt++; + pps_dec_freq_interval(); + } else { /* good sample */ + pps_inc_freq_interval(); + } + + /* the stability metric is calculated as the average of recent + * frequency changes, but is used only for performance + * monitoring + */ + delta_mod = delta; + if (delta_mod < 0) + delta_mod = -delta_mod; + pps_stabil += (div_s64(((s64)delta_mod) << + (NTP_SCALE_SHIFT - SHIFT_USEC), + NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; + + /* if enabled, the system clock frequency is updated */ + if ((time_status & STA_PPSFREQ) != 0 && + (time_status & STA_FREQHOLD) == 0) { + time_freq = pps_freq; + ntp_update_frequency(); + } + + return delta; +} + +/* correct REALTIME clock phase error against PPS signal */ +static void hardpps_update_phase(long error) +{ + long correction = -error; + long jitter; + + /* add the sample to the median filter */ + pps_phase_filter_add(correction); + correction = pps_phase_filter_get(&jitter); + + /* Nominal jitter is due to PPS signal noise. If it exceeds the + * threshold, the sample is discarded; otherwise, if so enabled, + * the time offset is updated. + */ + if (jitter > (pps_jitter << PPS_POPCORN)) { + pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); + time_status |= STA_PPSJITTER; + pps_jitcnt++; + } else if (time_status & STA_PPSTIME) { + /* correct the time using the phase offset */ + time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, + NTP_INTERVAL_FREQ); + /* cancel running adjtime() */ + time_adjust = 0; + } + /* update jitter */ + pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; +} + +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS signal arrival in order to + * discipline the CPU clock oscillator to the PPS signal. It takes two + * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former + * is used to correct clock phase error and the latter is used to + * correct the frequency. + * + * This code is based on David Mills's reference nanokernel + * implementation. It was mostly rewritten but keeps the same idea. + */ +void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) +{ + struct pps_normtime pts_norm, freq_norm; + unsigned long flags; + + pts_norm = pps_normalize_ts(*phase_ts); + + write_seqlock_irqsave(&xtime_lock, flags); + + /* clear the error bits, they will be set again if needed */ + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + + /* indicate signal presence */ + time_status |= STA_PPSSIGNAL; + pps_valid = PPS_VALID; + + /* when called for the first time, + * just start the frequency interval */ + if (unlikely(pps_fbase.tv_sec == 0)) { + pps_fbase = *raw_ts; + write_sequnlock_irqrestore(&xtime_lock, flags); + return; + } + + /* ok, now we have a base for frequency calculation */ + freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase)); + + /* check that the signal is in the range + * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ + if ((freq_norm.sec == 0) || + (freq_norm.nsec > MAXFREQ * freq_norm.sec) || + (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { + time_status |= STA_PPSJITTER; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + write_sequnlock_irqrestore(&xtime_lock, flags); + pr_err("hardpps: PPSJITTER: bad pulse\n"); + return; + } + + /* signal is ok */ + + /* check if the current frequency interval is finished */ + if (freq_norm.sec >= (1 << pps_shift)) { + pps_calcnt++; + /* restart the frequency calibration interval */ + pps_fbase = *raw_ts; + hardpps_update_freq(freq_norm); + } + + hardpps_update_phase(pts_norm.nsec); + + write_sequnlock_irqrestore(&xtime_lock, flags); +} +EXPORT_SYMBOL(hardpps); + +#endif /* CONFIG_NTP_PPS */ + static int __init ntp_tick_adj_setup(char *str) { ntp_tick_adj = simple_strtol(str, NULL, 0); -- cgit v1.3-8-gc7d7 From e2c18e49a0d4f822ffc29fb4958943beb1ff08b7 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 12 Jan 2011 17:00:57 -0800 Subject: pps: capture MONOTONIC_RAW timestamps as well MONOTONIC_RAW clock timestamps are ideally suited for frequency calculation and also fit well into the original NTP hardpps design. Now phase and frequency can be adjusted separately: the former based on REALTIME clock and the latter based on MONOTONIC_RAW clock. A new function getnstime_raw_and_real is added to timekeeping subsystem to capture both timestamps at the same time and atomically. Signed-off-by: Alexander Gordeev Acked-by: John Stultz Cc: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pps_kernel.h | 14 ++++++++++++++ include/linux/time.h | 2 ++ kernel/time/timekeeping.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) (limited to 'kernel') diff --git a/include/linux/pps_kernel.h b/include/linux/pps_kernel.h index 1aedf50088cf..94048547f29a 100644 --- a/include/linux/pps_kernel.h +++ b/include/linux/pps_kernel.h @@ -47,6 +47,9 @@ struct pps_source_info { }; struct pps_event_time { +#ifdef CONFIG_NTP_PPS + struct timespec ts_raw; +#endif /* CONFIG_NTP_PPS */ struct timespec ts_real; }; @@ -97,10 +100,21 @@ static inline void timespec_to_pps_ktime(struct pps_ktime *kt, kt->nsec = ts.tv_nsec; } +#ifdef CONFIG_NTP_PPS + +static inline void pps_get_ts(struct pps_event_time *ts) +{ + getnstime_raw_and_real(&ts->ts_raw, &ts->ts_real); +} + +#else /* CONFIG_NTP_PPS */ + static inline void pps_get_ts(struct pps_event_time *ts) { getnstimeofday(&ts->ts_real); } +#endif /* CONFIG_NTP_PPS */ + #endif /* LINUX_PPS_KERNEL_H */ diff --git a/include/linux/time.h b/include/linux/time.h index 9f15ac7ab92a..1e6d3b59238d 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -158,6 +158,8 @@ extern unsigned int alarm_setitimer(unsigned int seconds); extern int do_getitimer(int which, struct itimerval *value); extern void getnstimeofday(struct timespec *tv); extern void getrawmonotonic(struct timespec *ts); +extern void getnstime_raw_and_real(struct timespec *ts_raw, + struct timespec *ts_real); extern void getboottime(struct timespec *ts); extern void monotonic_to_bootbased(struct timespec *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5bb86da82003..5536aaf3ba36 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -288,6 +288,49 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); +#ifdef CONFIG_NTP_PPS + +/** + * getnstime_raw_and_real - get day and raw monotonic time in timespec format + * @ts_raw: pointer to the timespec to be set to raw monotonic time + * @ts_real: pointer to the timespec to be set to the time of day + * + * This function reads both the time of day and raw monotonic time at the + * same time atomically and stores the resulting timestamps in timespec + * format. + */ +void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) +{ + unsigned long seq; + s64 nsecs_raw, nsecs_real; + + WARN_ON_ONCE(timekeeping_suspended); + + do { + u32 arch_offset; + + seq = read_seqbegin(&xtime_lock); + + *ts_raw = raw_time; + *ts_real = xtime; + + nsecs_raw = timekeeping_get_ns_raw(); + nsecs_real = timekeeping_get_ns(); + + /* If arch requires, add in gettimeoffset() */ + arch_offset = arch_gettimeoffset(); + nsecs_raw += arch_offset; + nsecs_real += arch_offset; + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts_raw, nsecs_raw); + timespec_add_ns(ts_real, nsecs_real); +} +EXPORT_SYMBOL(getnstime_raw_and_real); + +#endif /* CONFIG_NTP_PPS */ + /** * do_gettimeofday - Returns the time of day in a timeval * @tv: pointer to the timeval to be set -- cgit v1.3-8-gc7d7 From 6c9ae009b298753a3baf71298d676a68b5a10c8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 13 Jan 2011 15:45:38 -0800 Subject: irq: use per_cpu kstat_irqs Use modern per_cpu API to increment {soft|hard}irq counters, and use per_cpu allocation for (struct irq_desc)->kstats_irq instead of an array. This gives better SMP/NUMA locality and saves few instructions per irq. With small nr_cpuids values (8 for example), kstats_irq was a small array (less than L1_CACHE_BYTES), potentially source of false sharing. In the !CONFIG_SPARSE_IRQ case, remove the huge, NUMA/cache unfriendly kstat_irqs_all[NR_IRQS][NR_CPUS] array. Note: we still populate kstats_irq for all possible irqs in early_irq_init(). We probably could use on-demand allocations. (Code included in alloc_descs()). Problem is not all IRQS are used with a prior alloc_descs() call. kstat_irqs_this_cpu() is not used anymore, remove it. Signed-off-by: Eric Dumazet Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Andi Kleen Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/irqdesc.h | 2 +- include/linux/kernel_stat.h | 19 +++++++++---------- kernel/irq/irqdesc.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 40 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 979c68cc7458..6a64c6fa81af 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -57,7 +57,7 @@ struct irq_desc { #endif struct timer_rand_state *timer_rand_state; - unsigned int *kstat_irqs; + unsigned int __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status; /* IRQ status */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 44e83ba12b5b..0cce2db580c3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -46,16 +46,14 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); #ifndef CONFIG_GENERIC_HARDIRQS -#define kstat_irqs_this_cpu(irq) \ - (this_cpu_read(kstat.irqs[irq]) struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) { - kstat_this_cpu.irqs[irq]++; - kstat_this_cpu.irqs_sum++; + __this_cpu_inc(kstat.irqs[irq]); + __this_cpu_inc(kstat.irqs_sum); } static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) @@ -65,17 +63,18 @@ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) #else #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#define kstat_incr_irqs_this_cpu(irqno, DESC) do {\ - ((DESC)->kstat_irqs[smp_processor_id()]++);\ - kstat_this_cpu.irqs_sum++; } while (0) + +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ +do { \ + __this_cpu_inc(*(DESC)->kstat_irqs); \ + __this_cpu_inc(kstat.irqs_sum); \ +} while (0) #endif static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { - kstat_this_cpu.softirqs[irq]++; + __this_cpu_inc(kstat.softirqs[irq]); } static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 9988d03797f5..282f20230e67 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -72,6 +72,8 @@ static inline int desc_node(struct irq_desc *desc) { return 0; } static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) { + int cpu; + desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; @@ -83,7 +85,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) desc->irq_count = 0; desc->irqs_unhandled = 0; desc->name = NULL; - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; desc_smp_init(desc, node); } @@ -133,8 +136,7 @@ static struct irq_desc *alloc_desc(int irq, int node) if (!desc) return NULL; /* allocate based on nr_cpu_ids */ - desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), - gfp, node); + desc->kstat_irqs = alloc_percpu(unsigned int); if (!desc->kstat_irqs) goto err_desc; @@ -149,7 +151,7 @@ static struct irq_desc *alloc_desc(int irq, int node) return desc; err_kstat: - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); err_desc: kfree(desc); return NULL; @@ -166,7 +168,7 @@ static void free_desc(unsigned int irq) mutex_unlock(&sparse_irq_lock); free_masks(desc); - kfree(desc->kstat_irqs); + free_percpu(desc->kstat_irqs); kfree(desc); } @@ -234,7 +236,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; int __init early_irq_init(void) { int count, i, node = first_online_node; @@ -250,7 +251,8 @@ int __init early_irq_init(void) for (i = 0; i < count; i++) { desc[i].irq_data.irq = i; desc[i].irq_data.chip = &no_irq_chip; - desc[i].kstat_irqs = kstat_irqs_all[i]; + /* TODO : do this allocation on-demand ... */ + desc[i].kstat_irqs = alloc_percpu(unsigned int); alloc_masks(desc + i, GFP_KERNEL, node); desc_smp_init(desc + i, node); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); @@ -275,6 +277,22 @@ static void free_desc(unsigned int irq) static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) { +#if defined(CONFIG_KSTAT_IRQS_ONDEMAND) + struct irq_desc *desc; + unsigned int i; + + for (i = 0; i < cnt; i++) { + desc = irq_to_desc(start + i); + if (desc && !desc->kstat_irqs) { + unsigned int __percpu *stats = alloc_percpu(unsigned int); + + if (!stats) + return -1; + if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL) + free_percpu(stats); + } + } +#endif return start; } #endif /* !CONFIG_SPARSE_IRQ */ @@ -391,7 +409,9 @@ void dynamic_irq_cleanup(unsigned int irq) unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); - return desc ? desc->kstat_irqs[cpu] : 0; + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; } #ifdef CONFIG_GENERIC_HARDIRQS @@ -401,10 +421,10 @@ unsigned int kstat_irqs(unsigned int irq) int cpu; int sum = 0; - if (!desc) + if (!desc || !desc->kstat_irqs) return 0; for_each_possible_cpu(cpu) - sum += desc->kstat_irqs[cpu]; + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; } #endif /* CONFIG_GENERIC_HARDIRQS */ -- cgit v1.3-8-gc7d7 From 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 13 Jan 2011 15:45:40 -0800 Subject: sched: remove long deprecated CLONE_STOPPED flag This warning was added in commit bdff746a3915 ("clone: prepare to recycle CLONE_STOPPED") three years ago. 2.6.26 came and went. As far as I know, no-one is actually using CLONE_STOPPED. Signed-off-by: Dave Jones Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 3 ++- kernel/fork.c | 28 +--------------------------- 2 files changed, 3 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 96e23215e276..07402530fc70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -21,7 +21,8 @@ #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ -#define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) + and is now available for re-use. */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ #define CLONE_NEWUSER 0x10000000 /* New user namespace */ diff --git a/kernel/fork.c b/kernel/fork.c index d9b44f20b6b0..76a1fdd80bdf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1409,23 +1409,6 @@ long do_fork(unsigned long clone_flags, return -EPERM; } - /* - * We hope to recycle these flags after 2.6.26 - */ - if (unlikely(clone_flags & CLONE_STOPPED)) { - static int __read_mostly count = 100; - - if (count > 0 && printk_ratelimit()) { - char comm[TASK_COMM_LEN]; - - count--; - printk(KERN_INFO "fork(): process `%s' used deprecated " - "clone flags 0x%lx\n", - get_task_comm(comm, current), - clone_flags & CLONE_STOPPED); - } - } - /* * When called from kernel_thread, don't do user tracing stuff. */ @@ -1464,16 +1447,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - if (unlikely(clone_flags & CLONE_STOPPED)) { - /* - * We'll start up with an immediate SIGSTOP. - */ - sigaddset(&p->pending.signal, SIGSTOP); - set_tsk_thread_flag(p, TIF_SIGPENDING); - __set_task_state(p, TASK_STOPPED); - } else { - wake_up_new_task(p, clone_flags); - } + wake_up_new_task(p, clone_flags); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); -- cgit v1.3-8-gc7d7 From dabb16f639820267b3850d804571c70bd93d4e07 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Thu, 13 Jan 2011 15:46:05 -0800 Subject: oom: allow a non-CAP_SYS_RESOURCE proces to oom_score_adj down We'd like to be able to oom_score_adj a process up/down as it enters/leaves the foreground. Currently, it is not possible to oom_adj down without CAP_SYS_RESOURCE. This patch allows a task to decrease its oom_score_adj back to the value that a CAP_SYS_RESOURCE thread set it to or its inherited value at fork. Assuming the thread that has forked it has oom_score_adj of 0, each process could decrease it back from 0 upon activation unless a CAP_SYS_RESOURCE thread elevated it to something higher. Alternative considered: * a setuid binary * a daemon with CAP_SYS_RESOURCE Since you don't wan't all processes to be able to reduce their oom_adj, a setuid or daemon implementation would be complex. The alternatives also have much higher overhead. This patch updated from original patch based on feedback from David Rientjes. Signed-off-by: Mandeep Singh Baines Acked-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 4 ++++ fs/proc/base.c | 4 +++- include/linux/sched.h | 2 ++ kernel/fork.c | 1 + 4 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index ef757fca470b..23cae6548d3a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1323,6 +1323,10 @@ scaled linearly with /proc//oom_score_adj. Writing to /proc//oom_score_adj or /proc//oom_adj will change the other with its scaled value. +The value of /proc//oom_score_adj may be reduced no lower than the last +value set by a CAP_SYS_RESOURCE process. To reduce the value any lower +requires CAP_SYS_RESOURCE. + NOTICE: /proc//oom_adj is deprecated and will be removed, please see Documentation/feature-removal-schedule.txt. diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f1cdd5d3d7..9d096e82b201 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1151,7 +1151,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, goto err_task_lock; } - if (oom_score_adj < task->signal->oom_score_adj && + if (oom_score_adj < task->signal->oom_score_adj_min && !capable(CAP_SYS_RESOURCE)) { err = -EACCES; goto err_sighand; @@ -1164,6 +1164,8 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, atomic_dec(&task->mm->oom_disable_count); } task->signal->oom_score_adj = oom_score_adj; + if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) + task->signal->oom_score_adj_min = oom_score_adj; /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/include/linux/sched.h b/include/linux/sched.h index 07402530fc70..f23b5bb6f52e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -634,6 +634,8 @@ struct signal_struct { int oom_adj; /* OOM kill score adjustment (bit shift) */ int oom_score_adj; /* OOM kill score adjustment */ + int oom_score_adj_min; /* OOM kill score adjustment minimum value. + * Only settable by CAP_SYS_RESOURCE. */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations diff --git a/kernel/fork.c b/kernel/fork.c index 76a1fdd80bdf..1499607e4da2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -910,6 +910,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; + sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); -- cgit v1.3-8-gc7d7 From a5b338f2b0b1ff73ae20c66ab831201549eaec01 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:34 -0800 Subject: thp: update futex compound knowledge Futex code is smarter than most other gup_fast O_DIRECT code and knows about the compound internals. However now doing a put_page(head_page) will not release the pin on the tail page taken by gup-fast, leading to all sort of refcounting bugchecks. Getting a stable head_page is a little tricky. page_head = page is there because if this is not a tail page it's also the page_head. Only in case this is a tail page, compound_head is called, otherwise it's guaranteed unnecessary. And if it's a tail page compound_head has to run atomically inside irq disabled section __get_user_pages_fast before returning. Otherwise ->first_page won't be a stable pointer. Disableing irq before __get_user_page_fast and releasing irq after running compound_head is needed because if __get_user_page_fast returns == 1, it means the huge pmd is established and cannot go away from under us. pmdp_splitting_flush_notify in __split_huge_page_splitting will have to wait for local_irq_enable before the IPI delivery can return. This means __split_huge_page_refcount can't be running from under us, and in turn when we run compound_head(page) we're not reading a dangling pointer from tailpage->first_page. Then after we get to stable head page, we are always safe to call compound_lock and after taking the compound lock on head page we can finally re-check if the page returned by gup-fast is still a tail page. in which case we're set and we didn't need to split the hugepage in order to take a futex on it. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 3019b92e6917..52075633373f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -233,7 +233,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page; + struct page *page, *page_head; int err; /* @@ -265,11 +265,46 @@ again: if (err < 0) return err; - page = compound_head(page); - lock_page(page); - if (!page->mapping) { - unlock_page(page); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page_head = page; + if (unlikely(PageTail(page))) { put_page(page); + /* serialize against __split_huge_page_splitting() */ + local_irq_disable(); + if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) { + page_head = compound_head(page); + /* + * page_head is valid pointer but we must pin + * it before taking the PG_lock and/or + * PG_compound_lock. The moment we re-enable + * irqs __split_huge_page_splitting() can + * return and the head page can be freed from + * under us. We can't take the PG_lock and/or + * PG_compound_lock on a page that could be + * freed from under us. + */ + if (page != page_head) { + get_page(page_head); + put_page(page); + } + local_irq_enable(); + } else { + local_irq_enable(); + goto again; + } + } +#else + page_head = compound_head(page); + if (page != page_head) { + get_page(page_head); + put_page(page); + } +#endif + + lock_page(page_head); + if (!page_head->mapping) { + unlock_page(page_head); + put_page(page_head); goto again; } @@ -280,20 +315,20 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page)) { + if (PageAnon(page_head)) { key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page->mapping->host; - key->shared.pgoff = page->index; + key->shared.inode = page_head->mapping->host; + key->shared.pgoff = page_head->index; } get_futex_key_refs(key); - unlock_page(page); - put_page(page); + unlock_page(page_head); + put_page(page_head); return 0; } -- cgit v1.3-8-gc7d7 From e7a00c45f29c0155007aa150bf231a70fa470365 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:45 -0800 Subject: thp: add pmd_huge_pte to mm_struct This increase the size of the mm struct a bit but it is needed to preallocate one pte for each hugepage so that split_huge_page will not require a fail path. Guarantee of success is a fundamental property of split_huge_page to avoid decrasing swapping reliability and to avoid adding -ENOMEM fail paths that would otherwise force the hugepage-unaware VM code to learn rolling back in the middle of its pte mangling operations (if something we need it to learn handling pmd_trans_huge natively rather being capable of rollback). When split_huge_page runs a pte is needed to succeed the split, to map the newly splitted regular pages with a regular pte. This way all existing VM code remains backwards compatible by just adding a split_huge_page* one liner. The memory waste of those preallocated ptes is negligible and so it is worth it. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 3 +++ kernel/fork.c | 7 +++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bb7288a782fd..26bc4e2cd275 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -309,6 +309,9 @@ struct mm_struct { #endif #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif /* How many tasks sharing this mm are OOM_DISABLE */ atomic_t oom_disable_count; diff --git a/kernel/fork.c b/kernel/fork.c index 1499607e4da2..f78f50ba6cb2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -529,6 +529,9 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -669,6 +672,10 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->token_priority = 0; mm->last_interval = 0; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + mm->pmd_huge_pte = NULL; +#endif + if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.3-8-gc7d7 From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: khugepaged Add khugepaged to relocate fragmented pages into hugepages if new hugepages become available. (this is indipendent of the defrag logic that will have to make new hugepages available) The fundamental reason why khugepaged is unavoidable, is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge-pmd in the other virtual machines that may be running on fragmented memory, to maximize the CPU efficiency at all times. The scan is slow, it takes nearly zero cpu time, except when it copies data (in which case it means we definitely want to pay for that cpu time) so it seems a good tradeoff. In addition to the hugepages being released by other process releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselfs to be long lived mappings (khugepaged scan is slow so short lived mappings have low probability to run into khugepaged if compared to long lived mappings). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 66 +++ include/linux/sched.h | 1 + kernel/fork.c | 5 + mm/huge_memory.c | 1073 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 1136 insertions(+), 10 deletions(-) create mode 100644 include/linux/khugepaged.h (limited to 'kernel') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d9ab70d776e2..43a694ef8904 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,6 +25,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, #ifdef CONFIG_DEBUG_VM TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, #endif diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h new file mode 100644 index 000000000000..552f3184756c --- /dev/null +++ b/include/linux/khugepaged.h @@ -0,0 +1,66 @@ +#ifndef _LINUX_KHUGEPAGED_H +#define _LINUX_KHUGEPAGED_H + +#include /* MMF_VM_HUGEPAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_exit(struct mm_struct *mm); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); + +#define khugepaged_enabled() \ + (transparent_hugepage_flags & \ + ((1<flags)) + return __khugepaged_enter(mm); + return 0; +} + +static inline void khugepaged_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + __khugepaged_exit(mm); +} + +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) + if (khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) + if (__khugepaged_enter(vma->vm_mm)) + return -ENOMEM; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} +static inline void khugepaged_exit(struct mm_struct *mm) +{ +} +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + return 0; +} +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f23b5bb6f52e..d747f948b34e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -434,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/fork.c b/kernel/fork.c index f78f50ba6cb2..25e429152ddc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -328,6 +329,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); if (retval) goto out; @@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7101112a5429..ae2bf08b1099 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,14 +12,111 @@ #include #include #include +#include +#include +#include #include #include #include "internal.h" +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. + */ unsigned long transparent_hugepage_flags __read_mostly = - (1< 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + return ret; } static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); @@ -153,20 +260,212 @@ static struct attribute *hugepage_attr[] = { static struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, - .name = "transparent_hugepage", +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static int __init hugepage_init(void) { -#ifdef CONFIG_SYSFS int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; - err = sysfs_create_group(mm_kobj, &hugepage_attr_group); - if (err) - printk(KERN_ERR "hugepage: register sysfs failed\n"); + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } #endif - return 0; + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) { + khugepaged_slab_free(); + goto out; + } + + start_khugepaged(); + +out: + return err; } module_init(hugepage_init) @@ -285,6 +584,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; page = alloc_hugepage(transparent_hugepage_defrag(vma)); if (unlikely(!page)) goto out; @@ -941,6 +1242,758 @@ int hugepage_madvise(unsigned long *vm_flags) return 0; } +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. + */ + return 0; + if (vma->vm_file || vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. + */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage) +{ + struct vm_area_struct *vma; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(!*hpage); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + goto out; + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto out; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. + */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + pte_unmap(pte); + + if (unlikely(!isolated)) { + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + mem_cgroup_uncharge_page(new_page); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + + *hpage = NULL; + khugepaged_pages_collapsed++; +out: + up_write(&mm->mmap_sem); +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) { + up_read(&mm->mmap_sem); + collapse_huge_page(mm, address, hpage); + } +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. + * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if (!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) { + progress++; + continue; + } + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { + khugepaged_scan.address = vma->vm_end; + progress++; + continue; + } + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) { + progress++; + continue; + } + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + if (khugepaged_scan.address > hend) { + khugepaged_scan.address = hend + HPAGE_PMD_SIZE; + progress++; + continue; + } + BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. + */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) + break; + } + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } + } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} + +static void khugepaged_loop(void) +{ + struct page *hpage; + + while (likely(khugepaged_enabled())) { + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) + break; + + khugepaged_do_scan(&hpage); + if (hpage) + put_page(hpage); + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_interruptible(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; -- cgit v1.3-8-gc7d7 From 3ec762ad8be364c2fadfe0d6b2cc6d4d3b5e1b54 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 14 Jan 2011 11:34:34 +0800 Subject: cgroups: Fix a lockdep warning at cgroup removal Commit 2fd6b7f5 ("fs: dcache scale subdirs") forgot to annotate a dentry lock, which caused a lockdep warning. Reported-by: Valdis Kletnieks Signed-off-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 51cddc11cd85..a7837e2d9d6b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -912,7 +912,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry) parent = dentry->d_parent; spin_lock(&parent->d_lock); - spin_lock(&dentry->d_lock); + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); list_del_init(&dentry->d_u.d_child); spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); -- cgit v1.3-8-gc7d7 From c072a388d59a1d48e36864d0e66f42d71745be1c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 7 Jan 2011 02:33:47 -0800 Subject: rcu: demote SRCU_SYNCHRONIZE_DELAY from kernel-parameter status Because the adaptive synchronize_srcu_expedited() approach has worked very well in testing, remove the kernel parameter and replace it by a C-preprocessor macro. If someone finds problems with this approach, a more complex and aggressively adaptive approach might be required. Longer term, SRCU will be merged with the other RCU implementations, at which point synchronize_srcu_expedited() will be event driven, just as synchronize_sched_expedited() currently is. At that point, there will be no need for this adaptive approach. Reported-by: Linus Torvalds Signed-off-by: Paul E. McKenney --- init/Kconfig | 15 --------------- kernel/srcu.c | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 526ec1c7456a..e11bc793a91d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -497,21 +497,6 @@ config RCU_BOOST_DELAY Accept the default if unsure. -config SRCU_SYNCHRONIZE_DELAY - int "Microseconds to delay before waiting for readers" - range 0 20 - default 10 - help - This option controls how long SRCU delays before entering its - loop waiting on SRCU readers. The purpose of this loop is - to avoid the unconditional context-switch penalty that would - otherwise be incurred if there was an active SRCU reader, - in a manner similar to adaptive locking schemes. This should - be set to be a bit longer than the common-case SRCU read-side - critical-section overhead. - - Accept the default if unsure. - endmenu # "RCU Subsystem" config IKCONFIG diff --git a/kernel/srcu.c b/kernel/srcu.c index 98d8c1e80edb..73ce23feaea9 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -155,6 +155,16 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx) } EXPORT_SYMBOL_GPL(__srcu_read_unlock); +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after 10 microseconds, + * we repeatedly block for 1-millisecond time periods. This approach + * has done well in testing, so there is no need for a config parameter. + */ +#define SYNCHRONIZE_SRCU_READER_DELAY 10 + /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ @@ -207,11 +217,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) * will have finished executing. We initially give readers * an arbitrarily chosen 10 microseconds to get out of their * SRCU read-side critical sections, then loop waiting 1/HZ - * seconds per iteration. + * seconds per iteration. The 10-microsecond value has done + * very well in testing. */ if (srcu_readers_active_idx(sp, idx)) - udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); + udelay(SYNCHRONIZE_SRCU_READER_DELAY); while (srcu_readers_active_idx(sp, idx)) schedule_timeout_interruptible(1); -- cgit v1.3-8-gc7d7 From b24efdfdf679cf9b05947c531971905fc727dd40 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jan 2011 14:18:11 -0800 Subject: rcu: avoid pointless blocked-task warnings If the RCU callback-processing kthread has nothing to do, it parks in a wait_event(). If RCU remains idle for more than two minutes, the kernel complains about this. This commit changes from wait_event() to wait_event_interruptible() to prevent the kernel from complaining just because RCU is idle. Reported-by: Russell King Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Thomas Weber Tested-by: Russell King --- kernel/rcutiny.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 034493724749..0c343b9a46d5 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -189,7 +189,8 @@ static int rcu_kthread(void *arg) unsigned long flags; for (;;) { - wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); + wait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); morework = rcu_boost(); local_irq_save(flags); work = have_rcu_kthread_work; -- cgit v1.3-8-gc7d7 From c72a04e34735ec3f19f4788b7f95017310b5e1eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Jan 2011 05:31:45 +0000 Subject: cgroup_fs: fix cgroup use of simple_lookup() cgroup can't use simple_lookup(), since that'd override its desired ->d_op. Tested-by: Li Zefan Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5c5f4cc2e99a..ffb7bbad0638 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -764,6 +764,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); */ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); +static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); static const struct inode_operations cgroup_dir_inode_operations; @@ -860,6 +861,11 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) iput(inode); } +static int cgroup_delete(const struct dentry *d) +{ + return 1; +} + static void remove_dir(struct dentry *d) { struct dentry *parent = dget(d->d_parent); @@ -1451,6 +1457,7 @@ static int cgroup_get_rootdir(struct super_block *sb) { static const struct dentry_operations cgroup_dops = { .d_iput = cgroup_diput, + .d_delete = cgroup_delete, }; struct inode *inode = @@ -2195,12 +2202,20 @@ static const struct file_operations cgroup_file_operations = { }; static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, + .lookup = cgroup_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .rename = cgroup_rename, }; +static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) +{ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + d_add(dentry, NULL); + return NULL; +} + /* * Check if a file is a control file */ -- cgit v1.3-8-gc7d7 From 7f85803a26f304e698c673838aab06cc6d4d6e59 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 14 Jan 2011 17:49:02 +0800 Subject: tracing: Remove syscall_exit_fields There is no need for syscall_exit_fields as the syscall exit event class can already host the fields in its structure, like most other trace events do by default. Use that default behavior instead. Following this scheme, we don't need anymore to override the get_fields() callback of the syscall exit event class either. Hence both syscall_exit_fields and syscall_get_exit_fields() can be removed. Also changed some indentation to keep the following under 80 characters: ".fields = LIST_HEAD_INIT(event_class_syscall_exit.fields)," Acked-by: Frederic Weisbecker Signed-off-by: Lai Jiangshan LKML-Reference: <4D301C0E.8090408@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index bac752f0cfb5..b706529b4fc7 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -23,9 +23,6 @@ static int syscall_exit_register(struct ftrace_event_call *event, static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); -/* All syscall exit events have the same fields */ -static LIST_HEAD(syscall_exit_fields); - static struct list_head * syscall_get_enter_fields(struct ftrace_event_call *call) { @@ -34,34 +31,28 @@ syscall_get_enter_fields(struct ftrace_event_call *call) return &entry->enter_fields; } -static struct list_head * -syscall_get_exit_fields(struct ftrace_event_call *call) -{ - return &syscall_exit_fields; -} - struct trace_event_functions enter_syscall_print_funcs = { - .trace = print_syscall_enter, + .trace = print_syscall_enter, }; struct trace_event_functions exit_syscall_print_funcs = { - .trace = print_syscall_exit, + .trace = print_syscall_exit, }; struct ftrace_event_class event_class_syscall_enter = { - .system = "syscalls", - .reg = syscall_enter_register, - .define_fields = syscall_enter_define_fields, - .get_fields = syscall_get_enter_fields, - .raw_init = init_syscall_trace, + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, }; struct ftrace_event_class event_class_syscall_exit = { - .system = "syscalls", - .reg = syscall_exit_register, - .define_fields = syscall_exit_define_fields, - .get_fields = syscall_get_exit_fields, - .raw_init = init_syscall_trace, + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, }; extern unsigned long __start_syscalls_metadata[]; -- cgit v1.3-8-gc7d7 From 977dda7c9b540f48b228174346d8b31542c1e99f Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Fri, 14 Jan 2011 17:57:50 -0800 Subject: sched: Update effective_load() to use global share weights Previously effective_load would approximate the global load weight present on a group taking advantage of: entity_weight = tg->shares ( lw / global_lw ), where entity_weight was provided by tg_shares_up. This worked (approximately) for an 'empty' (at tg level) cpu since we would place boost load representative of what a newly woken task would receive. However, now that load is instantaneously updated this assumption is no longer true and the load calculation is rather incorrect in this case. Fix this (and improve the general case) by re-writing effective_load to take advantage of the new shares distribution code. Signed-off-by: Paul Turner Signed-off-by: Peter Zijlstra LKML-Reference: <20110115015817.069769529@google.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c62ebae65cf0..414145cf5344 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1362,27 +1362,27 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg) return wl; for_each_sched_entity(se) { - long S, rw, s, a, b; + long lw, w; - S = se->my_q->tg->shares; - s = se->load.weight; - rw = se->my_q->load.weight; + tg = se->my_q->tg; + w = se->my_q->load.weight; - a = S*(rw + wl); - b = S*rw + s*wg; + /* use this cpu's instantaneous contribution */ + lw = atomic_read(&tg->load_weight); + lw -= se->my_q->load_contribution; + lw += w + wg; - wl = s*(a-b); + wl += w; - if (likely(b)) - wl /= b; + if (lw > 0 && wl < lw) + wl = (wl * tg->shares) / lw; + else + wl = tg->shares; - /* - * Assume the group is already running and will - * thus already be accounted for in the weight. - * - * That is, moving shares between CPUs, does not - * alter the group weight. - */ + /* zero point is MIN_SHARES */ + if (wl < MIN_SHARES) + wl = MIN_SHARES; + wl -= se->load.weight; wg = 0; } -- cgit v1.3-8-gc7d7 From efe25c2c7b3a5d17b0c70987a758d8fe7af8e3d1 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 11 Jan 2011 15:41:54 +0530 Subject: sched: Reinstate group names in /proc/sched_debug Displaying of group names in /proc/sched_debug was dropped in autogroup patches. Add group names while displaying cfs_rq and tasks information. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra LKML-Reference: <20110111101153.GE4772@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1dfae3d014b5..4d36f3726da7 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -16,6 +16,8 @@ #include #include +static DEFINE_SPINLOCK(sched_debug_lock); + /* * This allows printing both to /proc/sched_debug and * to the console @@ -86,6 +88,23 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group } #endif +#ifdef CONFIG_CGROUP_SCHED +static char group_path[PATH_MAX]; + +static char *task_group_path(struct task_group *tg) +{ + /* + * May be NULL if the underlying cgroup isn't fully-created yet + */ + if (!tg->css.cgroup) { + group_path[0] = '\0'; + return group_path; + } + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; +} +#endif + static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { @@ -108,6 +127,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); #endif +#ifdef CONFIG_CGROUP_SCHED + SEQ_printf(m, " %s", task_group_path(task_group(p))); +#endif SEQ_printf(m, "\n"); } @@ -144,7 +166,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) struct sched_entity *last; unsigned long flags; +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +#else SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); +#endif SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", SPLIT_NS(cfs_rq->exec_clock)); @@ -191,7 +217,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { +#ifdef CONFIG_RT_GROUP_SCHED + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif #define P(x) \ SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) @@ -212,6 +242,7 @@ extern __read_mostly int sched_clock_running; static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; #ifdef CONFIG_X86 { @@ -266,10 +297,14 @@ static void print_cpu(struct seq_file *m, int cpu) #undef P #endif + spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); print_rt_stats(m, cpu); + rcu_read_lock(); print_rq(m, rq, cpu); + rcu_read_unlock(); + spin_unlock_irqrestore(&sched_debug_lock, flags); } static const char *sched_tunable_scaling_names[] = { -- cgit v1.3-8-gc7d7 From 8ecedd7a06d27a31dbb36fab88e2ba6e6edd43ca Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 11 Jan 2011 15:42:57 +0530 Subject: sched: Display autogroup names in /proc/sched_debug Add autogroup name to cfs_rq and tasks information to /proc/sched_debug. Signed-off-by: Bharata B Rao Signed-off-by: Peter Zijlstra LKML-Reference: <20110111101257.GF4772@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_autogroup.c | 5 +++++ kernel/sched_debug.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 32a723b8f84c..938d52f80a2d 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -231,6 +231,11 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) #ifdef CONFIG_SCHED_DEBUG static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) { + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (!enabled || !tg->autogroup) + return 0; + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); } #endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 4d36f3726da7..e4d37259d490 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -93,6 +93,9 @@ static char group_path[PATH_MAX]; static char *task_group_path(struct task_group *tg) { + if (autogroup_path(tg, group_path, PATH_MAX)) + return group_path; + /* * May be NULL if the underlying cgroup isn't fully-created yet */ -- cgit v1.3-8-gc7d7 From f44937718ce3b8360f72f6c68c9481712517a867 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 13 Jan 2011 04:54:50 +0100 Subject: sched, autogroup: Fix CONFIG_RT_GROUP_SCHED sched_setscheduler() failure If CONFIG_RT_GROUP_SCHED is set, __sched_setscheduler() fails due to autogroup not allocating rt_runtime. Free unused/unusable rt_se and rt_rq, redirect RT tasks to the root task group, and tell __sched_setscheduler() that it's ok. Reported-and-tested-by: Bharata B Rao Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1294890890.8089.39.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- kernel/sched_autogroup.c | 27 +++++++++++++++++++++++++++ kernel/sched_autogroup.h | 4 ++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a0eb0941fa84..6cbff6bd1a60 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4871,7 +4871,8 @@ recheck: * assigned. */ if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) { + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { __task_rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); return -EPERM; diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c index 938d52f80a2d..9fb656283157 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched_autogroup.c @@ -27,6 +27,11 @@ static inline void autogroup_destroy(struct kref *kref) { struct autogroup *ag = container_of(kref, struct autogroup, kref); +#ifdef CONFIG_RT_GROUP_SCHED + /* We've redirected RT tasks to the root task group... */ + ag->tg->rt_se = NULL; + ag->tg->rt_rq = NULL; +#endif sched_destroy_group(ag->tg); } @@ -55,6 +60,10 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) return ag; } +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg); +#endif + static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -72,6 +81,19 @@ static inline struct autogroup *autogroup_create(void) init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Autogroup RT tasks are redirected to the root task group + * so we don't have to move tasks around upon policy change, + * or flail around trying to allocate bandwidth on the fly. + * A bandwidth exception in __sched_setscheduler() allows + * the policy change to proceed. Thereafter, task_group() + * returns &root_task_group, so zero bandwidth is required. + */ + free_rt_sched_group(tg); + tg->rt_se = root_task_group.rt_se; + tg->rt_rq = root_task_group.rt_rq; +#endif tg->autogroup = ag; return ag; @@ -106,6 +128,11 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return tg != &root_task_group && tg->autogroup; +} + static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) { diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h index 5358e241cb20..7b859ffe5dad 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched_autogroup.h @@ -15,6 +15,10 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg); static inline void autogroup_init(struct task_struct *init_task) { } static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} static inline struct task_group * autogroup_task_group(struct task_struct *p, struct task_group *tg) -- cgit v1.3-8-gc7d7 From fce2097983d914ea8c2338efc6f6e9bb737f6ae4 Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Fri, 14 Jan 2011 15:57:39 +0800 Subject: sched: Replace rq->bkl_count with rq->rq_sched_info.bkl_count Now rq->rq_sched_info.bkl_count is not used for rq, scroll rq->bkl_count into it. Thus we can save some space for rq. Signed-off-by: Yong Zhang Signed-off-by: Peter Zijlstra LKML-Reference: <1294991859-13246-1-git-send-email-yong.zhang0@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +---- kernel/sched_debug.c | 4 +++- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6cbff6bd1a60..0a169a85eb3e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -553,9 +553,6 @@ struct rq { /* try_to_wake_up() stats */ unsigned int ttwu_count; unsigned int ttwu_local; - - /* BKL stats */ - unsigned int bkl_count; #endif }; @@ -3887,7 +3884,7 @@ static inline void schedule_debug(struct task_struct *prev) schedstat_inc(this_rq(), sched_count); #ifdef CONFIG_SCHEDSTATS if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), bkl_count); + schedstat_inc(this_rq(), rq_sched_info.bkl_count); schedstat_inc(prev, sched_info.bkl_count); } #endif diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e4d37259d490..eb6cb8edd075 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -296,9 +296,11 @@ static void print_cpu(struct seq_file *m, int cpu) P(ttwu_count); P(ttwu_local); - P(bkl_count); + SEQ_printf(m, " .%-30s: %d\n", "bkl_count", + rq->rq_sched_info.bkl_count); #undef P +#undef P64 #endif spin_lock_irqsave(&sched_debug_lock, flags); print_cfs_stats(m, cpu); -- cgit v1.3-8-gc7d7 From d7d8294415f0ce4254827d4a2a5ee88b00be52a8 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 5 Jan 2011 05:41:17 +0100 Subject: sched: Fix signed unsigned comparison in check_preempt_tick() Signed unsigned comparison may lead to superfluous resched if leftmost is right of the current task, wasting a few cycles, and inadvertently _lengthening_ the current task's slice. Reported-by: Venkatesh Pallipadi Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra LKML-Reference: <1294202477.9384.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 414145cf5344..77e9166d7bbf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1062,6 +1062,9 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se = __pick_next_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; + if (delta < 0) + return; + if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } -- cgit v1.3-8-gc7d7 From c5ed5145591774bd9a2960ba4ca45a02fc70aad1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 17 Jan 2011 13:45:37 +0100 Subject: perf: Fix contexted inheritance Linus reported that the RCU lockdep annotation bits triggered for this rcu_dereference() because we're not holding rcu_read_lock(). Going over the code I cannot convince myself its correct: - holding a ref on the parent_ctx, doesn't avoid it being uncloned concurrently (as the comment says), so we can race with a free. - holding parent_ctx->mutex doesn't avoid the above free from taking place either, it would at best avoid parent_ctx from being freed. I.e. the warning is correct. To fix the bug, serialize against the unclone_ctx() call by extending the reach of the parent_ctx->lock. Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Paul E. McKenney LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index b782b7a79f00..76be4c7bf08e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6494,7 +6494,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) raw_spin_lock_irqsave(&parent_ctx->lock, flags); parent_ctx->rotate_disable = 0; - raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); child_ctx = child->perf_event_ctxp[ctxn]; @@ -6502,12 +6501,11 @@ int perf_event_init_context(struct task_struct *child, int ctxn) /* * Mark the child context as a clone of the parent * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of events and the generation - * count can't have changed since we took the mutex. + * + * Note that if the parent is a clone, the holding of + * parent_ctx->lock avoids it from being uncloned. */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + cloned_ctx = parent_ctx->parent_ctx; if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; child_ctx->parent_gen = parent_ctx->parent_gen; @@ -6518,6 +6516,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn) get_ctx(child_ctx->parent_ctx); } + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); -- cgit v1.3-8-gc7d7 From 22a4ec729017ba613337a28f306f94ba5023fe2e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 18 Jan 2011 17:10:08 +0100 Subject: perf: Find_get_context: fix the per-cpu-counter check If task == NULL, find_get_context() should always check that cpu is correct. Afaics, the bug was introduced by 38a81da2 "perf events: Clean up pid passing", but even before that commit "&& cpu != -1" was not exactly right, -ESRCH from find_task_by_vpid() is not accurate. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Alan Stern Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Prasad Cc: Roland McGrath Cc: gregkh@suse.de Cc: stable@kernel.org LKML-Reference: <20110118161008.GB693@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 76be4c7bf08e..a962b1962eee 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2228,7 +2228,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) unsigned long flags; int ctxn, err; - if (!task && cpu != -1) { + if (!task) { /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); -- cgit v1.3-8-gc7d7 From 66832eb4baaaa9abe4c993ddf9113a79e39b9915 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 18 Jan 2011 17:10:32 +0100 Subject: perf: Validate cpu early in perf_event_alloc() Starting from perf_event_alloc()->perf_init_event(), the kernel assumes that event->cpu is either -1 or the valid CPU number. Change perf_event_alloc() to validate this argument early. This also means we can remove the similar check in find_get_context(). Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Alan Stern Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Prasad Cc: Roland McGrath Cc: gregkh@suse.de Cc: stable@kernel.org LKML-Reference: <20110118161032.GC693@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a962b1962eee..67d9bd70b746 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2233,9 +2233,6 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); - if (cpu < 0 || cpu >= nr_cpumask_bits) - return ERR_PTR(-EINVAL); - /* * We could be clever and allow to attach a event to an * offline CPU and activate it when the CPU comes up, but @@ -5541,6 +5538,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct hw_perf_event *hwc; long err; + if ((unsigned)cpu >= nr_cpu_ids) { + if (!task || cpu != -1) + return ERR_PTR(-EINVAL); + } + event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); @@ -5589,7 +5591,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) overflow_handler = parent_event->overflow_handler; - + event->overflow_handler = overflow_handler; if (attr->disabled) -- cgit v1.3-8-gc7d7 From 068c5cc5ac7414a8e9eb7856b4bf3cc4d4744267 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Jan 2011 12:26:11 +0100 Subject: sched, cgroup: Use exit hook to avoid use-after-free crash By not notifying the controller of the on-exit move back to init_css_set, we fail to move the task out of the previous cgroup's cfs_rq. This leads to an opportunity for a cgroup-destroy to come in and free the cgroup (there are no active tasks left in it after all) to which the not-quite dead task is still enqueued. Reported-by: Miklos Vajna Fixed-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Cc: Mike Galbraith Signed-off-by: Ingo Molnar LKML-Reference: <1293206353.29444.205.camel@laptop> --- kernel/sched.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0a169a85eb3e..fa5272a0932c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -606,6 +606,9 @@ static inline struct task_group *task_group(struct task_struct *p) struct task_group *tg; struct cgroup_subsys_state *css; + if (p->flags & PF_EXITING) + return &root_task_group; + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, lockdep_is_held(&task_rq(p)->lock)); tg = container_of(css, struct task_group, css); @@ -8880,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } } +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + sched_move_task(task); +} + #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) @@ -8952,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .destroy = cpu_cgroup_destroy, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, .early_init = 1, -- cgit v1.3-8-gc7d7 From 490da40d82b31c0562d3f5edb37810f492ca1c34 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 19 Jan 2011 10:51:44 +0800 Subject: blktrace: Don't output messages if NOTIFY isn't set. Now if we enable blktrace, cfq has too many messages output to the trace buffer. It is fine if we don't specify any action mask. But if I do like this: blktrace /dev/sdb -a issue -a complete -o - | blkparse -i - I only want to see 'D' and 'C', while with the following command dd if=/mnt/ocfs2/test of=/dev/null bs=4k count=1 iflag=direct I will get(with a 2.6.37 vanilla kernel): 8,16 0 0 0.000000000 0 m N cfq3805 alloced 8,16 0 0 0.000004126 0 m N cfq3805 insert_request 8,16 0 0 0.000004884 0 m N cfq3805 add_to_rr 8,16 0 0 0.000008417 0 m N cfq workload slice:300 8,16 0 0 0.000009557 0 m N cfq3805 set_active wl_prio:0 wl_type:2 8,16 0 0 0.000010640 0 m N cfq3805 fifo= (null) 8,16 0 0 0.000011193 0 m N cfq3805 dispatch_insert 8,16 0 0 0.000012221 0 m N cfq3805 dispatched a request 8,16 0 0 0.000012802 0 m N cfq3805 activate rq, drv=1 8,16 0 1 0.000013181 3805 D R 114759 + 8 [dd] 8,16 0 2 0.000164244 0 C R 114759 + 8 [0] 8,16 0 0 0.000167997 0 m N cfq3805 complete rqnoidle 0 8,16 0 0 0.000168782 0 m N cfq3805 set_slice=100 8,16 0 0 0.000169874 0 m N cfq3805 arm_idle: 8 group_idle: 0 8,16 0 0 0.000170189 0 m N cfq schedule dispatch 8,16 0 0 0.000397938 0 m N cfq3805 slice expired t=0 8,16 0 0 0.000399763 0 m N cfq3805 sl_used=1 disp=1 charge=1 iops=0 sect=8 8,16 0 0 0.000400227 0 m N cfq3805 del_from_rr 8,16 0 0 0.000400882 0 m N cfq3805 put_queue See, there are 19 lines while I only need 2. I don't think it is appropriate for a user. So this patch will disable any messages if the BLK_TC_NOTIFY isn't set. Now the output for the same command will look like: 8,16 0 1 0.000000000 4908 D R 114759 + 8 [dd] 8,16 0 2 0.000146827 0 C R 114759 + 8 [0] Yes, it is what I want to see. Cc: Steven Rostedt Cc: Jeff Moyer Signed-off-by: Tao Ma Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 153562d0b93c..d95721f33702 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -138,6 +138,13 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) !blk_tracer_enabled)) return; + /* + * If the BLK_TC_NOTIFY action mask isn't set, don't send any note + * message to the trace. + */ + if (!(bt->act_mask & BLK_TC_NOTIFY)) + return; + local_irq_save(flags); buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); va_start(args, fmt); -- cgit v1.3-8-gc7d7 From dbe08d82ce3967ccdf459f7951d02589cf967300 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 19 Jan 2011 19:22:07 +0100 Subject: perf: Fix find_get_context() vs perf_event_exit_task() race find_get_context() must not install the new perf_event_context if the task has already passed perf_event_exit_task(). If nothing else, this means the memory leak. Initially ctx->refcount == 2, it is supposed that perf_event_exit_task_context() should participate and do the necessary put_ctx(). find_lively_task_by_vpid() checks PF_EXITING but this buys nothing, by the time we call find_get_context() this task can be already dead. To the point, cmpxchg() can succeed when the task has already done the last schedule(). Change find_get_context() to populate task->perf_event_ctxp[] under task->perf_event_mutex, this way we can trust PF_EXITING because perf_event_exit_task() takes the same mutex. Also, change perf_event_exit_task_context() to use rcu_dereference(). Probably this is not strictly needed, but with or without this change find_get_context() can race with setup_new_exec()->perf_event_exit_task(), rcu_dereference() looks better. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Alan Stern Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Prasad Cc: Roland McGrath LKML-Reference: <20110119182207.GB12183@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 84522c796987..4ec55ef5810c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2201,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid) if (!task) return ERR_PTR(-ESRCH); - /* - * Can't attach events to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - /* Reuse ptrace permission checks for now. */ err = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ)) @@ -2268,14 +2261,27 @@ retry: get_ctx(ctx); - if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ + err = 0; + mutex_lock(&task->perf_event_mutex); + /* + * If it has already passed perf_event_exit_task(). + * we must see PF_EXITING, it takes this mutex too. + */ + if (task->flags & PF_EXITING) + err = -ESRCH; + else if (task->perf_event_ctxp[ctxn]) + err = -EAGAIN; + else + rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + mutex_unlock(&task->perf_event_mutex); + + if (unlikely(err)) { put_task_struct(task); kfree(ctx); - goto retry; + + if (err == -EAGAIN) + goto retry; + goto errout; } } @@ -6127,7 +6133,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = rcu_dereference(child->perf_event_ctxp[ctxn]); task_ctx_sched_out(child_ctx, EVENT_ALL); /* -- cgit v1.3-8-gc7d7 From 8550d7cb6ed6c89add49c3b6ad4c753ab8a3d7f9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 19 Jan 2011 19:22:28 +0100 Subject: perf: Fix perf_event_init_task()/perf_event_free_task() interaction perf_event_init_task() should clear child->perf_event_ctxp[] before anything else. Otherwise, if perf_event_init_context(perf_hw_context) fails, perf_event_free_task() can free perf_event_ctxp[perf_sw_context] copied from parent->perf_event_ctxp[] by dup_task_struct(). Also move the initialization of perf_event_mutex and perf_event_list from perf_event_init_context() to perf_event_init_context(). Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Cc: Alan Stern Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Prasad Cc: Roland McGrath LKML-Reference: <20110119182228.GC12183@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 4ec55ef5810c..244ca3acb0ee 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6446,11 +6446,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) unsigned long flags; int ret = 0; - child->perf_event_ctxp[ctxn] = NULL; - - mutex_init(&child->perf_event_mutex); - INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_event_ctxp[ctxn])) return 0; @@ -6539,6 +6534,10 @@ int perf_event_init_task(struct task_struct *child) { int ctxn, ret; + memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); if (ret) -- cgit v1.3-8-gc7d7 From 2d0640b47da74cff7c11642c798d40de861ed524 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Tue, 18 Jan 2011 22:46:34 -0800 Subject: hrtimers: Notify hrtimer users of switches to NOHZ mode When NOHZ=y and high res timers are disabled (via cmdline or Kconfig) tick_nohz_switch_to_nohz() will notify the user about switching into NOHZ mode. Nothing is printed for the case where HIGH_RES_TIMERS=y. Fix this for the HIGH_RES_TIMERS=y case by duplicating the printk from the low res NOHZ path in the high res NOHZ path. This confused me since I was thinking 'dmesg | grep -i NOHZ' would tell me if NOHZ was enabled, but if I have hrtimers there is nothing. Signed-off-by: Stephen Boyd Acked-by: Thomas Gleixner Cc: Peter Zijlstra LKML-Reference: <1295419594-13085-1-git-send-email-sboyd@codeaurora.org> Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd1..c55ea2433471 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void) } local_irq_enable(); - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", - smp_processor_id()); + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); } /* @@ -795,8 +794,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); + } #endif } #endif /* HIGH_RES_TIMERS */ -- cgit v1.3-8-gc7d7 From 2ce802f62ba32a7d95748ac92bf351f76affb6ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:06:35 +0100 Subject: lockdep: Move early boot local IRQ enable/disable status to init/main.c During early boot, local IRQ is disabled until IRQ subsystem is properly initialized. During this time, no one should enable local IRQ and some operations which usually are not allowed with IRQ disabled, e.g. operations which might sleep or require communications with other processors, are allowed. lockdep tracked this with early_boot_irqs_off/on() callbacks. As other subsystems need this information too, move it to init/main.c and make it generally available. While at it, toggle the boolean to early_boot_irqs_disabled instead of enabled so that it can be initialized with %false and %true indicates the exceptional condition. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110635.GB6036@htj.dyndns.org> Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- include/linux/kernel.h | 2 ++ include/linux/lockdep.h | 8 -------- init/main.c | 13 +++++++++++-- kernel/lockdep.c | 18 +----------------- kernel/trace/trace_irqsoff.c | 8 -------- 6 files changed, 15 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7e8d3bc80af6..50542efe45fb 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1194,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; local_irq_disable(); - early_boot_irqs_off(); + early_boot_irqs_disabled = true; memblock_init(); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5a9d9059520b..d07d8057e440 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -243,6 +243,8 @@ extern int test_taint(unsigned flag); extern unsigned long get_taint(void); extern int root_mountflags; +extern bool early_boot_irqs_disabled; + /* Values used for system_state */ extern enum system_states { SYSTEM_BOOTING, diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 71c09b26c759..f638fd78d106 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -436,16 +436,8 @@ do { \ #endif /* CONFIG_LOCKDEP */ #ifdef CONFIG_TRACE_IRQFLAGS -extern void early_boot_irqs_off(void); -extern void early_boot_irqs_on(void); extern void print_irqtrace_events(struct task_struct *curr); #else -static inline void early_boot_irqs_off(void) -{ -} -static inline void early_boot_irqs_on(void) -{ -} static inline void print_irqtrace_events(struct task_struct *curr) { } diff --git a/init/main.c b/init/main.c index 00799c1d4628..33c37c379e96 100644 --- a/init/main.c +++ b/init/main.c @@ -96,6 +96,15 @@ static inline void mark_rodata_ro(void) { } extern void tc_init(void); #endif +/* + * Debug helper: via this flag we know that we are in 'early bootup code' + * where only the boot processor is running with IRQ disabled. This means + * two things - IRQ must not be enabled before the flag is cleared and some + * operations which are not allowed with IRQ disabled are allowed while the + * flag is set. + */ +bool early_boot_irqs_disabled __read_mostly; + enum system_states system_state __read_mostly; EXPORT_SYMBOL(system_state); @@ -554,7 +563,7 @@ asmlinkage void __init start_kernel(void) cgroup_init_early(); local_irq_disable(); - early_boot_irqs_off(); + early_boot_irqs_disabled = true; /* * Interrupts are still disabled. Do necessary setups, then @@ -621,7 +630,7 @@ asmlinkage void __init start_kernel(void) if (!irqs_disabled()) printk(KERN_CRIT "start_kernel(): bug: interrupts were " "enabled early\n"); - early_boot_irqs_on(); + early_boot_irqs_disabled = false; local_irq_enable(); /* Interrupts are enabled now so all GFP allocations are safe. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 42ba65dff7d9..0d2058da80f5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2291,22 +2291,6 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark) return 1; } -/* - * Debugging helper: via this flag we know that we are in - * 'early bootup code', and will warn about any invalid irqs-on event: - */ -static int early_boot_irqs_enabled; - -void early_boot_irqs_off(void) -{ - early_boot_irqs_enabled = 0; -} - -void early_boot_irqs_on(void) -{ - early_boot_irqs_enabled = 1; -} - /* * Hardirqs will be enabled: */ @@ -2319,7 +2303,7 @@ void trace_hardirqs_on_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; - if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled))) + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; if (unlikely(curr->hardirqs_enabled)) { diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5cf8c602b880..92b6e1e12d98 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -453,14 +453,6 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) * Stubs: */ -void early_boot_irqs_off(void) -{ -} - -void early_boot_irqs_on(void) -{ -} - void trace_softirqs_on(unsigned long ip) { } -- cgit v1.3-8-gc7d7 From bd924e8cbd4b73ffb7d707a774c04f7e2cae88ed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 20 Jan 2011 12:07:13 +0100 Subject: smp: Allow on_each_cpu() to be called while early_boot_irqs_disabled status to init/main.c percpu may end up calling vfree() during early boot which in turn may call on_each_cpu() for TLB flushes. The function of on_each_cpu() can be done safely while IRQ is disabled during early boot but it assumed that the function is always called with local IRQ enabled which ended up enabling local IRQ prematurely during boot and triggering a couple of warnings. This patch updates on_each_cpu() and smp_call_function_many() such on_each_cpu() can be used safely while early_boot_irqs_disabled is set. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Acked-by: Pekka Enberg Cc: Linus Torvalds LKML-Reference: <20110120110713.GC6036@htj.dyndns.org> Signed-off-by: Ingo Molnar Reported-by: Ingo Molnar --- kernel/smp.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4ec30e069987..4b83cd6815e2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -430,7 +430,7 @@ void smp_call_function_many(const struct cpumask *mask, * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress); + && !oops_in_progress && !early_boot_irqs_disabled); /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); @@ -533,17 +533,20 @@ void ipi_call_unlock_irq(void) #endif /* USE_GENERIC_SMP_HELPERS */ /* - * Call a function on all processors + * Call a function on all processors. May be used during early boot while + * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead + * of local_irq_disable/enable(). */ int on_each_cpu(void (*func) (void *info), void *info, int wait) { + unsigned long flags; int ret = 0; preempt_disable(); ret = smp_call_function(func, info, wait); - local_irq_disable(); + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); preempt_enable(); return ret; } -- cgit v1.3-8-gc7d7 From 6dc19899958e420a931274b94019e267e2396d3e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 20 Jan 2011 14:44:33 -0800 Subject: kernel/smp.c: fix smp_call_function_many() SMP race I noticed a failure where we hit the following WARN_ON in generic_smp_call_function_interrupt: if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; data->csd.func(data->csd.info); refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); <------------------------- We atomically tested and cleared our bit in the cpumask, and yet the number of cpus left (ie refs) was 0. How can this be? It turns out commit 54fdade1c3332391948ec43530c02c4794a38172 ("generic-ipi: make struct call_function_data lockless") is at fault. It removes locking from smp_call_function_many and in doing so creates a rather complicated race. The problem comes about because: - The smp_call_function_many interrupt handler walks call_function.queue without any locking. - We reuse a percpu data structure in smp_call_function_many. - We do not wait for any RCU grace period before starting the next smp_call_function_many. Imagine a scenario where CPU A does two smp_call_functions back to back, and CPU B does an smp_call_function in between. We concentrate on how CPU C handles the calls: CPU A CPU B CPU C CPU D smp_call_function smp_call_function_interrupt walks call_function.queue sees data from CPU A on list smp_call_function smp_call_function_interrupt walks call_function.queue sees (stale) CPU A on list smp_call_function int clears last ref on A list_del_rcu, unlock smp_call_function reuses percpu *data A data->cpumask sees and clears bit in cpumask might be using old or new fn! decrements refs below 0 set data->refs (too late!) The important thing to note is since the interrupt handler walks a potentially stale call_function.queue without any locking, then another cpu can view the percpu *data structure at any time, even when the owner is in the process of initialising it. The following test case hits the WARN_ON 100% of the time on my PowerPC box (having 128 threads does help :) #include #include #define ITERATIONS 100 static void do_nothing_ipi(void *dummy) { } static void do_ipis(struct work_struct *dummy) { int i; for (i = 0; i < ITERATIONS; i++) smp_call_function(do_nothing_ipi, NULL, 1); printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id()); } static struct work_struct work[NR_CPUS]; static int __init testcase_init(void) { int cpu; for_each_online_cpu(cpu) { INIT_WORK(&work[cpu], do_ipis); schedule_work_on(cpu, &work[cpu]); } return 0; } static void __exit testcase_exit(void) { } module_init(testcase_init) module_exit(testcase_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Anton Blanchard"); I tried to fix it by ordering the read and the write of ->cpumask and ->refs. In doing so I missed a critical case but Paul McKenney was able to spot my bug thankfully :) To ensure we arent viewing previous iterations the interrupt handler needs to read ->refs then ->cpumask then ->refs _again_. Thanks to Milton Miller and Paul McKenney for helping to debug this issue. [miltonm@bga.com: add WARN_ON and BUG_ON, remove extra read of refs before initial read of mask that doesn't help (also noted by Peter Zijlstra), adjust comments, hopefully clarify scenario ] [miltonm@bga.com: remove excess tests] Signed-off-by: Anton Blanchard Signed-off-by: Milton Miller Cc: Ingo Molnar Cc: "Paul E. McKenney" Cc: [2.6.32+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 4ec30e069987..17c6e5860231 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -195,6 +195,24 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + /* + * Since we walk the list without any locks, we might + * see an entry that was completed, removed from the + * list and is in the process of being reused. + * + * We must check that the cpu is in the cpumask before + * checking the refs, and both must be set before + * executing the callback on this cpu. + */ + + if (!cpumask_test_cpu(cpu, data->cpumask)) + continue; + + smp_rmb(); + + if (atomic_read(&data->refs) == 0) + continue; + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; @@ -203,6 +221,8 @@ void generic_smp_call_function_interrupt(void) refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); if (!refs) { + WARN_ON(!cpumask_empty(data->cpumask)); + raw_spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); raw_spin_unlock(&call_function.lock); @@ -454,11 +474,21 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); + + /* + * To ensure the interrupt handler gets an complete view + * we order the cpumask and refs writes and order the read + * of them in the interrupt handler. In addition we may + * only clear our own cpu bit from the mask. + */ + smp_wmb(); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); raw_spin_lock_irqsave(&call_function.lock, flags); -- cgit v1.3-8-gc7d7 From 225c8e010f2d17a62aef131e24c6e7c111f36f9b Mon Sep 17 00:00:00 2001 From: Milton Miller Date: Thu, 20 Jan 2011 14:44:34 -0800 Subject: kernel/smp.c: consolidate writes in smp_call_function_interrupt() We have to test the cpu mask in the interrupt handler before checking the refs, otherwise we can start to follow an entry before its deleted and find it partially initailzed for the next trip. Presently we also clear the cpumask bit before executing the called function, which implies getting write access to the line. After the function is called we then decrement refs, and if they go to zero we then unlock the structure. However, this implies getting write access to the call function data before and after another the function is called. If we can assert that no smp_call_function execution function is allowed to enable interrupts, then we can move both writes to after the function is called, hopfully allowing both writes with one cache line bounce. On a 256 thread system with a kernel compiled for 1024 threads, the time to execute testcase in the "smp_call_function_many race" changelog was reduced by about 30-40ms out of about 545 ms. I decided to keep this as WARN because its now a buggy function, even though the stack trace is of no value -- a simple printk would give us the information needed. Raw data: Without patch: ipi_test startup took 1219366ns complete 539819014ns total 541038380ns ipi_test startup took 1695754ns complete 543439872ns total 545135626ns ipi_test startup took 7513568ns complete 539606362ns total 547119930ns ipi_test startup took 13304064ns complete 533898562ns total 547202626ns ipi_test startup took 8668192ns complete 544264074ns total 552932266ns ipi_test startup took 4977626ns complete 548862684ns total 553840310ns ipi_test startup took 2144486ns complete 541292318ns total 543436804ns ipi_test startup took 21245824ns complete 530280180ns total 551526004ns With patch: ipi_test startup took 5961748ns complete 500859628ns total 506821376ns ipi_test startup took 8975996ns complete 495098924ns total 504074920ns ipi_test startup took 19797750ns complete 492204740ns total 512002490ns ipi_test startup took 14824796ns complete 487495878ns total 502320674ns ipi_test startup took 11514882ns complete 494439372ns total 505954254ns ipi_test startup took 8288084ns complete 502570774ns total 510858858ns ipi_test startup took 6789954ns complete 493388112ns total 500178066ns #include #include #include /* sched clock */ #define ITERATIONS 100 static void do_nothing_ipi(void *dummy) { } static void do_ipis(struct work_struct *dummy) { int i; for (i = 0; i < ITERATIONS; i++) smp_call_function(do_nothing_ipi, NULL, 1); printk(KERN_DEBUG "cpu %d finished\n", smp_processor_id()); } static struct work_struct work[NR_CPUS]; static int __init testcase_init(void) { int cpu; u64 start, started, done; start = local_clock(); for_each_online_cpu(cpu) { INIT_WORK(&work[cpu], do_ipis); schedule_work_on(cpu, &work[cpu]); } started = local_clock(); for_each_online_cpu(cpu) flush_work(&work[cpu]); done = local_clock(); pr_info("ipi_test startup took %lldns complete %lldns total %lldns\n", started-start, done-started, done-start); return 0; } static void __exit testcase_exit(void) { } module_init(testcase_init) module_exit(testcase_exit) MODULE_LICENSE("GPL"); MODULE_AUTHOR("Anton Blanchard"); Signed-off-by: Milton Miller Cc: Anton Blanchard Cc: Ingo Molnar Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 17c6e5860231..2fe66f7c617a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -194,6 +194,7 @@ void generic_smp_call_function_interrupt(void) */ list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; + void (*func) (void *info); /* * Since we walk the list without any locks, we might @@ -213,24 +214,32 @@ void generic_smp_call_function_interrupt(void) if (atomic_read(&data->refs) == 0) continue; - if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) - continue; - + func = data->csd.func; /* for later warn */ data->csd.func(data->csd.info); + /* + * If the cpu mask is not still set then it enabled interrupts, + * we took another smp interrupt, and executed the function + * twice on this cpu. In theory that copy decremented refs. + */ + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { + WARN(1, "%pS enabled interrupts and double executed\n", + func); + continue; + } + refs = atomic_dec_return(&data->refs); WARN_ON(refs < 0); - if (!refs) { - WARN_ON(!cpumask_empty(data->cpumask)); - - raw_spin_lock(&call_function.lock); - list_del_rcu(&data->csd.list); - raw_spin_unlock(&call_function.lock); - } if (refs) continue; + WARN_ON(!cpumask_empty(data->cpumask)); + + raw_spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + raw_spin_unlock(&call_function.lock); + csd_unlock(&data->csd); } -- cgit v1.3-8-gc7d7 From 1c77ff22f539ceaa64ea43d6a26d867e84602cb7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 19 Jan 2011 19:41:35 +0100 Subject: genirq: Remove __do_IRQ All architectures are finally converted. Remove the cruft. Signed-off-by: Thomas Gleixner Cc: Richard Henderson Cc: Mike Frysinger Cc: David Howells Cc: Tony Luck Cc: Greg Ungerer Cc: Michal Simek Acked-by: David Howells Cc: Kyle McMartin Acked-by: Benjamin Herrenschmidt Cc: Chen Liqin Cc: "David S. Miller" Cc: Chris Metcalf Cc: Jeff Dike --- Documentation/feature-removal-schedule.txt | 8 --- arch/alpha/Kconfig | 3 - arch/blackfin/Kconfig | 3 - arch/frv/Kconfig | 4 -- arch/ia64/Kconfig | 3 - arch/m68knommu/Kconfig | 4 -- arch/microblaze/Kconfig | 3 - arch/mips/Kconfig | 3 - arch/mn10300/Kconfig | 3 - arch/parisc/Kconfig | 4 -- arch/powerpc/Kconfig | 4 -- arch/score/Kconfig | 3 - arch/sparc/Kconfig | 4 -- arch/tile/Kconfig | 3 - arch/um/Kconfig.um | 3 - include/linux/irqdesc.h | 14 ---- kernel/irq/Kconfig | 3 - kernel/irq/handle.c | 111 ----------------------------- 18 files changed, 183 deletions(-) (limited to 'kernel') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 8c594c45b6a1..b959659c5df4 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -357,14 +357,6 @@ Who: Dave Jones , Matthew Garrett ----------------------------- -What: __do_IRQ all in one fits nothing interrupt handler -When: 2.6.32 -Why: __do_IRQ was kept for easy migration to the type flow handlers. - More than two years of migration time is enough. -Who: Thomas Gleixner - ------------------------------ - What: fakephp and associated sysfs files in /sys/bus/pci/slots/ When: 2011 Why: In 2.6.27, the semantics of /sys/bus/pci/slots was redefined to diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index fc95ee1bcf6f..943fe6930f77 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -68,9 +68,6 @@ config GENERIC_IOMAP bool default n -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_HARDIRQS bool default y diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 0a221d48152d..a37b2be23f18 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -50,9 +50,6 @@ config GENERIC_HARDIRQS config GENERIC_IRQ_PROBE def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_GPIO def_bool y diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index f6bcb039cd6d..e504edeb3d84 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -33,10 +33,6 @@ config GENERIC_HARDIRQS bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - bool - default y - config TIME_LOW_RES bool default y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index e0f5b6d7f849..be1faf991813 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -684,9 +684,6 @@ source "lib/Kconfig" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_IRQ_PROBE bool default y diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index 704e7b92334c..7379cb0ce1af 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -52,10 +52,6 @@ config GENERIC_HARDIRQS bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - bool - default y - config GENERIC_CALIBRATE_DELAY bool default y diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 5f5018a71a3d..5a6378493797 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -52,9 +52,6 @@ config GENERIC_TIME_VSYSCALL config GENERIC_CLOCKEVENTS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_GPIO def_bool y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 548e6cc3bc28..f5ecc0566bc2 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -793,9 +793,6 @@ config SCHED_OMIT_FRAME_POINTER bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # # Select some configuration options automatically based on user selections. # diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 8ed41cf2b08d..4638269152f6 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -34,9 +34,6 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM bool -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 0888675c98dd..4b94ac4dc603 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -12,7 +12,6 @@ config PARISC select HAVE_IRQ_WORK select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT - select GENERIC_HARDIRQS_NO__DO_IRQ help The PA-RISC microprocessor is designed by Hewlett-Packard and used in many of their workstations & servers (HP9000 700 and 800 series, @@ -79,9 +78,6 @@ config IRQ_PER_CPU bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # unless you want to implement ACPI on PA-RISC ... ;-) config PM bool diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 959f38ccb9a7..e0b185dd718a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -40,10 +40,6 @@ config GENERIC_HARDIRQS bool default y -config GENERIC_HARDIRQS_NO__DO_IRQ - bool - default y - config HAVE_SETUP_PER_CPU_AREA def_bool PPC64 diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 4293fdcb5398..4f159acfbe33 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -53,9 +53,6 @@ config GENERIC_CLOCKEVENTS config SCHED_NO_NO_OMIT_FRAME_POINTER def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_SYSCALL_TABLE def_bool y diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 45d9c87d083a..989bb6415ea3 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -107,10 +107,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y if SPARC64 -config GENERIC_HARDIRQS_NO__DO_IRQ - bool - def_bool y if SPARC64 - config MMU bool default y diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 4e8b82bca9e5..c16b98c2435d 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -10,9 +10,6 @@ config GENERIC_CSUM config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config GENERIC_IRQ_PROBE def_bool y diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um index f8d1d0d47fe6..90a438acbfaf 100644 --- a/arch/um/Kconfig.um +++ b/arch/um/Kconfig.um @@ -120,9 +120,6 @@ config SMP If you don't know what to do, say N. -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - config NR_CPUS int "Maximum number of CPUs (2-32)" range 2 32 diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 6a64c6fa81af..c1a95b7b58de 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -100,13 +100,6 @@ static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) #define get_irq_desc_data(desc) ((desc)->irq_data.handler_data) #define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc) -/* - * Monolithic do_IRQ implementation. - */ -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ -extern unsigned int __do_IRQ(unsigned int irq); -#endif - /* * Architectures call this to let the generic IRQ layer * handle an interrupt. If the descriptor is attached to an @@ -115,14 +108,7 @@ extern unsigned int __do_IRQ(unsigned int irq); */ static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc) { -#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); -#else - if (likely(desc->handle_irq)) - desc->handle_irq(irq, desc); - else - __do_IRQ(irq); -#endif } static inline void generic_handle_irq(unsigned int irq) diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 31d766bf5d2e..8e42fec7686d 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -9,9 +9,6 @@ menu "IRQ subsystem" config GENERIC_HARDIRQS def_bool y -config GENERIC_HARDIRQS_NO__DO_IRQ - def_bool y - # Select this to disable the deprecated stuff config GENERIC_HARDIRQS_NO_DEPRECATED def_bool n diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e2347eb63306..3540a7190122 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -118,114 +118,3 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) return retval; } - -#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ - -#ifdef CONFIG_ENABLE_WARN_DEPRECATED -# warning __do_IRQ is deprecated. Please convert to proper flow handlers -#endif - -/** - * __do_IRQ - original all in one highlevel IRQ handler - * @irq: the interrupt number - * - * __do_IRQ handles all normal device IRQ's (the special - * SMP cross-CPU interrupts have their own specific - * handlers). - * - * This is the original x86 implementation which is used for every - * interrupt type. - */ -unsigned int __do_IRQ(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - struct irqaction *action; - unsigned int status; - - kstat_incr_irqs_this_cpu(irq, desc); - - if (CHECK_IRQ_PER_CPU(desc->status)) { - irqreturn_t action_ret; - - /* - * No locking required for CPU-local interrupts: - */ - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - if (likely(!(desc->status & IRQ_DISABLED))) { - action_ret = handle_IRQ_event(irq, desc->action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - } - desc->irq_data.chip->end(irq); - return 1; - } - - raw_spin_lock(&desc->lock); - if (desc->irq_data.chip->ack) - desc->irq_data.chip->ack(irq); - /* - * REPLAY is when Linux resends an IRQ that was dropped earlier - * WAITING is used by probe to mark irqs that are being tested - */ - status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); - status |= IRQ_PENDING; /* we _want_ to handle it */ - - /* - * If the IRQ is disabled for whatever reason, we cannot - * use the action we have. - */ - action = NULL; - if (likely(!(status & (IRQ_DISABLED | IRQ_INPROGRESS)))) { - action = desc->action; - status &= ~IRQ_PENDING; /* we commit to handling */ - status |= IRQ_INPROGRESS; /* we are handling it */ - } - desc->status = status; - - /* - * If there is no IRQ handler or it was disabled, exit early. - * Since we set PENDING, if another processor is handling - * a different instance of this same irq, the other processor - * will take care of it. - */ - if (unlikely(!action)) - goto out; - - /* - * Edge triggered interrupts need to remember - * pending events. - * This applies to any hw interrupts that allow a second - * instance of the same irq to arrive while we are in do_IRQ - * or in the handler. But the code here only handles the _second_ - * instance of the irq, not the third or fourth. So it is mostly - * useful for irq hardware that does not mask cleanly in an - * SMP environment. - */ - for (;;) { - irqreturn_t action_ret; - - raw_spin_unlock(&desc->lock); - - action_ret = handle_IRQ_event(irq, action); - if (!noirqdebug) - note_interrupt(irq, desc, action_ret); - - raw_spin_lock(&desc->lock); - if (likely(!(desc->status & IRQ_PENDING))) - break; - desc->status &= ~IRQ_PENDING; - } - desc->status &= ~IRQ_INPROGRESS; - -out: - /* - * The ->end() handler has to deal with interrupts which got - * disabled while the handler was running. - */ - desc->irq_data.chip->end(irq); - raw_spin_unlock(&desc->lock); - - return 1; -} -#endif -- cgit v1.3-8-gc7d7 From 547e9fd7d328af261f184bf66effc5033c886498 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Jan 2011 12:51:39 +0100 Subject: perf: Annotate cpuctx->ctx.mutex to avoid a lockdep splat Lockdep spotted: loop_1b_instruc/1899 is trying to acquire lock: (event_mutex){+.+.+.}, at: [] perf_trace_init+0x3b/0x2f7 but task is already holding lock: (&ctx->mutex){+.+.+.}, at: [] perf_event_init_context+0xc0/0x218 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&ctx->mutex){+.+.+.}: -> #2 (cpu_hotplug.lock){+.+.+.}: -> #1 (module_mutex){+.+...}: -> #0 (event_mutex){+.+.+.}: But because the deadlock would be cpuhotplug (cpu-event) vs fork (task-event) it cannot, in fact, happen. We can annotate this by giving the perf_event_context used for the cpuctx a different lock class from those used by tasks. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 244ca3acb0ee..c5fa717cf099 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5380,6 +5380,8 @@ free_dev: goto out; } +static struct lock_class_key cpuctx_mutex; + int perf_pmu_register(struct pmu *pmu, char *name, int type) { int cpu, ret; @@ -5428,6 +5430,7 @@ skip_type: cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; -- cgit v1.3-8-gc7d7 From 806839b22cbda90176d7f8d421889bddd7826e93 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 21 Jan 2011 18:45:47 +0100 Subject: perf: perf_event_exit_task_context: s/rcu_dereference/rcu_dereference_raw/ In theory, almost every user of task->child->perf_event_ctxp[] is wrong. find_get_context() can install the new context at any moment, we need read_barrier_depends(). dbe08d82ce3967ccdf459f7951d02589cf967300 "perf: Fix find_get_context() vs perf_event_exit_task() race" added rcu_dereference() into perf_event_exit_task_context() to make the precedent, but this makes __rcu_dereference_check() unhappy. Use rcu_dereference_raw() to shut up the warning. Reported-by: Ingo Molnar Signed-off-by: Oleg Nesterov Cc: acme@redhat.com Cc: paulus@samba.org Cc: stern@rowland.harvard.edu Cc: a.p.zijlstra@chello.nl Cc: fweisbec@gmail.com Cc: roland@redhat.com Cc: prasad@linux.vnet.ibm.com Cc: Paul E. McKenney LKML-Reference: <20110121174547.GA8796@redhat.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c5fa717cf099..126a302c481c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6136,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = rcu_dereference(child->perf_event_ctxp[ctxn]); + child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); task_ctx_sched_out(child_ctx, EVENT_ALL); /* -- cgit v1.3-8-gc7d7 From e94965ed5beb23c6fabf7ed31f625e66d7ff28de Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 15 Dec 2010 14:00:19 -0800 Subject: module: show version information for built-in modules in sysfs Currently only drivers that are built as modules have their versions shown in /sys/module//version, but this information might also be useful for built-in drivers as well. This especially important for drivers that do not define any parameters - such drivers, if built-in, are completely invisible from userspace. This patch changes MODULE_VERSION() macro so that in case when we are compiling built-in module, version information is stored in a separate section. Kernel then uses this data to create 'version' sysfs attribute in the same fashion it creates attributes for module parameters. Signed-off-by: Dmitry Torokhov Signed-off-by: Rusty Russell --- include/asm-generic/vmlinux.lds.h | 7 +++++ include/linux/module.h | 27 ++++++++++++++++ kernel/params.c | 65 ++++++++++++++++++++++++++++++++------- 3 files changed, 88 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 68649336c4ad..6ebb81030d2d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -364,6 +364,13 @@ VMLINUX_SYMBOL(__start___param) = .; \ *(__param) \ VMLINUX_SYMBOL(__stop___param) = .; \ + } \ + \ + /* Built-in module versions. */ \ + __modver : AT(ADDR(__modver) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start___modver) = .; \ + *(__modver) \ + VMLINUX_SYMBOL(__stop___modver) = .; \ . = ALIGN((align)); \ VMLINUX_SYMBOL(__end_rodata) = .; \ } \ diff --git a/include/linux/module.h b/include/linux/module.h index 8b17fd8c790d..50efcd3ae850 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -58,6 +58,12 @@ struct module_attribute { void (*free)(struct module *); }; +struct module_version_attribute { + struct module_attribute mattr; + const char *module_name; + const char *version; +}; + struct module_kobject { struct kobject kobj; @@ -161,7 +167,28 @@ extern struct module __this_module; Using this automatically adds a checksum of the .c files and the local headers in "srcversion". */ + +#ifdef MODULE #define MODULE_VERSION(_version) MODULE_INFO(version, _version) +#else +#define MODULE_VERSION(_version) \ + extern ssize_t __modver_version_show(struct module_attribute *, \ + struct module *, char *); \ + static struct module_version_attribute __modver_version_attr \ + __used \ + __attribute__ ((__section__ ("__modver"),aligned(sizeof(void *)))) \ + = { \ + .mattr = { \ + .attr = { \ + .name = "version", \ + .mode = S_IRUGO, \ + }, \ + .show = __modver_version_show, \ + }, \ + .module_name = KBUILD_MODNAME, \ + .version = _version, \ + } +#endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware diff --git a/kernel/params.c b/kernel/params.c index 08107d181758..0da1411222b9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, - unsigned int name_skip) +static struct module_kobject * __init locate_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; @@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name, kobj = kset_find_obj(module_kset, name); if (kobj) { - /* We already have one. Remove params so we can add more. */ mk = to_module_kobject(kobj); - /* We need to remove it before adding parameters. */ - sysfs_remove_group(&mk->kobj, &mk->mp->grp); } else { mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); @@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name, "%s", name); if (err) { kobject_put(&mk->kobj); - printk(KERN_ERR "Module '%s' failed add to sysfs, " - "error number %d\n", name, err); - printk(KERN_ERR "The system will be unstable now.\n"); - return; + printk(KERN_ERR + "Module '%s' failed add to sysfs, error number %d\n", + name, err); + printk(KERN_ERR + "The system will be unstable now.\n"); + return NULL; } - /* So that exit path is even. */ + + /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); } + return mk; +} + +static void __init kernel_add_sysfs_param(const char *name, + struct kernel_param *kparam, + unsigned int name_skip) +{ + struct module_kobject *mk; + int err; + + mk = locate_module_kobject(name); + if (!mk) + return; + + /* We need to remove old parameters before adding more. */ + if (mk->mp) + sysfs_remove_group(&mk->kobj, &mk->mp->grp); + /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); @@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void) } } +ssize_t __modver_version_show(struct module_attribute *mattr, + struct module *mod, char *buf) +{ + struct module_version_attribute *vattr = + container_of(mattr, struct module_version_attribute, mattr); + + return sprintf(buf, "%s\n", vattr->version); +} + +extern struct module_version_attribute __start___modver[], __stop___modver[]; + +static void __init version_sysfs_builtin(void) +{ + const struct module_version_attribute *vattr; + struct module_kobject *mk; + int err; + + for (vattr = __start___modver; vattr < __stop___modver; vattr++) { + mk = locate_module_kobject(vattr->module_name); + if (mk) { + err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); + kobject_uevent(&mk->kobj, KOBJ_ADD); + kobject_put(&mk->kobj); + } + } +} /* module-related sysfs stuff */ @@ -875,6 +917,7 @@ static int __init param_sysfs_init(void) } module_sysfs_initialized = 1; + version_sysfs_builtin(); param_sysfs_builtin(); return 0; -- cgit v1.3-8-gc7d7 From 3ff6dcac735704824c1dff64dc6863c390d364cc Mon Sep 17 00:00:00 2001 From: Yong Zhang Date: Mon, 24 Jan 2011 15:33:52 +0800 Subject: sched: Fix poor interactivity on UP systems due to group scheduler nice tune bug Michael Witten and Christian Kujau reported that the autogroup scheduling feature hurts interactivity on their UP systems. It turns out that this is an older bug in the group scheduling code, and the wider appeal provided by the autogroup feature exposed it more prominently. When on UP with FAIR_GROUP_SCHED enabled, tune shares only affect tg->shares, but is not reflected in tg->se->load. The reason is that update_cfs_shares() does nothing on UP. So introduce update_cfs_shares() for UP && FAIR_GROUP_SCHED. This issue was found when enable autogroup scheduling was enabled, but it is an older bug that also exists on cgroup.cpu on UP. Reported-and-Tested-by: Michael Witten Reported-and-Tested-by: Christian Kujau Signed-off-by: Yong Zhang Acked-by: Pekka Enberg Acked-by: Mike Galbraith Acked-by: Peter Zijlstra Cc: Linus Torvalds LKML-Reference: <20110124073352.GA24186@windriver.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 78 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 77e9166d7bbf..354769979c02 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_FAIR_GROUP_SCHED +# ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) { @@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + long load_weight, load, shares; + + load = cfs_rq->load.weight + weight_delta; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + return shares; +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} +# else /* CONFIG_SMP */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + return tg->shares; +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +# endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -782,7 +828,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { struct task_group *tg; struct sched_entity *se; - long load_weight, load, shares; + long shares; if (!cfs_rq) return; @@ -791,32 +837,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) se = tg->se[cpu_of(rq_of(cfs_rq))]; if (!se) return; - - load = cfs_rq->load.weight + weight_delta; - - load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; - load_weight += load; - - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; +#ifndef CONFIG_SMP + if (likely(se->load.weight == tg->shares)) + return; +#endif + shares = calc_cfs_shares(cfs_rq, tg, weight_delta); reweight_entity(cfs_rq_of(se), se, shares); } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); - } -} #else /* CONFIG_FAIR_GROUP_SCHED */ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { -- cgit v1.3-8-gc7d7 From 8c6a98b22b750c9eb52653ba643faa17db8d3881 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Mon, 24 Jan 2011 09:31:38 -0800 Subject: Input: sysrq - ensure sysrq_enabled and __sysrq_enabled are consistent Currently sysrq_enabled and __sysrq_enabled are initialised separately and inconsistently, leading to sysrq being actually enabled by reported as not enabled in sysfs. The first change to the sysfs configurable synchronises these two: static int __read_mostly sysrq_enabled = 1; static int __sysrq_enabled; Add a common define to carry the default for these preventing them becoming out of sync again. Default this to 1 to mirror previous behaviour. Signed-off-by: Andy Whitcroft Cc: stable@kernel.org Signed-off-by: Dmitry Torokhov --- drivers/char/sysrq.c | 2 +- include/linux/sysrq.h | 3 +++ kernel/sysctl.c | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index c556ed9db13d..8e0dd254eb11 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -46,7 +46,7 @@ #include /* Whether we react on sysrq keys or just ignore them */ -static int __read_mostly sysrq_enabled = 1; +static int __read_mostly sysrq_enabled = SYSRQ_DEFAULT_ENABLE; static bool __read_mostly sysrq_always_enabled; static bool sysrq_on(void) diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 387fa7d05c98..7faf933cced7 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -17,6 +17,9 @@ #include #include +/* Enable/disable SYSRQ support by default (0==no, 1==yes). */ +#define SYSRQ_DEFAULT_ENABLE 1 + /* Possible values of bitmask for enabling sysrq functions */ /* 0x0001 is reserved for enable everything */ #define SYSRQ_ENABLE_LOG 0x0002 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c33a1edb799f..3afce4dc9ba5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -169,7 +169,8 @@ static int proc_taint(struct ctl_table *table, int write, #endif #ifdef CONFIG_MAGIC_SYSRQ -static int __sysrq_enabled; /* Note: sysrq code ises it's own private copy */ +/* Note: sysrq code uses it's own private copy */ +static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; static int sysrq_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *lenp, -- cgit v1.3-8-gc7d7 From ac751efa6a0d70f2c9daef5c7e3a92270f5c2dff Mon Sep 17 00:00:00 2001 From: Torben Hohn Date: Tue, 25 Jan 2011 15:07:35 -0800 Subject: console: rename acquire/release_console_sem() to console_lock/unlock() The -rt patches change the console_semaphore to console_mutex. As a result, a quite large chunk of the patches changes all acquire/release_console_sem() to acquire/release_console_mutex() This commit makes things use more neutral function names which dont make implications about the underlying lock. The only real change is the return value of console_trylock which is inverted from try_acquire_console_sem() This patch also paves the way to switching console_sem from a semaphore to a mutex. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make console_trylock return 1 on success, per Geert] Signed-off-by: Torben Hohn Cc: Thomas Gleixner Cc: Greg KH Cc: Ingo Molnar Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-omap2/pm24xx.c | 4 +- arch/arm/mach-omap2/pm34xx.c | 4 +- arch/arm/mach-omap2/serial.c | 4 +- arch/parisc/kernel/pdc_cons.c | 4 +- drivers/char/bfin_jtag_comm.c | 8 +-- drivers/gpu/drm/nouveau/nouveau_drv.c | 8 +-- drivers/gpu/drm/radeon/radeon_device.c | 10 +-- drivers/staging/msm/msm_fb.c | 8 +-- drivers/staging/olpc_dcon/olpc_dcon.c | 10 +-- drivers/staging/sm7xx/smtcfb.c | 8 +-- drivers/tty/serial/sb1250-duart.c | 2 +- drivers/tty/tty_io.c | 4 +- drivers/tty/vt/selection.c | 4 +- drivers/tty/vt/vc_screen.c | 16 ++--- drivers/tty/vt/vt.c | 124 ++++++++++++++++----------------- drivers/tty/vt/vt_ioctl.c | 60 ++++++++-------- drivers/video/arkfb.c | 12 ++-- drivers/video/aty/aty128fb.c | 12 ++-- drivers/video/aty/atyfb_base.c | 10 +-- drivers/video/aty/radeon_pm.c | 10 +-- drivers/video/chipsfb.c | 8 +-- drivers/video/console/fbcon.c | 42 +++++------ drivers/video/da8xx-fb.c | 8 +-- drivers/video/fbmem.c | 12 ++-- drivers/video/fbsysfs.c | 20 +++--- drivers/video/geode/gxfb_core.c | 8 +-- drivers/video/geode/lxfb_core.c | 8 +-- drivers/video/i810/i810_main.c | 8 +-- drivers/video/jz4740_fb.c | 8 +-- drivers/video/mx3fb.c | 8 +-- drivers/video/nvidia/nvidia.c | 8 +-- drivers/video/ps3fb.c | 16 ++--- drivers/video/s3fb.c | 16 ++--- drivers/video/savage/savagefb_driver.c | 8 +-- drivers/video/sh_mobile_hdmi.c | 8 +-- drivers/video/sh_mobile_lcdcfb.c | 4 +- drivers/video/sm501fb.c | 8 +-- drivers/video/tmiofb.c | 10 +-- drivers/video/via/viafbdev.c | 8 +-- drivers/video/vt8623fb.c | 12 ++-- drivers/video/xen-fbfront.c | 4 +- fs/proc/consoles.c | 4 +- include/linux/console.h | 6 +- kernel/printk.c | 100 ++++++++++++++------------ 44 files changed, 336 insertions(+), 328 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-omap2/pm24xx.c b/arch/arm/mach-omap2/pm24xx.c index 9e5dc8ed51e9..97feb3ab6a69 100644 --- a/arch/arm/mach-omap2/pm24xx.c +++ b/arch/arm/mach-omap2/pm24xx.c @@ -134,7 +134,7 @@ static void omap2_enter_full_retention(void) /* Block console output in case it is on one of the OMAP UARTs */ if (!is_suspending()) - if (try_acquire_console_sem()) + if (!console_trylock()) goto no_sleep; omap_uart_prepare_idle(0); @@ -151,7 +151,7 @@ static void omap2_enter_full_retention(void) omap_uart_resume_idle(0); if (!is_suspending()) - release_console_sem(); + console_unlock(); no_sleep: if (omap2_pm_debug) { diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c index 8cbbeade4b8a..a4aa1920a75c 100644 --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -398,7 +398,7 @@ void omap_sram_idle(void) if (!is_suspending()) if (per_next_state < PWRDM_POWER_ON || core_next_state < PWRDM_POWER_ON) - if (try_acquire_console_sem()) + if (!console_trylock()) goto console_still_active; /* PER */ @@ -481,7 +481,7 @@ void omap_sram_idle(void) } if (!is_suspending()) - release_console_sem(); + console_unlock(); console_still_active: /* Disable IO-PAD and IO-CHAIN wakeup */ diff --git a/arch/arm/mach-omap2/serial.c b/arch/arm/mach-omap2/serial.c index 302da7403a10..32e91a9c8b6b 100644 --- a/arch/arm/mach-omap2/serial.c +++ b/arch/arm/mach-omap2/serial.c @@ -812,7 +812,7 @@ void __init omap_serial_init_port(struct omap_board_data *bdata) oh->dev_attr = uart; - acquire_console_sem(); /* in case the earlycon is on the UART */ + console_lock(); /* in case the earlycon is on the UART */ /* * Because of early UART probing, UART did not get idled @@ -838,7 +838,7 @@ void __init omap_serial_init_port(struct omap_board_data *bdata) omap_uart_block_sleep(uart); uart->timeout = DEFAULT_TIMEOUT; - release_console_sem(); + console_unlock(); if ((cpu_is_omap34xx() && uart->padconf) || (uart->wk_en && uart->wk_mask)) { diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c index 11bdd68e5762..fc770be465ff 100644 --- a/arch/parisc/kernel/pdc_cons.c +++ b/arch/parisc/kernel/pdc_cons.c @@ -169,11 +169,11 @@ static int __init pdc_console_tty_driver_init(void) struct console *tmp; - acquire_console_sem(); + console_lock(); for_each_console(tmp) if (tmp == &pdc_cons) break; - release_console_sem(); + console_unlock(); if (!tmp) { printk(KERN_INFO "PDC console driver not registered anymore, not creating %s\n", pdc_cons.name); diff --git a/drivers/char/bfin_jtag_comm.c b/drivers/char/bfin_jtag_comm.c index e397df3ad98e..16402445f2b2 100644 --- a/drivers/char/bfin_jtag_comm.c +++ b/drivers/char/bfin_jtag_comm.c @@ -183,16 +183,16 @@ bfin_jc_circ_write(const unsigned char *buf, int count) } #ifndef CONFIG_BFIN_JTAG_COMM_CONSOLE -# define acquire_console_sem() -# define release_console_sem() +# define console_lock() +# define console_unlock() #endif static int bfin_jc_write(struct tty_struct *tty, const unsigned char *buf, int count) { int i; - acquire_console_sem(); + console_lock(); i = bfin_jc_circ_write(buf, count); - release_console_sem(); + console_unlock(); wake_up_process(bfin_jc_kthread); return i; } diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.c b/drivers/gpu/drm/nouveau/nouveau_drv.c index 13bb672a16f4..f658a04eecf9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.c +++ b/drivers/gpu/drm/nouveau/nouveau_drv.c @@ -234,9 +234,9 @@ nouveau_pci_suspend(struct pci_dev *pdev, pm_message_t pm_state) pci_set_power_state(pdev, PCI_D3hot); } - acquire_console_sem(); + console_lock(); nouveau_fbcon_set_suspend(dev, 1); - release_console_sem(); + console_unlock(); nouveau_fbcon_restore_accel(dev); return 0; @@ -359,9 +359,9 @@ nouveau_pci_resume(struct pci_dev *pdev) nv_crtc->lut.depth = 0; } - acquire_console_sem(); + console_lock(); nouveau_fbcon_set_suspend(dev, 0); - release_console_sem(); + console_unlock(); nouveau_fbcon_zfill_all(dev); diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 26091d602b84..0d478932b1a9 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -891,9 +891,9 @@ int radeon_suspend_kms(struct drm_device *dev, pm_message_t state) pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3hot); } - acquire_console_sem(); + console_lock(); radeon_fbdev_set_suspend(rdev, 1); - release_console_sem(); + console_unlock(); return 0; } @@ -905,11 +905,11 @@ int radeon_resume_kms(struct drm_device *dev) if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; - acquire_console_sem(); + console_lock(); pci_set_power_state(dev->pdev, PCI_D0); pci_restore_state(dev->pdev); if (pci_enable_device(dev->pdev)) { - release_console_sem(); + console_unlock(); return -1; } pci_set_master(dev->pdev); @@ -920,7 +920,7 @@ int radeon_resume_kms(struct drm_device *dev) radeon_restore_bios_scratch_regs(rdev); radeon_fbdev_set_suspend(rdev, 0); - release_console_sem(); + console_unlock(); /* reset hpd state */ radeon_hpd_init(rdev); diff --git a/drivers/staging/msm/msm_fb.c b/drivers/staging/msm/msm_fb.c index 23fa049b51f2..a2f29d464051 100644 --- a/drivers/staging/msm/msm_fb.c +++ b/drivers/staging/msm/msm_fb.c @@ -347,7 +347,7 @@ static int msm_fb_suspend(struct platform_device *pdev, pm_message_t state) if ((!mfd) || (mfd->key != MFD_KEY)) return 0; - acquire_console_sem(); + console_lock(); fb_set_suspend(mfd->fbi, 1); ret = msm_fb_suspend_sub(mfd); @@ -358,7 +358,7 @@ static int msm_fb_suspend(struct platform_device *pdev, pm_message_t state) pdev->dev.power.power_state = state; } - release_console_sem(); + console_unlock(); return ret; } #else @@ -431,11 +431,11 @@ static int msm_fb_resume(struct platform_device *pdev) if ((!mfd) || (mfd->key != MFD_KEY)) return 0; - acquire_console_sem(); + console_lock(); ret = msm_fb_resume_sub(mfd); pdev->dev.power.power_state = PMSG_ON; fb_set_suspend(mfd->fbi, 1); - release_console_sem(); + console_unlock(); return ret; } diff --git a/drivers/staging/olpc_dcon/olpc_dcon.c b/drivers/staging/olpc_dcon/olpc_dcon.c index 9f26dc9408bb..56a283d1a74d 100644 --- a/drivers/staging/olpc_dcon/olpc_dcon.c +++ b/drivers/staging/olpc_dcon/olpc_dcon.c @@ -373,17 +373,17 @@ static void dcon_source_switch(struct work_struct *work) * * For now, we just hope.. */ - acquire_console_sem(); + console_lock(); ignore_fb_events = 1; if (fb_blank(fbinfo, FB_BLANK_UNBLANK)) { ignore_fb_events = 0; - release_console_sem(); + console_unlock(); printk(KERN_ERR "olpc-dcon: Failed to enter CPU mode\n"); dcon_pending = DCON_SOURCE_DCON; return; } ignore_fb_events = 0; - release_console_sem(); + console_unlock(); /* And turn off the DCON */ pdata->set_dconload(1); @@ -435,12 +435,12 @@ static void dcon_source_switch(struct work_struct *work) } } - acquire_console_sem(); + console_lock(); ignore_fb_events = 1; if (fb_blank(fbinfo, FB_BLANK_POWERDOWN)) printk(KERN_ERR "olpc-dcon: couldn't blank fb!\n"); ignore_fb_events = 0; - release_console_sem(); + console_unlock(); printk(KERN_INFO "olpc-dcon: The DCON has control\n"); break; diff --git a/drivers/staging/sm7xx/smtcfb.c b/drivers/staging/sm7xx/smtcfb.c index 0bc113c44d39..d007e4a12c14 100644 --- a/drivers/staging/sm7xx/smtcfb.c +++ b/drivers/staging/sm7xx/smtcfb.c @@ -1044,9 +1044,9 @@ static int __maybe_unused smtcfb_suspend(struct pci_dev *pdev, pm_message_t msg) /* when doing suspend, call fb apis and pci apis */ if (msg.event == PM_EVENT_SUSPEND) { - acquire_console_sem(); + console_lock(); fb_set_suspend(&sfb->fb, 1); - release_console_sem(); + console_unlock(); retv = pci_save_state(pdev); pci_disable_device(pdev); retv = pci_choose_state(pdev, msg); @@ -1105,9 +1105,9 @@ static int __maybe_unused smtcfb_resume(struct pci_dev *pdev) smtcfb_setmode(sfb); - acquire_console_sem(); + console_lock(); fb_set_suspend(&sfb->fb, 0); - release_console_sem(); + console_unlock(); return 0; } diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c index a2f2b3254499..602d9845c52f 100644 --- a/drivers/tty/serial/sb1250-duart.c +++ b/drivers/tty/serial/sb1250-duart.c @@ -829,7 +829,7 @@ static void __init sbd_probe_duarts(void) #ifdef CONFIG_SERIAL_SB1250_DUART_CONSOLE /* * Serial console stuff. Very basic, polling driver for doing serial - * console output. The console_sem is held by the caller, so we + * console output. The console_lock is held by the caller, so we * shouldn't be interrupted for more console activity. */ static void sbd_console_putchar(struct uart_port *uport, int ch) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 464d09d97873..6158eae0f64a 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3256,7 +3256,7 @@ static ssize_t show_cons_active(struct device *dev, struct console *c; ssize_t count = 0; - acquire_console_sem(); + console_lock(); for (c = console_drivers; c; c = c->next) { if (!c->device) continue; @@ -3271,7 +3271,7 @@ static ssize_t show_cons_active(struct device *dev, while (i--) count += sprintf(buf + count, "%s%d%c", cs[i]->name, cs[i]->index, i ? ' ':'\n'); - release_console_sem(); + console_unlock(); return count; } diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c index ebae344ce910..c956ed6c83a3 100644 --- a/drivers/tty/vt/selection.c +++ b/drivers/tty/vt/selection.c @@ -316,9 +316,9 @@ int paste_selection(struct tty_struct *tty) /* always called with BTM from vt_ioctl */ WARN_ON(!tty_locked()); - acquire_console_sem(); + console_lock(); poke_blanked_console(); - release_console_sem(); + console_unlock(); ld = tty_ldisc_ref(tty); if (!ld) { diff --git a/drivers/tty/vt/vc_screen.c b/drivers/tty/vt/vc_screen.c index eab3a1ff99e4..a672ed192d33 100644 --- a/drivers/tty/vt/vc_screen.c +++ b/drivers/tty/vt/vc_screen.c @@ -202,7 +202,7 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) /* Select the proper current console and verify * sanity of the situation under the console lock. */ - acquire_console_sem(); + console_lock(); attr = (currcons & 128); currcons = (currcons & 127); @@ -336,9 +336,9 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) * the pagefault handling code may want to call printk(). */ - release_console_sem(); + console_unlock(); ret = copy_to_user(buf, con_buf_start, orig_count); - acquire_console_sem(); + console_lock(); if (ret) { read += (orig_count - ret); @@ -354,7 +354,7 @@ vcs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) if (read) ret = read; unlock_out: - release_console_sem(); + console_unlock(); mutex_unlock(&con_buf_mtx); return ret; } @@ -379,7 +379,7 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) /* Select the proper current console and verify * sanity of the situation under the console lock. */ - acquire_console_sem(); + console_lock(); attr = (currcons & 128); currcons = (currcons & 127); @@ -414,9 +414,9 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) /* Temporarily drop the console lock so that we can read * in the write data from userspace safely. */ - release_console_sem(); + console_unlock(); ret = copy_from_user(con_buf, buf, this_round); - acquire_console_sem(); + console_lock(); if (ret) { this_round -= ret; @@ -542,7 +542,7 @@ vcs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) vcs_scr_updated(vc); unlock_out: - release_console_sem(); + console_unlock(); mutex_unlock(&con_buf_mtx); diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 76407eca9ab0..b230bd3f056f 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -1003,9 +1003,9 @@ static int vt_resize(struct tty_struct *tty, struct winsize *ws) struct vc_data *vc = tty->driver_data; int ret; - acquire_console_sem(); + console_lock(); ret = vc_do_resize(tty, vc, ws->ws_col, ws->ws_row); - release_console_sem(); + console_unlock(); return ret; } @@ -1271,7 +1271,7 @@ static void default_attr(struct vc_data *vc) vc->vc_color = vc->vc_def_color; } -/* console_sem is held */ +/* console_lock is held */ static void csi_m(struct vc_data *vc) { int i; @@ -1415,7 +1415,7 @@ int mouse_reporting(void) return vc_cons[fg_console].d->vc_report_mouse; } -/* console_sem is held */ +/* console_lock is held */ static void set_mode(struct vc_data *vc, int on_off) { int i; @@ -1485,7 +1485,7 @@ static void set_mode(struct vc_data *vc, int on_off) } } -/* console_sem is held */ +/* console_lock is held */ static void setterm_command(struct vc_data *vc) { switch(vc->vc_par[0]) { @@ -1545,7 +1545,7 @@ static void setterm_command(struct vc_data *vc) } } -/* console_sem is held */ +/* console_lock is held */ static void csi_at(struct vc_data *vc, unsigned int nr) { if (nr > vc->vc_cols - vc->vc_x) @@ -1555,7 +1555,7 @@ static void csi_at(struct vc_data *vc, unsigned int nr) insert_char(vc, nr); } -/* console_sem is held */ +/* console_lock is held */ static void csi_L(struct vc_data *vc, unsigned int nr) { if (nr > vc->vc_rows - vc->vc_y) @@ -1566,7 +1566,7 @@ static void csi_L(struct vc_data *vc, unsigned int nr) vc->vc_need_wrap = 0; } -/* console_sem is held */ +/* console_lock is held */ static void csi_P(struct vc_data *vc, unsigned int nr) { if (nr > vc->vc_cols - vc->vc_x) @@ -1576,7 +1576,7 @@ static void csi_P(struct vc_data *vc, unsigned int nr) delete_char(vc, nr); } -/* console_sem is held */ +/* console_lock is held */ static void csi_M(struct vc_data *vc, unsigned int nr) { if (nr > vc->vc_rows - vc->vc_y) @@ -1587,7 +1587,7 @@ static void csi_M(struct vc_data *vc, unsigned int nr) vc->vc_need_wrap = 0; } -/* console_sem is held (except via vc_init->reset_terminal */ +/* console_lock is held (except via vc_init->reset_terminal */ static void save_cur(struct vc_data *vc) { vc->vc_saved_x = vc->vc_x; @@ -1603,7 +1603,7 @@ static void save_cur(struct vc_data *vc) vc->vc_saved_G1 = vc->vc_G1_charset; } -/* console_sem is held */ +/* console_lock is held */ static void restore_cur(struct vc_data *vc) { gotoxy(vc, vc->vc_saved_x, vc->vc_saved_y); @@ -1625,7 +1625,7 @@ enum { ESnormal, ESesc, ESsquare, ESgetpars, ESgotpars, ESfunckey, EShash, ESsetG0, ESsetG1, ESpercent, ESignore, ESnonstd, ESpalette }; -/* console_sem is held (except via vc_init()) */ +/* console_lock is held (except via vc_init()) */ static void reset_terminal(struct vc_data *vc, int do_clear) { vc->vc_top = 0; @@ -1685,7 +1685,7 @@ static void reset_terminal(struct vc_data *vc, int do_clear) csi_J(vc, 2); } -/* console_sem is held */ +/* console_lock is held */ static void do_con_trol(struct tty_struct *tty, struct vc_data *vc, int c) { /* @@ -2119,7 +2119,7 @@ static int is_double_width(uint32_t ucs) return bisearch(ucs, double_width, ARRAY_SIZE(double_width) - 1); } -/* acquires console_sem */ +/* acquires console_lock */ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int count) { #ifdef VT_BUF_VRAM_ONLY @@ -2147,11 +2147,11 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co might_sleep(); - acquire_console_sem(); + console_lock(); vc = tty->driver_data; if (vc == NULL) { printk(KERN_ERR "vt: argh, driver_data is NULL !\n"); - release_console_sem(); + console_unlock(); return 0; } @@ -2159,7 +2159,7 @@ static int do_con_write(struct tty_struct *tty, const unsigned char *buf, int co if (!vc_cons_allocated(currcons)) { /* could this happen? */ printk_once("con_write: tty %d not allocated\n", currcons+1); - release_console_sem(); + console_unlock(); return 0; } @@ -2375,7 +2375,7 @@ rescan_last_byte: } FLUSH console_conditional_schedule(); - release_console_sem(); + console_unlock(); notify_update(vc); return n; #undef FLUSH @@ -2388,11 +2388,11 @@ rescan_last_byte: * us to do the switches asynchronously (needed when we want * to switch due to a keyboard interrupt). Synchronization * with other console code and prevention of re-entrancy is - * ensured with console_sem. + * ensured with console_lock. */ static void console_callback(struct work_struct *ignored) { - acquire_console_sem(); + console_lock(); if (want_console >= 0) { if (want_console != fg_console && @@ -2422,7 +2422,7 @@ static void console_callback(struct work_struct *ignored) } notify_update(vc_cons[fg_console].d); - release_console_sem(); + console_unlock(); } int set_console(int nr) @@ -2603,7 +2603,7 @@ static struct console vt_console_driver = { */ /* - * Generally a bit racy with respect to console_sem(). + * Generally a bit racy with respect to console_lock();. * * There are some functions which don't need it. * @@ -2629,17 +2629,17 @@ int tioclinux(struct tty_struct *tty, unsigned long arg) switch (type) { case TIOCL_SETSEL: - acquire_console_sem(); + console_lock(); ret = set_selection((struct tiocl_selection __user *)(p+1), tty); - release_console_sem(); + console_unlock(); break; case TIOCL_PASTESEL: ret = paste_selection(tty); break; case TIOCL_UNBLANKSCREEN: - acquire_console_sem(); + console_lock(); unblank_screen(); - release_console_sem(); + console_unlock(); break; case TIOCL_SELLOADLUT: ret = sel_loadlut(p); @@ -2688,10 +2688,10 @@ int tioclinux(struct tty_struct *tty, unsigned long arg) } break; case TIOCL_BLANKSCREEN: /* until explicitly unblanked, not only poked */ - acquire_console_sem(); + console_lock(); ignore_poke = 1; do_blank_screen(0); - release_console_sem(); + console_unlock(); break; case TIOCL_BLANKEDSCREEN: ret = console_blanked; @@ -2790,11 +2790,11 @@ static void con_flush_chars(struct tty_struct *tty) return; /* if we race with con_close(), vt may be null */ - acquire_console_sem(); + console_lock(); vc = tty->driver_data; if (vc) set_cursor(vc); - release_console_sem(); + console_unlock(); } /* @@ -2805,7 +2805,7 @@ static int con_open(struct tty_struct *tty, struct file *filp) unsigned int currcons = tty->index; int ret = 0; - acquire_console_sem(); + console_lock(); if (tty->driver_data == NULL) { ret = vc_allocate(currcons); if (ret == 0) { @@ -2813,7 +2813,7 @@ static int con_open(struct tty_struct *tty, struct file *filp) /* Still being freed */ if (vc->port.tty) { - release_console_sem(); + console_unlock(); return -ERESTARTSYS; } tty->driver_data = vc; @@ -2827,11 +2827,11 @@ static int con_open(struct tty_struct *tty, struct file *filp) tty->termios->c_iflag |= IUTF8; else tty->termios->c_iflag &= ~IUTF8; - release_console_sem(); + console_unlock(); return ret; } } - release_console_sem(); + console_unlock(); return ret; } @@ -2844,9 +2844,9 @@ static void con_shutdown(struct tty_struct *tty) { struct vc_data *vc = tty->driver_data; BUG_ON(vc == NULL); - acquire_console_sem(); + console_lock(); vc->port.tty = NULL; - release_console_sem(); + console_unlock(); tty_shutdown(tty); } @@ -2893,13 +2893,13 @@ static int __init con_init(void) struct vc_data *vc; unsigned int currcons = 0, i; - acquire_console_sem(); + console_lock(); if (conswitchp) display_desc = conswitchp->con_startup(); if (!display_desc) { fg_console = 0; - release_console_sem(); + console_unlock(); return 0; } @@ -2946,7 +2946,7 @@ static int __init con_init(void) printable = 1; printk("\n"); - release_console_sem(); + console_unlock(); #ifdef CONFIG_VT_CONSOLE register_console(&vt_console_driver); @@ -3037,7 +3037,7 @@ static int bind_con_driver(const struct consw *csw, int first, int last, if (!try_module_get(owner)) return -ENODEV; - acquire_console_sem(); + console_lock(); /* check if driver is registered */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { @@ -3122,7 +3122,7 @@ static int bind_con_driver(const struct consw *csw, int first, int last, retval = 0; err: - release_console_sem(); + console_unlock(); module_put(owner); return retval; }; @@ -3171,7 +3171,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt) if (!try_module_get(owner)) return -ENODEV; - acquire_console_sem(); + console_lock(); /* check if driver is registered and if it is unbindable */ for (i = 0; i < MAX_NR_CON_DRIVER; i++) { @@ -3185,7 +3185,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt) } if (retval) { - release_console_sem(); + console_unlock(); goto err; } @@ -3204,12 +3204,12 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt) } if (retval) { - release_console_sem(); + console_unlock(); goto err; } if (!con_is_bound(csw)) { - release_console_sem(); + console_unlock(); goto err; } @@ -3238,7 +3238,7 @@ int unbind_con_driver(const struct consw *csw, int first, int last, int deflt) if (!con_is_bound(csw)) con_driver->flag &= ~CON_DRIVER_FLAG_INIT; - release_console_sem(); + console_unlock(); /* ignore return value, binding should not fail */ bind_con_driver(defcsw, first, last, deflt); err: @@ -3538,7 +3538,7 @@ int register_con_driver(const struct consw *csw, int first, int last) if (!try_module_get(owner)) return -ENODEV; - acquire_console_sem(); + console_lock(); for (i = 0; i < MAX_NR_CON_DRIVER; i++) { con_driver = ®istered_con_driver[i]; @@ -3592,7 +3592,7 @@ int register_con_driver(const struct consw *csw, int first, int last) } err: - release_console_sem(); + console_unlock(); module_put(owner); return retval; } @@ -3613,7 +3613,7 @@ int unregister_con_driver(const struct consw *csw) { int i, retval = -ENODEV; - acquire_console_sem(); + console_lock(); /* cannot unregister a bound driver */ if (con_is_bound(csw)) @@ -3639,7 +3639,7 @@ int unregister_con_driver(const struct consw *csw) } } err: - release_console_sem(); + console_unlock(); return retval; } EXPORT_SYMBOL(unregister_con_driver); @@ -3934,9 +3934,9 @@ int con_set_cmap(unsigned char __user *arg) { int rc; - acquire_console_sem(); + console_lock(); rc = set_get_cmap (arg,1); - release_console_sem(); + console_unlock(); return rc; } @@ -3945,9 +3945,9 @@ int con_get_cmap(unsigned char __user *arg) { int rc; - acquire_console_sem(); + console_lock(); rc = set_get_cmap (arg,0); - release_console_sem(); + console_unlock(); return rc; } @@ -3994,12 +3994,12 @@ static int con_font_get(struct vc_data *vc, struct console_font_op *op) } else font.data = NULL; - acquire_console_sem(); + console_lock(); if (vc->vc_sw->con_font_get) rc = vc->vc_sw->con_font_get(vc, &font); else rc = -ENOSYS; - release_console_sem(); + console_unlock(); if (rc) goto out; @@ -4076,12 +4076,12 @@ static int con_font_set(struct vc_data *vc, struct console_font_op *op) font.data = memdup_user(op->data, size); if (IS_ERR(font.data)) return PTR_ERR(font.data); - acquire_console_sem(); + console_lock(); if (vc->vc_sw->con_font_set) rc = vc->vc_sw->con_font_set(vc, &font, op->flags); else rc = -ENOSYS; - release_console_sem(); + console_unlock(); kfree(font.data); return rc; } @@ -4103,12 +4103,12 @@ static int con_font_default(struct vc_data *vc, struct console_font_op *op) else name[MAX_FONT_NAME - 1] = 0; - acquire_console_sem(); + console_lock(); if (vc->vc_sw->con_font_default) rc = vc->vc_sw->con_font_default(vc, &font, s); else rc = -ENOSYS; - release_console_sem(); + console_unlock(); if (!rc) { op->width = font.width; op->height = font.height; @@ -4124,7 +4124,7 @@ static int con_font_copy(struct vc_data *vc, struct console_font_op *op) if (vc->vc_mode != KD_TEXT) return -EINVAL; - acquire_console_sem(); + console_lock(); if (!vc->vc_sw->con_font_copy) rc = -ENOSYS; else if (con < 0 || !vc_cons_allocated(con)) @@ -4133,7 +4133,7 @@ static int con_font_copy(struct vc_data *vc, struct console_font_op *op) rc = 0; else rc = vc->vc_sw->con_font_copy(vc, con); - release_console_sem(); + console_unlock(); return rc; } diff --git a/drivers/tty/vt/vt_ioctl.c b/drivers/tty/vt/vt_ioctl.c index 6b68a0fb4611..1235ebda6e1c 100644 --- a/drivers/tty/vt/vt_ioctl.c +++ b/drivers/tty/vt/vt_ioctl.c @@ -649,12 +649,12 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, /* * explicitly blank/unblank the screen if switching modes */ - acquire_console_sem(); + console_lock(); if (arg == KD_TEXT) do_unblank_screen(1); else do_blank_screen(1); - release_console_sem(); + console_unlock(); break; case KDGETMODE: @@ -893,7 +893,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, ret = -EINVAL; goto out; } - acquire_console_sem(); + console_lock(); vc->vt_mode = tmp; /* the frsig is ignored, so we set it to 0 */ vc->vt_mode.frsig = 0; @@ -901,7 +901,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, vc->vt_pid = get_pid(task_pid(current)); /* no switch is required -- saw@shade.msu.ru */ vc->vt_newvt = -1; - release_console_sem(); + console_unlock(); break; } @@ -910,9 +910,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, struct vt_mode tmp; int rc; - acquire_console_sem(); + console_lock(); memcpy(&tmp, &vc->vt_mode, sizeof(struct vt_mode)); - release_console_sem(); + console_unlock(); rc = copy_to_user(up, &tmp, sizeof(struct vt_mode)); if (rc) @@ -965,9 +965,9 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, ret = -ENXIO; else { arg--; - acquire_console_sem(); + console_lock(); ret = vc_allocate(arg); - release_console_sem(); + console_unlock(); if (ret) break; set_console(arg); @@ -990,7 +990,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, ret = -ENXIO; else { vsa.console--; - acquire_console_sem(); + console_lock(); ret = vc_allocate(vsa.console); if (ret == 0) { struct vc_data *nvc; @@ -1003,7 +1003,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, put_pid(nvc->vt_pid); nvc->vt_pid = get_pid(task_pid(current)); } - release_console_sem(); + console_unlock(); if (ret) break; /* Commence switch and lock */ @@ -1044,7 +1044,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, /* * Switching-from response */ - acquire_console_sem(); + console_lock(); if (vc->vt_newvt >= 0) { if (arg == 0) /* @@ -1063,7 +1063,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, vc->vt_newvt = -1; ret = vc_allocate(newvt); if (ret) { - release_console_sem(); + console_unlock(); break; } /* @@ -1083,7 +1083,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, if (arg != VT_ACKACQ) ret = -EINVAL; } - release_console_sem(); + console_unlock(); break; /* @@ -1096,20 +1096,20 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, } if (arg == 0) { /* deallocate all unused consoles, but leave 0 */ - acquire_console_sem(); + console_lock(); for (i=1; iv_cols)) ret = -EFAULT; else { - acquire_console_sem(); + console_lock(); for (i = 0; i < MAX_NR_CONSOLES; i++) { vc = vc_cons[i].d; @@ -1135,7 +1135,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, vc_resize(vc_cons[i].d, cc, ll); } } - release_console_sem(); + console_unlock(); } break; } @@ -1187,14 +1187,14 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, for (i = 0; i < MAX_NR_CONSOLES; i++) { if (!vc_cons[i].d) continue; - acquire_console_sem(); + console_lock(); if (vlin) vc_cons[i].d->vc_scan_lines = vlin; if (clin) vc_cons[i].d->vc_font.height = clin; vc_cons[i].d->vc_resize_user = 1; vc_resize(vc_cons[i].d, cc, ll); - release_console_sem(); + console_unlock(); } break; } @@ -1367,7 +1367,7 @@ void vc_SAK(struct work_struct *work) struct vc_data *vc; struct tty_struct *tty; - acquire_console_sem(); + console_lock(); vc = vc_con->d; if (vc) { tty = vc->port.tty; @@ -1379,7 +1379,7 @@ void vc_SAK(struct work_struct *work) __do_SAK(tty); reset_vc(vc); } - release_console_sem(); + console_unlock(); } #ifdef CONFIG_COMPAT @@ -1737,10 +1737,10 @@ int vt_move_to_console(unsigned int vt, int alloc) { int prev; - acquire_console_sem(); + console_lock(); /* Graphics mode - up to X */ if (disable_vt_switch) { - release_console_sem(); + console_unlock(); return 0; } prev = fg_console; @@ -1748,7 +1748,7 @@ int vt_move_to_console(unsigned int vt, int alloc) if (alloc && vc_allocate(vt)) { /* we can't have a free VC for now. Too bad, * we don't want to mess the screen for now. */ - release_console_sem(); + console_unlock(); return -ENOSPC; } @@ -1758,10 +1758,10 @@ int vt_move_to_console(unsigned int vt, int alloc) * Let the calling function know so it can decide * what to do. */ - release_console_sem(); + console_unlock(); return -EIO; } - release_console_sem(); + console_unlock(); tty_lock(); if (vt_waitactive(vt + 1)) { pr_debug("Suspend: Can't switch VCs."); @@ -1781,8 +1781,8 @@ int vt_move_to_console(unsigned int vt, int alloc) */ void pm_set_vt_switch(int do_switch) { - acquire_console_sem(); + console_lock(); disable_vt_switch = !do_switch; - release_console_sem(); + console_unlock(); } EXPORT_SYMBOL(pm_set_vt_switch); diff --git a/drivers/video/arkfb.c b/drivers/video/arkfb.c index d583bea608fd..391ac939f011 100644 --- a/drivers/video/arkfb.c +++ b/drivers/video/arkfb.c @@ -23,7 +23,7 @@ #include #include #include -#include /* Why should fb driver call console functions? because acquire_console_sem() */ +#include /* Why should fb driver call console functions? because console_lock() */ #include