From 684999149002dd046269666a390458e0acb38280 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 6 Feb 2009 13:52:43 -0700 Subject: Rename struct file->f_ep_lock This lock moves out of the CONFIG_EPOLL ifdef and becomes f_lock. For now, epoll remains the only user, but a future patch will use it to protect f_flags as well. Cc: Davide Libenzi Reviewed-by: Christoph Hellwig Signed-off-by: Jonathan Corbet --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 92734c0012e6..2011600d12c7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -848,6 +848,7 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; + spinlock_t f_lock; /* f_ep_links */ atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -866,7 +867,6 @@ struct file { #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; - spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; #ifdef CONFIG_DEBUG_WRITECOUNT -- cgit v1.3-8-gc7d7 From db1dd4d376134eba0e08af523b61cc566a4ea1cd Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 6 Feb 2009 15:25:24 -0700 Subject: Use f_lock to protect f_flags Traditionally, changes to struct file->f_flags have been done under BKL protection, or with no protection at all. This patch causes all f_flags changes after file open/creation time to be done under protection of f_lock. This allows the removal of some BKL usage and fixes a number of longstanding (if microscopic) races. Reviewed-by: Christoph Hellwig Cc: Al Viro Signed-off-by: Jonathan Corbet --- drivers/char/tty_io.c | 5 ++--- drivers/usb/gadget/file_storage.c | 7 ++++++- fs/fcntl.c | 2 ++ fs/ioctl.c | 7 ++++--- fs/nfsd/vfs.c | 5 ++++- include/linux/fs.h | 2 +- ipc/mqueue.c | 2 ++ sound/core/oss/pcm_oss.c | 2 ++ sound/oss/au1550_ac97.c | 2 ++ sound/oss/audio.c | 2 ++ sound/oss/sh_dac_audio.c | 2 ++ sound/oss/swarm_cs4297a.c | 2 ++ sound/oss/vwsnd.c | 2 ++ 13 files changed, 33 insertions(+), 9 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index bc84e125c6bc..224f271d8cbe 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -2162,13 +2162,12 @@ static int fionbio(struct file *file, int __user *p) if (get_user(nonblock, p)) return -EFAULT; - /* file->f_flags is still BKL protected in the fs layer - vomit */ - lock_kernel(); + spin_lock(&file->f_lock); if (nonblock) file->f_flags |= O_NONBLOCK; else file->f_flags &= ~O_NONBLOCK; - unlock_kernel(); + spin_unlock(&file->f_lock); return 0; } diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c index 1ab9dac7e12d..33bb76cef33c 100644 --- a/drivers/usb/gadget/file_storage.c +++ b/drivers/usb/gadget/file_storage.c @@ -1711,7 +1711,9 @@ static int do_write(struct fsg_dev *fsg) curlun->sense_data = SS_WRITE_PROTECTED; return -EINVAL; } + spin_lock(&curlun->filp->f_lock); curlun->filp->f_flags &= ~O_SYNC; // Default is not to wait + spin_unlock(&curlun->filp->f_lock); /* Get the starting Logical Block Address and check that it's * not too big */ @@ -1728,8 +1730,11 @@ static int do_write(struct fsg_dev *fsg) curlun->sense_data = SS_INVALID_FIELD_IN_CDB; return -EINVAL; } - if (fsg->cmnd[1] & 0x08) // FUA + if (fsg->cmnd[1] & 0x08) { // FUA + spin_lock(&curlun->filp->f_lock); curlun->filp->f_flags |= O_SYNC; + spin_unlock(&curlun->filp->f_lock); + } } if (lba >= curlun->num_sectors) { curlun->sense_data = SS_LOGICAL_BLOCK_ADDRESS_OUT_OF_RANGE; diff --git a/fs/fcntl.c b/fs/fcntl.c index bd215cc791da..04df8570a2d2 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -189,7 +189,9 @@ static int setfl(int fd, struct file * filp, unsigned long arg) } } + spin_lock(&filp->f_lock); filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK); + spin_unlock(&filp->f_lock); out: unlock_kernel(); return error; diff --git a/fs/ioctl.c b/fs/ioctl.c index 240ec63984cb..421aab465dab 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -404,10 +404,12 @@ static int ioctl_fionbio(struct file *filp, int __user *argp) if (O_NONBLOCK != O_NDELAY) flag |= O_NDELAY; #endif + spin_lock(&filp->f_lock); if (on) filp->f_flags |= flag; else filp->f_flags &= ~flag; + spin_unlock(&filp->f_lock); return error; } @@ -432,10 +434,12 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp, if (error) return error; + spin_lock(&filp->f_lock); if (on) filp->f_flags |= FASYNC; else filp->f_flags &= ~FASYNC; + spin_unlock(&filp->f_lock); return error; } @@ -499,10 +503,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, break; case FIONBIO: - /* BKL needed to avoid races tweaking f_flags */ - lock_kernel(); error = ioctl_fionbio(filp, argp); - unlock_kernel(); break; case FIOASYNC: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6e50aaa56ca2..c165a6403df0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -998,8 +998,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (!EX_ISSYNC(exp)) stable = 0; - if (stable && !EX_WGATHER(exp)) + if (stable && !EX_WGATHER(exp)) { + spin_lock(&file->f_lock); file->f_flags |= O_SYNC; + spin_unlock(&file->f_lock); + } /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2011600d12c7..7428c6d35e65 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -848,7 +848,7 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - spinlock_t f_lock; /* f_ep_links */ + spinlock_t f_lock; /* f_ep_links, f_flags */ atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 54b4077fed79..a8ddadbc7459 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -1156,10 +1156,12 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes, omqstat.mq_flags = filp->f_flags & O_NONBLOCK; if (u_mqstat) { audit_mq_getsetattr(mqdes, &mqstat); + spin_lock(&filp->f_lock); if (mqstat.mq_flags & O_NONBLOCK) filp->f_flags |= O_NONBLOCK; else filp->f_flags &= ~O_NONBLOCK; + spin_unlock(&filp->f_lock); inode->i_atime = inode->i_ctime = CURRENT_TIME; } diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index 0a1798eafb0b..d4460f18e76c 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1895,7 +1895,9 @@ static int snd_pcm_oss_set_fragment(struct snd_pcm_oss_file *pcm_oss_file, unsig static int snd_pcm_oss_nonblock(struct file * file) { + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; } diff --git a/sound/oss/au1550_ac97.c b/sound/oss/au1550_ac97.c index 81e1f443d094..4191acccbcdb 100644 --- a/sound/oss/au1550_ac97.c +++ b/sound/oss/au1550_ac97.c @@ -1627,7 +1627,9 @@ au1550_ioctl(struct inode *inode, struct file *file, unsigned int cmd, sizeof(abinfo)) ? -EFAULT : 0; case SNDCTL_DSP_NONBLOCK: + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_GETODELAY: diff --git a/sound/oss/audio.c b/sound/oss/audio.c index 89bd27a5e865..b69c05b7ea7b 100644 --- a/sound/oss/audio.c +++ b/sound/oss/audio.c @@ -433,7 +433,9 @@ int audio_ioctl(int dev, struct file *file, unsigned int cmd, void __user *arg) return dma_ioctl(dev, cmd, arg); case SNDCTL_DSP_NONBLOCK: + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_GETCAPS: diff --git a/sound/oss/sh_dac_audio.c b/sound/oss/sh_dac_audio.c index e5d423994918..78cfb66e4c59 100644 --- a/sound/oss/sh_dac_audio.c +++ b/sound/oss/sh_dac_audio.c @@ -135,7 +135,9 @@ static int dac_audio_ioctl(struct inode *inode, struct file *file, return put_user(AFMT_U8, (int *)arg); case SNDCTL_DSP_NONBLOCK: + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_GETCAPS: diff --git a/sound/oss/swarm_cs4297a.c b/sound/oss/swarm_cs4297a.c index 41562ecde5bb..1edab7b4ea83 100644 --- a/sound/oss/swarm_cs4297a.c +++ b/sound/oss/swarm_cs4297a.c @@ -2200,7 +2200,9 @@ static int cs4297a_ioctl(struct inode *inode, struct file *file, sizeof(abinfo)) ? -EFAULT : 0; case SNDCTL_DSP_NONBLOCK: + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_GETODELAY: diff --git a/sound/oss/vwsnd.c b/sound/oss/vwsnd.c index 78b8acc7c3b9..187f72750e8f 100644 --- a/sound/oss/vwsnd.c +++ b/sound/oss/vwsnd.c @@ -2673,7 +2673,9 @@ static int vwsnd_audio_do_ioctl(struct inode *inode, case SNDCTL_DSP_NONBLOCK: /* _SIO ('P',14) */ DBGX("SNDCTL_DSP_NONBLOCK\n"); + spin_lock(&file->f_lock); file->f_flags |= O_NONBLOCK; + spin_unlock(&file->f_lock); return 0; case SNDCTL_DSP_RESET: /* _SIO ('P', 0) */ -- cgit v1.3-8-gc7d7 From d0adde574b8487ef30f69e2d08bba769e4be513f Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 26 Mar 2009 17:49:56 +0000 Subject: Add a strictatime mount option Add support for explicitly requesting full atime updates. This makes it possible for kernels to default to relatime but still allow userspace to override it. Signed-off-by: Matthew Garrett Signed-off-by: Linus Torvalds --- fs/namespace.c | 6 +++++- include/linux/fs.h | 1 + include/linux/mount.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/fs/namespace.c b/fs/namespace.c index 06f8e63f6cb1..d0659ec291c9 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -780,6 +780,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) { MNT_NOATIME, ",noatime" }, { MNT_NODIRATIME, ",nodiratime" }, { MNT_RELATIME, ",relatime" }, + { MNT_STRICTATIME, ",strictatime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; @@ -1932,11 +1933,14 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NODIRATIME; if (flags & MS_RELATIME) mnt_flags |= MNT_RELATIME; + if (flags & MS_STRICTATIME) + mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | - MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); + MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | + MS_STRICTATIME); /* ... and get the mountpoint */ retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); diff --git a/include/linux/fs.h b/include/linux/fs.h index 92734c0012e6..5bc81c4a98c1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -141,6 +141,7 @@ struct inodes_stat_t { #define MS_RELATIME (1<<21) /* Update atime relative to mtime/ctime. */ #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ +#define MS_STRICTATIME (1<<24) /* Always perform atime updates */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) diff --git a/include/linux/mount.h b/include/linux/mount.h index cab2a85e2ee8..51f55f903aff 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -27,6 +27,7 @@ struct mnt_namespace; #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 #define MNT_READONLY 0x40 /* does the user want this to be r/o? */ +#define MNT_STRICTATIME 0x80 #define MNT_SHRINKABLE 0x100 #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ -- cgit v1.3-8-gc7d7 From 10f303ae1e5e77a9f7cb053e6329906afb132c67 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Wed, 14 Jan 2009 17:01:33 +0800 Subject: do_pipe cleanup: drop its last user in arch/alpha/ The last user of do_pipe is in arch/alpha/, after replacing it with do_pipe_flags, the do_pipe can be totally dropped. Signed-off-by: Cheng Renquan Acked-by: Richard Henderson Signed-off-by: Al Viro --- arch/alpha/kernel/entry.S | 3 ++- arch/alpha/kernel/osf_sys.c | 2 -- fs/pipe.c | 5 ----- include/linux/fs.h | 1 - 4 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux/fs.h') diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index e4a54b615894..b45d913a51c3 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -903,8 +903,9 @@ sys_alpha_pipe: stq $26, 0($sp) .prologue 0 + mov $31, $17 lda $16, 8($sp) - jsr $26, do_pipe + jsr $26, do_pipe_flags ldq $26, 0($sp) bne $0, 1f diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index ae41f097864b..42ee05981e71 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -46,8 +46,6 @@ #include #include -extern int do_pipe(int *); - /* * Brk needs to return an error. Still support Linux's brk(0) query idiom, * which OSF programs just shouldn't be doing. We're still not quite diff --git a/fs/pipe.c b/fs/pipe.c index 14f502b89cf5..df3719562fc1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1034,11 +1034,6 @@ int do_pipe_flags(int *fd, int flags) return error; } -int do_pipe(int *fd) -{ - return do_pipe_flags(fd, 0); -} - /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. diff --git a/include/linux/fs.h b/include/linux/fs.h index 92734c0012e6..51de83bd8a87 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1881,7 +1881,6 @@ static inline void allow_write_access(struct file *file) if (file) atomic_inc(&file->f_path.dentry->d_inode->i_writecount); } -extern int do_pipe(int *); extern int do_pipe_flags(int *, int); extern struct file *create_read_pipe(struct file *f, int flags); extern struct file *create_write_pipe(int flags); -- cgit v1.3-8-gc7d7 From c2aca5e529a2499d454c41e01f59f1d5fe4a1364 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 20 Jan 2009 10:29:45 +0000 Subject: vfs: Update fs.h to use inline functions when no file locking set This avoids various issues which might give rise to compiler warnings about missing functions and/or unused variable with the previous macros. This also fixes a bug where one of the macros was returning 0, but it should have been void. Reported-by: Randy Dunlap Signed-off-by: Steven Whitehouse Tested-by: Randy Dunlap Signed-off-by: Al Viro --- include/linux/fs.h | 165 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 139 insertions(+), 26 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 51de83bd8a87..d84020b7e676 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1063,34 +1063,147 @@ extern int lease_modify(struct file_lock **, int); extern int lock_may_read(struct inode *, loff_t start, unsigned long count); extern int lock_may_write(struct inode *, loff_t start, unsigned long count); #else /* !CONFIG_FILE_LOCKING */ -#define fcntl_getlk(a, b) ({ -EINVAL; }) -#define fcntl_setlk(a, b, c, d) ({ -EACCES; }) +static inline int fcntl_getlk(struct file *file, struct flock __user *user) +{ + return -EINVAL; +} + +static inline int fcntl_setlk(unsigned int fd, struct file *file, + unsigned int cmd, struct flock __user *user) +{ + return -EACCES; +} + #if BITS_PER_LONG == 32 -#define fcntl_getlk64(a, b) ({ -EINVAL; }) -#define fcntl_setlk64(a, b, c, d) ({ -EACCES; }) +static inline int fcntl_getlk64(struct file *file, struct flock64 __user *user) +{ + return -EINVAL; +} + +static inline int fcntl_setlk64(unsigned int fd, struct file *file, + unsigned int cmd, struct flock64 __user *user) +{ + return -EACCES; +} #endif -#define fcntl_setlease(a, b, c) ({ 0; }) -#define fcntl_getlease(a) ({ 0; }) -#define locks_init_lock(a) ({ }) -#define __locks_copy_lock(a, b) ({ }) -#define locks_copy_lock(a, b) ({ }) -#define locks_remove_posix(a, b) ({ }) -#define locks_remove_flock(a) ({ }) -#define posix_test_lock(a, b) ({ 0; }) -#define posix_lock_file(a, b, c) ({ -ENOLCK; }) -#define posix_lock_file_wait(a, b) ({ -ENOLCK; }) -#define posix_unblock_lock(a, b) (-ENOENT) -#define vfs_test_lock(a, b) ({ 0; }) -#define vfs_lock_file(a, b, c, d) (-ENOLCK) -#define vfs_cancel_lock(a, b) ({ 0; }) -#define flock_lock_file_wait(a, b) ({ -ENOLCK; }) -#define __break_lease(a, b) ({ 0; }) -#define lease_get_mtime(a, b) ({ }) -#define generic_setlease(a, b, c) ({ -EINVAL; }) -#define vfs_setlease(a, b, c) ({ -EINVAL; }) -#define lease_modify(a, b) ({ -EINVAL; }) -#define lock_may_read(a, b, c) ({ 1; }) -#define lock_may_write(a, b, c) ({ 1; }) +static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +{ + return 0; +} + +static inline int fcntl_getlease(struct file *filp) +{ + return 0; +} + +static inline void locks_init_lock(struct file_lock *fl) +{ + return; +} + +static inline void __locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + return; +} + +static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) +{ + return; +} + +static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) +{ + return; +} + +static inline void locks_remove_flock(struct file *filp) +{ + return; +} + +static inline void posix_test_lock(struct file *filp, struct file_lock *fl) +{ + return; +} + +static inline int posix_lock_file(struct file *filp, struct file_lock *fl, + struct file_lock *conflock) +{ + return -ENOLCK; +} + +static inline int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +{ + return -ENOLCK; +} + +static inline int posix_unblock_lock(struct file *filp, + struct file_lock *waiter) +{ + return -ENOENT; +} + +static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) +{ + return 0; +} + +static inline int vfs_lock_file(struct file *filp, unsigned int cmd, + struct file_lock *fl, struct file_lock *conf) +{ + return -ENOLCK; +} + +static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) +{ + return 0; +} + +static inline int flock_lock_file_wait(struct file *filp, + struct file_lock *request) +{ + return -ENOLCK; +} + +static inline int __break_lease(struct inode *inode, unsigned int mode) +{ + return 0; +} + +static inline void lease_get_mtime(struct inode *inode, struct timespec *time) +{ + return; +} + +static inline int generic_setlease(struct file *filp, long arg, + struct file_lock **flp) +{ + return -EINVAL; +} + +static inline int vfs_setlease(struct file *filp, long arg, + struct file_lock **lease) +{ + return -EINVAL; +} + +static inline int lease_modify(struct file_lock **before, int arg) +{ + return -EINVAL; +} + +static inline int lock_may_read(struct inode *inode, loff_t start, + unsigned long len) +{ + return 1; +} + +static inline int lock_may_write(struct inode *inode, loff_t start, + unsigned long len) +{ + return 1; +} + #endif /* !CONFIG_FILE_LOCKING */ -- cgit v1.3-8-gc7d7 From af5df56688acfb75c1b15b4e000ec5e82a9cdc29 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 20 Jan 2009 10:29:46 +0000 Subject: vfs: Further changes from macro to inline function in fs.h There is a second set of macros for when CONFIG_FILE_LOCKING is not set. This patch updates those to become inline functions as well. Signed-off-by: Steven Whitehouse Signed-off-by: Al Viro --- include/linux/fs.h | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index d84020b7e676..5f74d616cd7d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1800,13 +1800,44 @@ static inline int break_lease(struct inode *inode, unsigned int mode) return 0; } #else /* !CONFIG_FILE_LOCKING */ -#define locks_mandatory_locked(a) ({ 0; }) -#define locks_mandatory_area(a, b, c, d, e) ({ 0; }) -#define __mandatory_lock(a) ({ 0; }) -#define mandatory_lock(a) ({ 0; }) -#define locks_verify_locked(a) ({ 0; }) -#define locks_verify_truncate(a, b, c) ({ 0; }) -#define break_lease(a, b) ({ 0; }) +static inline int locks_mandatory_locked(struct inode *inode) +{ + return 0; +} + +static inline int locks_mandatory_area(int rw, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) +{ + return 0; +} + +static inline int __mandatory_lock(struct inode *inode) +{ + return 0; +} + +static inline int mandatory_lock(struct inode *inode) +{ + return 0; +} + +static inline int locks_verify_locked(struct inode *inode) +{ + return 0; +} + +static inline int locks_verify_truncate(struct inode *inode, struct file *filp, + size_t size) +{ + return 0; +} + +static inline int break_lease(struct inode *inode, unsigned int mode) +{ + return 0; +} + #endif /* CONFIG_FILE_LOCKING */ /* fs/open.c */ -- cgit v1.3-8-gc7d7 From 585d3bc06f4ca57f975a5a1f698f65a45ea66225 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 25 Feb 2009 10:44:19 +0100 Subject: fs: move bdev code out of buffer.c Move some block device related code out from buffer.c and put it in block_dev.c. I'm trying to move non-buffer_head code out of buffer.c Signed-off-by: Al Viro --- fs/block_dev.c | 146 ++++++++++++++++++++++++++++++++++++++++++++ fs/buffer.c | 145 ------------------------------------------- include/linux/buffer_head.h | 7 --- include/linux/fs.h | 7 +++ 4 files changed, 153 insertions(+), 152 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/block_dev.c b/fs/block_dev.c index b3c1efff5e1d..8c3c6899ccf3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -174,6 +175,151 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, iov, offset, nr_segs, blkdev_get_blocks, NULL); } +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + int ret = 0; + + if (bdev) + ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); + return ret; +} +EXPORT_SYMBOL(sync_blockdev); + +/* + * Write out and wait upon all dirty data associated with this + * device. Filesystem data as well as the underlying block + * device. Takes the superblock lock. + */ +int fsync_bdev(struct block_device *bdev) +{ + struct super_block *sb = get_super(bdev); + if (sb) { + int res = fsync_super(sb); + drop_super(sb); + return res; + } + return sync_blockdev(bdev); +} + +/** + * freeze_bdev -- lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * This takes the block device bd_mount_sem to make sure no new mounts + * happen on bdev until thaw_bdev() is called. + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in freeze_bdev() and + * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * actually. + */ +struct super_block *freeze_bdev(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + bdev->bd_fsfreeze_count++; + sb = get_super(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return sb; + } + bdev->bd_fsfreeze_count++; + + down(&bdev->bd_mount_sem); + sb = get_super(bdev); + if (sb && !(sb->s_flags & MS_RDONLY)) { + sb->s_frozen = SB_FREEZE_WRITE; + smp_wmb(); + + __fsync_super(sb); + + sb->s_frozen = SB_FREEZE_TRANS; + smp_wmb(); + + sync_blockdev(sb->s_bdev); + + if (sb->s_op->freeze_fs) { + error = sb->s_op->freeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem freeze failed\n"); + sb->s_frozen = SB_UNFROZEN; + drop_super(sb); + up(&bdev->bd_mount_sem); + bdev->bd_fsfreeze_count--; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return ERR_PTR(error); + } + } + } + + sync_blockdev(bdev); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + + return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ +} +EXPORT_SYMBOL(freeze_bdev); + +/** + * thaw_bdev -- unlock filesystem + * @bdev: blockdevice to unlock + * @sb: associated superblock + * + * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + */ +int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (!bdev->bd_fsfreeze_count) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return -EINVAL; + } + + bdev->bd_fsfreeze_count--; + if (bdev->bd_fsfreeze_count > 0) { + if (sb) + drop_super(sb); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + + if (sb) { + BUG_ON(sb->s_bdev != bdev); + if (!(sb->s_flags & MS_RDONLY)) { + if (sb->s_op->unfreeze_fs) { + error = sb->s_op->unfreeze_fs(sb); + if (error) { + printk(KERN_ERR + "VFS:Filesystem thaw failed\n"); + sb->s_frozen = SB_FREEZE_TRANS; + bdev->bd_fsfreeze_count++; + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; + } + } + sb->s_frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_wait_unfrozen); + } + drop_super(sb); + } + + up(&bdev->bd_mount_sem); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; +} +EXPORT_SYMBOL(thaw_bdev); + static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); diff --git a/fs/buffer.c b/fs/buffer.c index 891e1c78e4f1..a2fd743d97cb 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -165,151 +165,6 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) put_bh(bh); } -/* - * Write out and wait upon all the dirty data associated with a block - * device via its mapping. Does not take the superblock lock. - */ -int sync_blockdev(struct block_device *bdev) -{ - int ret = 0; - - if (bdev) - ret = filemap_write_and_wait(bdev->bd_inode->i_mapping); - return ret; -} -EXPORT_SYMBOL(sync_blockdev); - -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. - */ -int fsync_bdev(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - if (sb) { - int res = fsync_super(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -} - -/** - * freeze_bdev -- lock a filesystem and force it into a consistent state - * @bdev: blockdevice to lock - * - * This takes the block device bd_mount_sem to make sure no new mounts - * happen on bdev until thaw_bdev() is called. - * If a superblock is found on this device, we take the s_umount semaphore - * on it to make sure nobody unmounts until the snapshot creation is done. - * The reference counter (bd_fsfreeze_count) guarantees that only the last - * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze - * actually. - */ -struct super_block *freeze_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - bdev->bd_fsfreeze_count++; - sb = get_super(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return sb; - } - bdev->bd_fsfreeze_count++; - - down(&bdev->bd_mount_sem); - sb = get_super(bdev); - if (sb && !(sb->s_flags & MS_RDONLY)) { - sb->s_frozen = SB_FREEZE_WRITE; - smp_wmb(); - - __fsync_super(sb); - - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); - - sync_blockdev(sb->s_bdev); - - if (sb->s_op->freeze_fs) { - error = sb->s_op->freeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; - drop_super(sb); - up(&bdev->bd_mount_sem); - bdev->bd_fsfreeze_count--; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return ERR_PTR(error); - } - } - } - - sync_blockdev(bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - - return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */ -} -EXPORT_SYMBOL(freeze_bdev); - -/** - * thaw_bdev -- unlock filesystem - * @bdev: blockdevice to unlock - * @sb: associated superblock - * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). - */ -int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return -EINVAL; - } - - bdev->bd_fsfreeze_count--; - if (bdev->bd_fsfreeze_count > 0) { - if (sb) - drop_super(sb); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; - } - - if (sb) { - BUG_ON(sb->s_bdev != bdev); - if (!(sb->s_flags & MS_RDONLY)) { - if (sb->s_op->unfreeze_fs) { - error = sb->s_op->unfreeze_fs(sb); - if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; - bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } - } - sb->s_frozen = SB_UNFROZEN; - smp_wmb(); - wake_up(&sb->s_wait_unfrozen); - } - drop_super(sb); - } - - up(&bdev->bd_mount_sem); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; -} -EXPORT_SYMBOL(thaw_bdev); - /* * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index bd7ac793be19..f19fd9045ea0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -165,15 +165,8 @@ int sync_mapping_buffers(struct address_space *mapping); void unmap_underlying_metadata(struct block_device *bdev, sector_t block); void mark_buffer_async_write(struct buffer_head *bh); -void invalidate_bdev(struct block_device *); -int sync_blockdev(struct block_device *bdev); void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); -int fsync_bdev(struct block_device *); -struct super_block *freeze_bdev(struct block_device *); -int thaw_bdev(struct block_device *, struct super_block *); -int fsync_super(struct super_block *); -int fsync_no_super(struct block_device *); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *__getblk(struct block_device *bdev, sector_t block, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5f74d616cd7d..c2c4454a268a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1874,6 +1874,13 @@ extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); extern struct block_device *open_by_devnum(dev_t, fmode_t); +extern void invalidate_bdev(struct block_device *); +extern int sync_blockdev(struct block_device *bdev); +extern struct super_block *freeze_bdev(struct block_device *); +extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); +extern int fsync_bdev(struct block_device *); +extern int fsync_super(struct super_block *); +extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} #endif -- cgit v1.3-8-gc7d7 From a3ec947c85ec339884b30ef6a08133e9311fdae1 Mon Sep 17 00:00:00 2001 From: Sukadev Bhattiprolu Date: Wed, 4 Mar 2009 12:06:34 -0800 Subject: vfs: simple_set_mnt() should return void simple_set_mnt() is defined as returning 'int' but always returns 0. Callers assume simple_set_mnt() never fails and don't properly cleanup if it were to _ever_ fail. For instance, get_sb_single() and get_sb_nodev() should: up_write(sb->s_unmount); deactivate_super(sb); if simple_set_mnt() fails. Since simple_set_mnt() never fails, would be cleaner if it did not return anything. [akpm@linux-foundation.org: fix build] Signed-off-by: Sukadev Bhattiprolu Acked-by: Serge Hallyn Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- drivers/mtd/mtdsuper.c | 7 +++++-- fs/9p/vfs_super.c | 5 +++-- fs/cifs/cifsfs.c | 3 ++- fs/devpts/inode.c | 3 ++- fs/libfs.c | 3 ++- fs/namespace.c | 3 +-- fs/proc/root.c | 3 ++- fs/super.c | 9 ++++++--- fs/ubifs/super.c | 3 ++- include/linux/fs.h | 2 +- kernel/cgroup.c | 3 ++- 11 files changed, 28 insertions(+), 16 deletions(-) (limited to 'include/linux/fs.h') diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index 00d46e137b2a..92285d0089c2 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -81,13 +81,16 @@ static int get_sb_mtd_aux(struct file_system_type *fs_type, int flags, /* go */ sb->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + + return 0; /* new mountpoint for an already mounted superblock */ already_mounted: DEBUG(1, "MTDSB: Device %d (\"%s\") is already mounted\n", mtd->index, mtd->name); - ret = simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + ret = 0; goto out_put; out_error: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 93212e40221a..5f8ab8adb5f5 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -168,8 +168,9 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, p9stat_free(st); kfree(st); -P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n"); - return simple_set_mnt(mnt, sb); +P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); + simple_set_mnt(mnt, sb); + return 0; release_sb: if (sb) { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 13ea53251dcf..38491fd3871d 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -606,7 +606,8 @@ cifs_get_sb(struct file_system_type *fs_type, return rc; } sb->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; } static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 140b43144cd8..b0a76340a4cd 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -454,7 +454,8 @@ static int get_init_pts_sb(struct file_system_type *fs_type, int flags, s->s_flags |= MS_ACTIVE; } do_remount_sb(s, flags, data, 0); - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; } /* diff --git a/fs/libfs.c b/fs/libfs.c index ec600bd33e75..4910a36f516e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -242,7 +242,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; Enomem: up_write(&s->s_umount); diff --git a/fs/namespace.c b/fs/namespace.c index 06f8e63f6cb1..2432ca6bb223 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -397,11 +397,10 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt) spin_unlock(&vfsmount_lock); } -int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) +void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) { mnt->mnt_sb = sb; mnt->mnt_root = dget(sb->s_root); - return 0; } EXPORT_SYMBOL(simple_set_mnt); diff --git a/fs/proc/root.c b/fs/proc/root.c index f6299a25594e..1e15a2b176e8 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -83,7 +83,8 @@ static int proc_get_sb(struct file_system_type *fs_type, ns->proc_mnt = mnt; } - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; } static void proc_kill_sb(struct super_block *sb) diff --git a/fs/super.c b/fs/super.c index 6ce501447ada..e512fab64c93 100644 --- a/fs/super.c +++ b/fs/super.c @@ -831,7 +831,8 @@ int get_sb_bdev(struct file_system_type *fs_type, bdev->bd_super = s; } - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; error_s: error = PTR_ERR(s); @@ -877,7 +878,8 @@ int get_sb_nodev(struct file_system_type *fs_type, return error; } s->s_flags |= MS_ACTIVE; - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; } EXPORT_SYMBOL(get_sb_nodev); @@ -909,7 +911,8 @@ int get_sb_single(struct file_system_type *fs_type, s->s_flags |= MS_ACTIVE; } do_remount_sb(s, flags, data, 0); - return simple_set_mnt(mnt, s); + simple_set_mnt(mnt, s); + return 0; } EXPORT_SYMBOL(get_sb_single); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1182b66a5491..c5c98355459a 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2034,7 +2034,8 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, /* 'fill_super()' opens ubi again so we must close it here */ ubi_close_volume(ubi); - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; out_deact: up_write(&sb->s_umount); diff --git a/include/linux/fs.h b/include/linux/fs.h index c2c4454a268a..a7d73914a9f7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1719,7 +1719,7 @@ struct super_block *sget(struct file_system_type *type, extern int get_sb_pseudo(struct file_system_type *, char *, const struct super_operations *ops, unsigned long, struct vfsmount *mnt); -extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); +extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); int __put_super_and_need_restart(struct super_block *sb); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b01100ebd074..c500ca7239b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1071,7 +1071,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_unlock(&cgroup_mutex); } - return simple_set_mnt(mnt, sb); + simple_set_mnt(mnt, sb); + return 0; free_cg_links: free_cg_links(&tmp_cg_links); -- cgit v1.3-8-gc7d7 From 4a6a4499693a419a20559c41e33a7bd70bf20a6f Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 27 Mar 2009 12:24:31 -0600 Subject: Fix a lockdep warning in fasync_helper() Lockdep gripes if file->f_lock is taken in a no-IRQ situation, since that is not always the case. We don't really want to disable IRQs for every acquisition of f_lock; instead, just move it outside of fasync_lock. Reported-by: Bartlomiej Zolnierkiewicz Reported-by: Larry Finger Reported-by: Wu Fengguang Signed-off-by: Jonathan Corbet --- fs/fcntl.c | 10 +++++++--- include/linux/fs.h | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/fcntl.c b/fs/fcntl.c index d865ca66ccba..cc8e4de2fee5 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -531,6 +531,12 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap if (!new) return -ENOMEM; } + + /* + * We need to take f_lock first since it's not an IRQ-safe + * lock. + */ + spin_lock(&filp->f_lock); write_lock_irq(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { if (fa->fa_file == filp) { @@ -555,14 +561,12 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap result = 1; } out: - /* Fix up FASYNC bit while still holding fasync_lock */ - spin_lock(&filp->f_lock); if (on) filp->f_flags |= FASYNC; else filp->f_flags &= ~FASYNC; - spin_unlock(&filp->f_lock); write_unlock_irq(&fasync_lock); + spin_unlock(&filp->f_lock); return result; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 7428c6d35e65..2f13c1d77812 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -848,7 +848,7 @@ struct file { #define f_dentry f_path.dentry #define f_vfsmnt f_path.mnt const struct file_operations *f_op; - spinlock_t f_lock; /* f_ep_links, f_flags */ + spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; -- cgit v1.3-8-gc7d7 From ce3b0f8d5c2203301fc87f3aaaed73e5819e2a48 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 29 Mar 2009 19:08:22 -0400 Subject: New helper - current_umask() current->fs->umask is what most of fs_struct users are doing. Put that into a helper function. Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 2 +- fs/btrfs/acl.c | 2 +- fs/btrfs/ioctl.c | 2 +- fs/cifs/dir.c | 4 ++-- fs/cifs/inode.c | 4 ++-- fs/ext2/acl.c | 2 +- fs/ext3/acl.c | 2 +- fs/ext4/acl.c | 2 +- fs/fat/inode.c | 2 +- fs/fs_struct.c | 6 ++++++ fs/generic_acl.c | 2 +- fs/gfs2/acl.c | 2 +- fs/hfsplus/options.c | 2 +- fs/hpfs/super.c | 2 +- fs/jffs2/acl.c | 2 +- fs/jfs/acl.c | 2 +- fs/namei.c | 6 +++--- fs/nfs/nfs3proc.c | 6 +++--- fs/nfs/nfs4proc.c | 2 +- fs/ocfs2/acl.c | 2 +- fs/omfs/inode.c | 2 +- fs/reiserfs/xattr_acl.c | 2 +- fs/xfs/linux-2.6/xfs_iops.c | 4 ++-- include/linux/fs.h | 2 ++ ipc/mqueue.c | 2 +- net/unix/af_unix.c | 2 +- 26 files changed, 39 insertions(+), 31 deletions(-) (limited to 'include/linux/fs.h') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 64f068540d0d..706eb5c7e2ee 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -635,7 +635,7 @@ long spufs_create(struct nameidata *nd, unsigned int flags, mode_t mode, if (dentry->d_inode) goto out_dput; - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (flags & SPU_CREATE_GANG) ret = spufs_create_gang(nd->path.dentry->d_inode, diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 1d53b62dbba5..7fdd184a528d 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -256,7 +256,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (IS_POSIXACL(dir) && acl) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bca729fc80c8..7594bec1be10 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -267,7 +267,7 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name, goto out_dput; if (!IS_POSIXACL(parent->dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(parent->mnt); if (error) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 2f35cccfcd8d..54dce78fbb73 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -254,7 +254,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, return -ENOMEM; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (oplockEnabled) oplock = REQ_OPLOCK; @@ -479,7 +479,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, rc = -ENOMEM; else if (pTcon->unix_ext) { struct cifs_unix_set_info_args args = { - .mode = mode & ~current->fs->umask, + .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a8797cc60805..f121a80fdd6f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1125,7 +1125,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) goto mkdir_out; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, pTcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, NULL /* netfid */, pInfo, &oplock, full_path, cifs_sb->local_nls, @@ -1204,7 +1204,7 @@ mkdir_get_info: if ((direntry->d_inode) && (direntry->d_inode->i_nlink < 2)) direntry->d_inode->i_nlink = 2; - mode &= ~current->fs->umask; + mode &= ~current_umask(); /* must turn on setgid bit if parent dir has it */ if (inode->i_mode & S_ISGID) mode |= S_ISGID; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index ae8c4f850b27..d46e38cb85c5 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -318,7 +318,7 @@ ext2_init_acl(struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index b60bb241880c..d81ef2fdb08e 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -323,7 +323,7 @@ ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 694ed6fadcc8..647e0d65a284 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -323,7 +323,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if (test_opt(inode->i_sb, POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0004fe6e00..ab657db4c94e 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -930,7 +930,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); - opts->fs_fmask = opts->fs_dmask = current->fs->umask; + opts->fs_fmask = current_umask(); opts->allow_utime = -1; opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 41cff72b377b..6ac219338670 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -138,6 +138,12 @@ int unshare_fs_struct(void) } EXPORT_SYMBOL_GPL(unshare_fs_struct); +int current_umask(void) +{ + return current->fs->umask; +} +EXPORT_SYMBOL(current_umask); + /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 995d63b2e747..e0b53aa7bbec 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -134,7 +134,7 @@ generic_acl_init(struct inode *inode, struct inode *dir, mode_t mode = inode->i_mode; int error; - inode->i_mode = mode & ~current->fs->umask; + inode->i_mode = mode & ~current_umask(); if (!S_ISLNK(inode->i_mode)) acl = ops->getacl(dir, ACL_TYPE_DEFAULT); if (acl) { diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 43764f4fa763..fa881bdc3d85 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -215,7 +215,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (error) return error; if (!acl) { - mode &= ~current->fs->umask; + mode &= ~current_umask(); if (mode != ip->i_inode.i_mode) error = munge_mode(ip, mode); return error; diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index bab7f8d1bdfa..3fcbb0e1f6fc 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -48,7 +48,7 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts) opts->creator = HFSPLUS_DEF_CR_TYPE; opts->type = HFSPLUS_DEF_CR_TYPE; - opts->umask = current->fs->umask; + opts->umask = current_umask(); opts->uid = current_uid(); opts->gid = current_gid(); opts->part = -1; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 0d049b8919c4..c696d01bc8f7 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -477,7 +477,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) uid = current_uid(); gid = current_gid(); - umask = current->fs->umask; + umask = current_umask(); lowercase = 0; conv = CONV_BINARY; eas = 2; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index d98713777a1b..77ccf8cb0823 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -336,7 +336,7 @@ int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) return PTR_ERR(acl); if (!acl) { - *i_mode &= ~current->fs->umask; + *i_mode &= ~current_umask(); } else { if (S_ISDIR(*i_mode)) jffs2_iset_acl(inode, &f->i_acl_default, acl); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index a166c1669e82..06ca1b8d2054 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -182,7 +182,7 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) cleanup: posix_acl_release(acl); } else - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | inode->i_mode; diff --git a/fs/namei.c b/fs/namei.c index 4c65a6460138..964c0249444b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1578,7 +1578,7 @@ static int __open_namei_create(struct nameidata *nd, struct path *path, struct dentry *dir = nd->path.dentry; if (!IS_POSIXACL(dir->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = security_path_mknod(&nd->path, path->dentry, mode, 0); if (error) goto out_unlock; @@ -1989,7 +1989,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, int, mode, goto out_unlock; } if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = may_mknod(mode); if (error) goto out_dput; @@ -2067,7 +2067,7 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, int, mode) goto out_unlock; if (!IS_POSIXACL(nd.path.dentry->d_inode)) - mode &= ~current->fs->umask; + mode &= ~current_umask(); error = mnt_want_write(nd.path.mnt); if (error) goto out_dput; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index c55be7a7679e..e47d4400fb87 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -328,7 +328,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, data->arg.create.verifier[1] = current->pid; } - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); for (;;) { status = nfs3_do_create(dir, dentry, data); @@ -528,7 +528,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) dprintk("NFS call mkdir %s\n", dentry->d_name.name); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) @@ -639,7 +639,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dprintk("NFS call mknod %s %u:%u\n", dentry->d_name.name, MAJOR(rdev), MINOR(rdev)); - sattr->ia_mode &= ~current->fs->umask; + sattr->ia_mode &= ~current_umask(); data = nfs3_alloc_createdata(); if (data == NULL) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 8dde84b988d9..bbee587dd597 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1509,7 +1509,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) attr.ia_mode = nd->intent.open.create_mode; attr.ia_valid = ATTR_MODE; if (!IS_POSIXACL(dir)) - attr.ia_mode &= ~current->fs->umask; + attr.ia_mode &= ~current_umask(); } else { attr.ia_valid = 0; BUG_ON(nd->intent.open.flags & O_CREAT); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 12dfb44c22e5..fbeaec762103 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -296,7 +296,7 @@ int ocfs2_init_acl(handle_t *handle, return PTR_ERR(acl); } if (!acl) - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { struct posix_acl *clone; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 633e9dc972bb..aa6fc30772af 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -421,7 +421,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_uid = current_uid(); sbi->s_gid = current_gid(); - sbi->s_dmask = sbi->s_fmask = current->fs->umask; + sbi->s_dmask = sbi->s_fmask = current_umask(); if (!parse_options((char *) data, sbi)) goto end; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d423416d93d1..c303c426fe2b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -428,7 +428,7 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, } else { apply_umask: /* no ACL, apply umask */ - inode->i_mode &= ~current->fs->umask; + inode->i_mode &= ~current_umask(); } return err; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 7aa53fefc67f..2940612e3aeb 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -227,7 +227,7 @@ xfs_vn_mknod( xfs_dentry_to_name(&name, dentry); if (IS_POSIXACL(dir) && !default_acl) - mode &= ~current->fs->umask; + mode &= ~current_umask(); switch (mode & S_IFMT) { case S_IFCHR: @@ -416,7 +416,7 @@ xfs_vn_symlink( mode_t mode; mode = S_IFLNK | - (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); + (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); xfs_dentry_to_name(&name, dentry); error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..3d7bd5447ca3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1741,6 +1741,8 @@ extern void drop_collected_mounts(struct vfsmount *); extern int vfs_statfs(struct dentry *, struct kstatfs *); +extern int current_umask(void); + /* /sys/fs */ extern struct kobject *fs_kobj; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index a8ddadbc7459..916785363f0f 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -602,7 +602,7 @@ static struct file *do_create(struct dentry *dir, struct dentry *dentry, dentry->d_fsdata = attr; } - mode &= ~current->fs->umask; + mode &= ~current_umask(); ret = mnt_want_write(mqueue_mnt); if (ret) goto out; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index baac91049b0e..9dcc6e7f96ec 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -832,7 +832,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * All right, let's create it. */ mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current->fs->umask); + (SOCK_INODE(sock)->i_mode & ~current_umask()); err = mnt_want_write(nd.path.mnt); if (err) goto out_mknod_dput; -- cgit v1.3-8-gc7d7 From 47e4491b40df73c3b117e3d80b31b5b512a4b19f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 1 Apr 2009 07:07:16 -0400 Subject: Cleanup after commit 585d3bc06f4ca57f975a5a1f698f65a45ea66225 fsync_bdev() export and a bunch of stubs for !CONFIG_BLOCK case had been left behind Signed-off-by: Al Viro --- fs/block_dev.c | 1 + fs/buffer.c | 1 - include/linux/buffer_head.h | 12 ------------ include/linux/fs.h | 12 ++++++++++++ 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/block_dev.c b/fs/block_dev.c index 8c3c6899ccf3..f45dbc18dd17 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -204,6 +204,7 @@ int fsync_bdev(struct block_device *bdev) } return sync_blockdev(bdev); } +EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev -- lock a filesystem and force it into a consistent state diff --git a/fs/buffer.c b/fs/buffer.c index a2fd743d97cb..b71e52925c83 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3281,7 +3281,6 @@ EXPORT_SYMBOL(cont_write_begin); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); -EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f19fd9045ea0..fc91665d39d0 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -332,22 +332,10 @@ extern int __set_page_dirty_buffers(struct page *page); static inline void buffer_init(void) {} static inline int try_to_free_buffers(struct page *page) { return 1; } -static inline int sync_blockdev(struct block_device *bdev) { return 0; } static inline int inode_has_buffers(struct inode *inode) { return 0; } static inline void invalidate_inode_buffers(struct inode *inode) {} static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } -static inline void invalidate_bdev(struct block_device *bdev) {} - -static inline struct super_block *freeze_bdev(struct block_device *sb) -{ - return NULL; -} - -static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) -{ - return 0; -} #endif /* CONFIG_BLOCK */ #endif /* _LINUX_BUFFER_HEAD_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3d7bd5447ca3..674134725597 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1886,6 +1886,18 @@ extern int fsync_super(struct super_block *); extern int fsync_no_super(struct block_device *); #else static inline void bd_forget(struct inode *inode) {} +static inline int sync_blockdev(struct block_device *bdev) { return 0; } +static inline void invalidate_bdev(struct block_device *bdev) {} + +static inline struct super_block *freeze_bdev(struct block_device *sb) +{ + return NULL; +} + +static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb) +{ + return 0; +} #endif extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; -- cgit v1.3-8-gc7d7 From c2d7543851849a6923680cdd7e1047ed1a84a1c5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 31 Mar 2009 15:23:46 -0700 Subject: filesystem freeze: allow SysRq emergency thaw to thaw frozen filesystems Now that the filesystem freeze operation has been elevated to the VFS, and is just an ioctl away, some sort of safety net for unintentionally frozen root filesystems may be in order. The timeout thaw originally proposed did not get merged, but perhaps something like this would be useful in emergencies. For example, freeze /path/to/mountpoint may freeze your root filesystem if you forgot that you had that unmounted. I chose 'j' as the last remaining character other than 'h' which is sort of reserved for help (because help is generated on any unknown character). I've tested this on a non-root fs with multiple (nested) freezers, as well as on a system rendered unresponsive due to a frozen root fs. [randy.dunlap@oracle.com: emergency thaw only if CONFIG_BLOCK enabled] Signed-off-by: Eric Sandeen Cc: Takashi Sato Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysrq.txt | 5 +++++ drivers/char/sysrq.c | 19 ++++++++++++++++++- fs/buffer.c | 33 +++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 4 files changed, 57 insertions(+), 1 deletion(-) (limited to 'include/linux/fs.h') diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt index 9e592c718afb..afa2946892da 100644 --- a/Documentation/sysrq.txt +++ b/Documentation/sysrq.txt @@ -81,6 +81,8 @@ On all - write a character to /proc/sysrq-trigger. e.g.: 'i' - Send a SIGKILL to all processes, except for init. +'j' - Forcibly "Just thaw it" - filesystems frozen by the FIFREEZE ioctl. + 'k' - Secure Access Key (SAK) Kills all programs on the current virtual console. NOTE: See important comments below in SAK section. @@ -160,6 +162,9 @@ t'E'rm and k'I'll are useful if you have some sort of runaway process you are unable to kill any other way, especially if it's spawning other processes. +"'J'ust thaw it" is useful if your system becomes unresponsive due to a frozen +(probably root) filesystem via the FIFREEZE ioctl. + * Sometimes SysRq seems to get 'stuck' after using it, what can I do? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ That happens to me, also. I've found that tapping shift, alt, and control diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 33a9351c896d..5afe7316c72e 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -346,6 +346,19 @@ static struct sysrq_key_op sysrq_moom_op = { .enable_mask = SYSRQ_ENABLE_SIGNAL, }; +#ifdef CONFIG_BLOCK +static void sysrq_handle_thaw(int key, struct tty_struct *tty) +{ + emergency_thaw_all(); +} +static struct sysrq_key_op sysrq_thaw_op = { + .handler = sysrq_handle_thaw, + .help_msg = "thaw-filesystems(J)", + .action_msg = "Emergency Thaw of all frozen filesystems", + .enable_mask = SYSRQ_ENABLE_SIGNAL, +}; +#endif + static void sysrq_handle_kill(int key, struct tty_struct *tty) { send_sig_all(SIGKILL); @@ -396,9 +409,13 @@ static struct sysrq_key_op *sysrq_key_table[36] = { &sysrq_moom_op, /* f */ /* g: May be registered by ppc for kgdb */ NULL, /* g */ - NULL, /* h */ + NULL, /* h - reserved for help */ &sysrq_kill_op, /* i */ +#ifdef CONFIG_BLOCK + &sysrq_thaw_op, /* j */ +#else NULL, /* j */ +#endif &sysrq_SAK_op, /* k */ #ifdef CONFIG_SMP &sysrq_showallcpus_op, /* l */ diff --git a/fs/buffer.c b/fs/buffer.c index c77b848c3d43..f5f8b15a6e40 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -547,6 +547,39 @@ repeat: return err; } +void do_thaw_all(unsigned long unused) +{ + struct super_block *sb; + char b[BDEVNAME_SIZE]; + + spin_lock(&sb_lock); +restart: + list_for_each_entry(sb, &super_blocks, s_list) { + sb->s_count++; + spin_unlock(&sb_lock); + down_read(&sb->s_umount); + while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) + printk(KERN_WARNING "Emergency Thaw on %s\n", + bdevname(sb->s_bdev, b)); + up_read(&sb->s_umount); + spin_lock(&sb_lock); + if (__put_super_and_need_restart(sb)) + goto restart; + } + spin_unlock(&sb_lock); + printk(KERN_WARNING "Emergency Thaw complete\n"); +} + +/** + * emergency_thaw_all -- forcibly thaw every frozen filesystem + * + * Used for emergency unfreeze of all filesystems via SysRq + */ +void emergency_thaw_all(void) +{ + pdflush_operation(do_thaw_all, 0); +} + /** * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * @mapping: the mapping which wants those buffers written diff --git a/include/linux/fs.h b/include/linux/fs.h index 87e7bfc5ebd7..61211ad823fe 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1878,6 +1878,7 @@ extern struct block_device *open_by_devnum(dev_t, fmode_t); extern void invalidate_bdev(struct block_device *); extern int sync_blockdev(struct block_device *bdev); extern struct super_block *freeze_bdev(struct block_device *); +extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); extern int fsync_super(struct super_block *); -- cgit v1.3-8-gc7d7 From 3d544f411f2971eb82f5c52322251eb04494542a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 24 Mar 2009 11:59:23 +0200 Subject: kmemtrace, fs, security: move alloc_secdata() and free_secdata() to linux/security.h Impact: cleanup We want to remove percpu.h from rcupdate.h (for upcoming kmemtrace changes), but this is not possible currently without breaking the build because fs.h has implicit include file depedencies: it uses GFP_* types in inlines but does not include gfp.h. In practice most fs.h using .c files get gfp.h included implicitly, via an indirect route: via rcupdate.h inclusion - so this underlying problem gets masked in practice. So we want to solve fs.h's dependency on gfp.h. gfp.h can not be included here directly because it is not exported and it would break the build the following way: /home/mingo/tip/usr/include/linux/bsg.h:11: found __[us]{8,16,32,64} type without #include /home/mingo/tip/usr/include/linux/fs.h:11: included file 'linux/gfp.h' is not exported make[3]: *** [/home/mingo/tip/usr/include/linux/.check] Error 1 make[2]: *** [linux] Error 2 As suggested by Alexey Dobriyan, move alloc_secdata() and free_secdata() to linux/security.h - they belong there. This also cleans fs.h of GFP_* usage. Suggested-by: Alexey Dobriyan Signed-off-by: Pekka Enberg Cc: Eduard - Gabriel Munteanu LKML-Reference: <1237906803.25315.96.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- include/linux/fs.h | 21 --------------------- include/linux/security.h | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 61211ad823fe..fc4dc28c5735 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2382,27 +2382,6 @@ ssize_t simple_attr_read(struct file *file, char __user *buf, ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); - -#ifdef CONFIG_SECURITY -static inline char *alloc_secdata(void) -{ - return (char *)get_zeroed_page(GFP_KERNEL); -} - -static inline void free_secdata(void *secdata) -{ - free_page((unsigned long)secdata); -} -#else -static inline char *alloc_secdata(void) -{ - return (char *)1; -} - -static inline void free_secdata(void *secdata) -{ } -#endif /* CONFIG_SECURITY */ - struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/include/linux/security.h b/include/linux/security.h index 54ed15799a83..d5fd6163606f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -32,6 +32,7 @@ #include #include #include +#include #include /* Maximum number of letters for an LSM name string */ @@ -2953,5 +2954,28 @@ static inline void securityfs_remove(struct dentry *dentry) #endif +#ifdef CONFIG_SECURITY + +static inline char *alloc_secdata(void) +{ + return (char *)get_zeroed_page(GFP_KERNEL); +} + +static inline void free_secdata(void *secdata) +{ + free_page((unsigned long)secdata); +} + +#else + +static inline char *alloc_secdata(void) +{ + return (char *)1; +} + +static inline void free_secdata(void *secdata) +{ } +#endif /* CONFIG_SECURITY */ + #endif /* ! __LINUX_SECURITY_H */ -- cgit v1.3-8-gc7d7 From 76791ab2d5e00c1eef728a8df4347ba133760fb8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 25 Mar 2009 16:48:35 +0100 Subject: kmemtrace, fs: uninline simple_transaction_set() Impact: cleanup We want to remove percpu.h from rcupdate.h (for upcoming kmemtrace changes), but this is not possible currently without breaking the build because fs.h has an implicit include file depedency: it uses PAGE_SIZE but does not include asm/page.h which defines it. This problem gets masked in practice because most fs.h using sites use rcupreempt.h (and other headers) which includes percpu.h which brings in asm/page.h indirectly. We cannot add asm/page.h to asm/fs.h because page.h is not an exported header. Move simple_transaction_set() to the other simple-transaction file helpers in fs/libfs.c. This removes the include file hell and also reduces kernel size a bit. Acked-by: Al Viro Cc: Alexey Dobriyan Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1237898630.25315.83.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- fs/libfs.c | 16 ++++++++++++++++ include/linux/fs.h | 14 +------------- 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/libfs.c b/fs/libfs.c index 4910a36f516e..cd223190c4e9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, * possibly a read which collects the result - which is stored in a * file-local buffer. */ + +void simple_transaction_set(struct file *file, size_t n) +{ + struct simple_transaction_argresp *ar = file->private_data; + + BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); + + /* + * The barrier ensures that ar->size will really remain zero until + * ar->data is ready for reading. + */ + smp_mb(); + ar->size = n; +} + char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; @@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file); EXPORT_SYMBOL(simple_unlink); EXPORT_SYMBOL(simple_read_from_buffer); EXPORT_SYMBOL(memory_read_from_buffer); +EXPORT_SYMBOL(simple_transaction_set); EXPORT_SYMBOL(simple_transaction_get); EXPORT_SYMBOL(simple_transaction_read); EXPORT_SYMBOL(simple_transaction_release); diff --git a/include/linux/fs.h b/include/linux/fs.h index fc4dc28c5735..e4de2b543a73 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2323,19 +2323,7 @@ ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); -static inline void simple_transaction_set(struct file *file, size_t n) -{ - struct simple_transaction_argresp *ar = file->private_data; - - BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); - - /* - * The barrier ensures that ar->size will really remain zero until - * ar->data is ready for reading. - */ - smp_mb(); - ar->size = n; -} +void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files -- cgit v1.3-8-gc7d7 From a1f242524c3c1f5d40f1c9c343427e34d1aadd6e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 6 Apr 2009 14:48:02 +0200 Subject: Add WRITE_SYNC_PLUG and SWRITE_SYNC_PLUG (S)WRITE_SYNC always unplugs the device right after IO submission. Sometimes we want to build up a queue before doing so, so add variants that explicitly DON'T unplug the queue. The caller must then do that after submitting all the IO. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index a09e17c8f5fd..ea0510978f76 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -96,7 +96,10 @@ struct inodes_stat_t { #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) #define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define SWRITE_SYNC_PLUG \ + (SWRITE | (1 << BIO_RW_SYNCIO)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) -- cgit v1.3-8-gc7d7 From aeb6fafb8fa53266d70ca7474fcda2bdaf96524a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 6 Apr 2009 14:48:07 +0200 Subject: block: Add flag for telling the IO schedulers NOT to anticipate more IO By default, CFQ will anticipate more IO from a given io context if the previously completed IO was sync. This used to be fine, since the only sync IO was reads and O_DIRECT writes. But with more "normal" sync writes being used now, we don't want to anticipate for those. Add a bio/request flag that informs the IO scheduler that this is a sync request that we should not idle for. Introduce WRITE_ODIRECT specifically for O_DIRECT writes, and make sure that the other sync writes set this flag. Signed-off-by: Jens Axboe Signed-off-by: Linus Torvalds --- block/blk-core.c | 2 ++ block/cfq-iosched.c | 4 +++- fs/direct-io.c | 2 +- include/linux/bio.h | 19 +++++++++++-------- include/linux/blkdev.h | 3 +++ include/linux/fs.h | 9 +++++---- 6 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include/linux/fs.h') diff --git a/block/blk-core.c b/block/blk-core.c index c4198f083e5b..25572802dac2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1128,6 +1128,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_UNPLUG; if (bio_rw_meta(bio)) req->cmd_flags |= REQ_RW_META; + if (bio_noidle(bio)) + req->cmd_flags |= REQ_NOIDLE; req->errors = 0; req->hard_sector = req->sector = bio->bi_sector; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 664ebfd092ec..9e809345f71a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1992,8 +1992,10 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) } if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (sync && RB_EMPTY_ROOT(&cfqq->sort_list)) + else if (sync && !rq_noidle(rq) && + RB_EMPTY_ROOT(&cfqq->sort_list)) { cfq_arm_slice_timer(cfqd); + } } if (!cfqd->rq_in_driver) diff --git a/fs/direct-io.c b/fs/direct-io.c index b6d43908ff7a..da258e7249cc 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC; + rw = WRITE_ODIRECT; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); diff --git a/include/linux/bio.h b/include/linux/bio.h index b05b1d4d17d2..b900d2c67d29 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -145,20 +145,21 @@ struct bio { * bit 2 -- barrier * Insert a serialization point in the IO queue, forcing previously * submitted IO to be completed before this one is issued. - * bit 3 -- synchronous I/O hint: the block layer will unplug immediately - * Note that this does NOT indicate that the IO itself is sync, just - * that the block layer will not postpone issue of this IO by plugging. - * bit 4 -- metadata request + * bit 3 -- synchronous I/O hint. + * bit 4 -- Unplug the device immediately after submitting this bio. + * bit 5 -- metadata request * Used for tracing to differentiate metadata and data IO. May also * get some preferential treatment in the IO scheduler - * bit 5 -- discard sectors + * bit 6 -- discard sectors * Informs the lower level device that this range of sectors is no longer * used by the file system and may thus be freed by the device. Used * for flash based storage. - * bit 6 -- fail fast device errors - * bit 7 -- fail fast transport errors - * bit 8 -- fail fast driver errors + * bit 7 -- fail fast device errors + * bit 8 -- fail fast transport errors + * bit 9 -- fail fast driver errors * Don't want driver retries for any fast fail whatever the reason. + * bit 10 -- Tell the IO scheduler not to wait for more requests after this + one has been submitted, even if it is a SYNC request. */ #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ @@ -170,6 +171,7 @@ struct bio { #define BIO_RW_FAILFAST_DEV 7 #define BIO_RW_FAILFAST_TRANSPORT 8 #define BIO_RW_FAILFAST_DRIVER 9 +#define BIO_RW_NOIDLE 10 #define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) @@ -188,6 +190,7 @@ struct bio { #define bio_rw_ahead(bio) bio_rw_flagged(bio, BIO_RW_AHEAD) #define bio_rw_meta(bio) bio_rw_flagged(bio, BIO_RW_META) #define bio_discard(bio) bio_rw_flagged(bio, BIO_RW_DISCARD) +#define bio_noidle(bio) bio_rw_flagged(bio, BIO_RW_NOIDLE) /* * upper 16 bits of bi_rw define the io priority of this bio diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 67dae3bd881c..e03660964e02 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -118,6 +118,7 @@ enum rq_flag_bits { __REQ_COPY_USER, /* contains copies of user pages */ __REQ_INTEGRITY, /* integrity metadata has been remapped */ __REQ_UNPLUG, /* unplug queue on submission */ + __REQ_NOIDLE, /* Don't anticipate more IO after this one */ __REQ_NR_BITS, /* stops here */ }; @@ -145,6 +146,7 @@ enum rq_flag_bits { #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_INTEGRITY (1 << __REQ_INTEGRITY) #define REQ_UNPLUG (1 << __REQ_UNPLUG) +#define REQ_NOIDLE (1 << __REQ_NOIDLE) #define BLK_MAX_CDB 16 @@ -633,6 +635,7 @@ static inline bool rq_is_sync(struct request *rq) } #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META) +#define rq_noidle(rq) ((rq)->cmd_flags & REQ_NOIDLE) static inline int blk_queue_full(struct request_queue *q, int sync) { diff --git a/include/linux/fs.h b/include/linux/fs.h index ea0510978f76..cae5720f431c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -95,11 +95,12 @@ struct inodes_stat_t { #define SWRITE 3 /* for ll_rw_block() - wait for buffer lock */ #define READ_SYNC (READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define READ_META (READ | (1 << BIO_RW_META)) -#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) -#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) -#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) +#define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) +#define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) +#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define SWRITE_SYNC_PLUG \ - (SWRITE | (1 << BIO_RW_SYNCIO)) + (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) +#define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) -- cgit v1.3-8-gc7d7 From 909e6d94795654040ed416ac69858d5d2ce66dd3 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Mon, 6 Apr 2009 19:01:07 -0700 Subject: namespaces: move proc_net_get_sb to a generic fs/super.c helper The mqueuefs filesystem will use this helper as well. Proc's main get_sb could also be made to use it, but that will require a bit more rework. Signed-off-by: Serge E. Hallyn Cc: Cedric Le Goater Cc: Alexey Dobriyan Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 40 ++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 3 +++ 2 files changed, 43 insertions(+) (limited to 'include/linux/fs.h') diff --git a/fs/super.c b/fs/super.c index 77cb4ec919b9..786fe7d72790 100644 --- a/fs/super.c +++ b/fs/super.c @@ -771,6 +771,46 @@ void kill_litter_super(struct super_block *sb) EXPORT_SYMBOL(kill_litter_super); +static int ns_test_super(struct super_block *sb, void *data) +{ + return sb->s_fs_info == data; +} + +static int ns_set_super(struct super_block *sb, void *data) +{ + sb->s_fs_info = data; + return set_anon_super(sb, NULL); +} + +int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) +{ + struct super_block *sb; + + sb = sget(fs_type, ns_test_super, ns_set_super, data); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + int err; + sb->s_flags = flags; + err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) { + up_write(&sb->s_umount); + deactivate_super(sb); + return err; + } + + sb->s_flags |= MS_ACTIVE; + } + + simple_set_mnt(mnt, sb); + return 0; +} + +EXPORT_SYMBOL(get_sb_ns); + #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { diff --git a/include/linux/fs.h b/include/linux/fs.h index bce40a2207ee..562d2855cf30 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1699,6 +1699,9 @@ struct file_system_type { struct lock_class_key i_alloc_sem_key; }; +extern int get_sb_ns(struct file_system_type *fs_type, int flags, void *data, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); extern int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int), -- cgit v1.3-8-gc7d7 From 48e70bc18ac81881dedd3aa327c55b924fc41ecf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 14 Apr 2009 08:19:27 +0200 Subject: Document and move the various READ/WRITE types It's a somewhat twisty maze of hints and behavioural modifiers, try and clear it up a bit with some documentation. Signed-off-by: Jens Axboe --- include/linux/fs.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index 562d2855cf30..b535aec4406b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -87,6 +87,60 @@ struct inodes_stat_t { */ #define FMODE_NOCMTIME ((__force fmode_t)2048) +/* + * The below are the various read and write types that we support. Some of + * them include behavioral modifiers that send information down to the + * block layer and IO scheduler. Terminology: + * + * The block layer uses device plugging to defer IO a little bit, in + * the hope that we will see more IO very shortly. This increases + * coalescing of adjacent IO and thus reduces the number of IOs we + * have to send to the device. It also allows for better queuing, + * if the IO isn't mergeable. If the caller is going to be waiting + * for the IO, then he must ensure that the device is unplugged so + * that the IO is dispatched to the driver. + * + * All IO is handled async in Linux. This is fine for background + * writes, but for reads or writes that someone waits for completion + * on, we want to notify the block layer and IO scheduler so that they + * know about it. That allows them to make better scheduling + * decisions. So when the below references 'sync' and 'async', it + * is referencing this priority hint. + * + * With that in mind, the available types are: + * + * READ A normal read operation. Device will be plugged. + * READ_SYNC A synchronous read. Device is not plugged, caller can + * immediately wait on this read without caring about + * unplugging. + * READA Used for read-ahead operations. Lower priority, and the + * block layer could (in theory) choose to ignore this + * request if it runs into resource problems. + * WRITE A normal async write. Device will be plugged. + * SWRITE Like WRITE, but a special case for ll_rw_block() that + * tells it to lock the buffer first. Normally a buffer + * must be locked before doing IO. + * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down + * the hint that someone will be waiting on this IO + * shortly. The device must still be unplugged explicitly, + * WRITE_SYNC_PLUG does not do this as we could be + * submitting more writes before we actually wait on any + * of them. + * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device + * immediately after submission. The write equivalent + * of READ_SYNC. + * WRITE_ODIRECT Special case write for O_DIRECT only. + * SWRITE_SYNC + * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. + * See SWRITE. + * WRITE_BARRIER Like WRITE, but tells the block layer that all + * previously submitted writes must be safely on storage + * before this one is started. Also guarantees that when + * this write is complete, it itself is also safely on + * storage. Prevents reordering of writes on both sides + * of this IO. + * + */ #define RW_MASK 1 #define RWA_MASK 2 #define READ 0 @@ -102,6 +156,11 @@ struct inodes_stat_t { (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) #define WRITE_BARRIER (WRITE | (1 << BIO_RW_BARRIER)) + +/* + * These aren't really reads or writes, they pass down information about + * parts of device that are now unused by the file system. + */ #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD) #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER)) -- cgit v1.3-8-gc7d7 From f8cc774ce4844811a55e2352f1443055e3994e28 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:40 +0200 Subject: splice: remove generic_file_splice_write_nolock() Remove the now unused generic_file_splice_write_nolock() function. It's conceptually broken anyway, because splice may need to wait for pipe events so holding locks across the whole operation is wrong. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 59 ------------------------------------------------------ include/linux/fs.h | 2 -- 2 files changed, 61 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/splice.c b/fs/splice.c index 584b2b7a1dbe..128ee36a719b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -810,65 +810,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, return ret; } -/** - * generic_file_splice_write_nolock - generic_file_splice_write without mutexes - * @pipe: pipe info - * @out: file to write to - * @ppos: position in @out - * @len: number of bytes to splice - * @flags: splice modifier flags - * - * Description: - * Will either move or copy pages (determined by @flags options) from - * the given pipe inode to the given file. The caller is responsible - * for acquiring i_mutex on both inodes. - * - */ -ssize_t -generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - struct address_space *mapping = out->f_mapping; - struct inode *inode = mapping->host; - struct splice_desc sd = { - .total_len = len, - .flags = flags, - .pos = *ppos, - .u.file = out, - }; - ssize_t ret; - int err; - - err = file_remove_suid(out); - if (unlikely(err)) - return err; - - ret = __splice_from_pipe(pipe, &sd, pipe_to_file); - if (ret > 0) { - unsigned long nr_pages; - - *ppos += ret; - nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - - if (err) - ret = err; - } - balance_dirty_pages_ratelimited_nr(mapping, nr_pages); - } - - return ret; -} - -EXPORT_SYMBOL(generic_file_splice_write_nolock); - /** * generic_file_splice_write - splice data from a pipe to a file * @pipe: pipe info diff --git a/include/linux/fs.h b/include/linux/fs.h index b535aec4406b..907d8f56c6fa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2209,8 +2209,6 @@ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); -extern ssize_t generic_file_splice_write_nolock(struct pipe_inode_info *, - struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, -- cgit v1.3-8-gc7d7 From 61e0d47c33cc371f725bcda4a47ae0efe652dba8 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 14 Apr 2009 19:48:41 +0200 Subject: splice: add helpers for locking pipe inode There are lots of sequences like this, especially in splice code: if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); /* do something */ if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); so introduce helpers which do the conditional locking and unlocking. Also replace the inode_double_lock() call with a pipe_double_lock() helper to avoid spreading the use of this functionality beyond the pipe code. This patch is just a cleanup, and should cause no behavioral changes. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/inode.c | 36 ---------------------------------- fs/pipe.c | 42 +++++++++++++++++++++++++++++++++++---- fs/splice.c | 50 ++++++++++++++++++++--------------------------- include/linux/fs.h | 3 --- include/linux/pipe_fs_i.h | 5 +++++ 5 files changed, 64 insertions(+), 72 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/inode.c b/fs/inode.c index d06d6d268de9..6ad14a1cd8c9 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1470,42 +1470,6 @@ static void __wait_on_freeing_inode(struct inode *inode) spin_lock(&inode_lock); } -/* - * We rarely want to lock two inodes that do not have a parent/child - * relationship (such as directory, child inode) simultaneously. The - * vast majority of file systems should be able to get along fine - * without this. Do not use these functions except as a last resort. - */ -void inode_double_lock(struct inode *inode1, struct inode *inode2) -{ - if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { - if (inode1) - mutex_lock(&inode1->i_mutex); - else if (inode2) - mutex_lock(&inode2->i_mutex); - return; - } - - if (inode1 < inode2) { - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); - } else { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); - } -} -EXPORT_SYMBOL(inode_double_lock); - -void inode_double_unlock(struct inode *inode1, struct inode *inode2) -{ - if (inode1) - mutex_unlock(&inode1->i_mutex); - - if (inode2 && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); -} -EXPORT_SYMBOL(inode_double_unlock); - static __initdata unsigned long ihash_entries; static int __init set_ihash_entries(char *str) { diff --git a/fs/pipe.c b/fs/pipe.c index 4af7aa521813..13414ec45b8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -37,6 +37,42 @@ * -- Manfred Spraul 2002-05-09 */ +static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +{ + if (pipe->inode) + mutex_lock_nested(&pipe->inode->i_mutex, subclass); +} + +void pipe_lock(struct pipe_inode_info *pipe) +{ + /* + * pipe_lock() nests non-pipe inode locks (for writing to a file) + */ + pipe_lock_nested(pipe, I_MUTEX_PARENT); +} +EXPORT_SYMBOL(pipe_lock); + +void pipe_unlock(struct pipe_inode_info *pipe) +{ + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); +} +EXPORT_SYMBOL(pipe_unlock); + +void pipe_double_lock(struct pipe_inode_info *pipe1, + struct pipe_inode_info *pipe2) +{ + BUG_ON(pipe1 == pipe2); + + if (pipe1 < pipe2) { + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + } else { + pipe_lock_nested(pipe2, I_MUTEX_CHILD); + pipe_lock_nested(pipe1, I_MUTEX_PARENT); + } +} + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe) { @@ -47,12 +83,10 @@ void pipe_wait(struct pipe_inode_info *pipe) * is considered a noninteractive wait: */ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); schedule(); finish_wait(&pipe->wait, &wait); - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); } static int diff --git a/fs/splice.c b/fs/splice.c index 128ee36a719b..5384a90665d0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -182,8 +182,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, do_wakeup = 0; page_nr = 0; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); for (;;) { if (!pipe->readers) { @@ -245,15 +244,13 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, pipe->waiting_writers--; } - if (pipe->inode) { - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (page_nr < spd_pages) @@ -801,11 +798,9 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, .u.file = out, }; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); ret = __splice_from_pipe(pipe, &sd, actor); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -837,8 +832,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, }; ssize_t ret; - if (pipe->inode) - mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); + pipe_lock(pipe); splice_from_pipe_begin(&sd); do { @@ -854,8 +848,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, } while (ret > 0); splice_from_pipe_end(pipe, &sd); - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (sd.num_spliced) ret = sd.num_spliced; @@ -1348,8 +1341,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, if (!pipe) return -EBADF; - if (pipe->inode) - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); error = ret = 0; while (nr_segs) { @@ -1404,8 +1396,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, iov++; } - if (pipe->inode) - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); if (!ret) ret = error; @@ -1533,7 +1524,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (!pipe->nrbufs) { if (signal_pending(current)) { @@ -1551,7 +1542,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe_wait(pipe); } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1571,7 +1562,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return 0; ret = 0; - mutex_lock(&pipe->inode->i_mutex); + pipe_lock(pipe); while (pipe->nrbufs >= PIPE_BUFFERS) { if (!pipe->readers) { @@ -1592,7 +1583,7 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) pipe->waiting_writers--; } - mutex_unlock(&pipe->inode->i_mutex); + pipe_unlock(pipe); return ret; } @@ -1608,10 +1599,10 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * Potential ABBA deadlock, work around it by ordering lock - * grabbing by inode address. Otherwise two different processes + * grabbing by pipe info address. Otherwise two different processes * could deadlock (one doing tee from A -> B, the other from B -> A). */ - inode_double_lock(ipipe->inode, opipe->inode); + pipe_double_lock(ipipe, opipe); do { if (!opipe->readers) { @@ -1662,7 +1653,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) ret = -EAGAIN; - inode_double_unlock(ipipe->inode, opipe->inode); + pipe_unlock(ipipe); + pipe_unlock(opipe); /* * If we put data in the output pipe, wakeup any potential readers. diff --git a/include/linux/fs.h b/include/linux/fs.h index 907d8f56c6fa..e766be0d4329 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -797,9 +797,6 @@ enum inode_i_mutex_lock_class I_MUTEX_QUOTA }; -extern void inode_double_lock(struct inode *inode1, struct inode *inode2); -extern void inode_double_unlock(struct inode *inode1, struct inode *inode2); - /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 8e4120285f72..c8f038554e80 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -134,6 +134,11 @@ struct pipe_buf_operations { memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ #define PIPE_SIZE PAGE_SIZE +/* Pipe lock and unlock operations */ +void pipe_lock(struct pipe_inode_info *); +void pipe_unlock(struct pipe_inode_info *); +void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); + /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe); -- cgit v1.3-8-gc7d7