aboutsummaryrefslogtreecommitdiffstats
path: root/fs/namei.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-05-17 11:01:31 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2016-05-17 11:01:31 -0700
commit7f427d3a6029331304f91ef4d7cf646f054216d2 (patch)
tree61c4a7b9b0ec387da0536324cc2c07b2427b9b46 /fs/namei.c
parentMerge branch 'irq-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (diff)
parentMerge branch 'ovl-fixes' into for-linus (diff)
downloadlinux-dev-7f427d3a6029331304f91ef4d7cf646f054216d2.tar.xz
linux-dev-7f427d3a6029331304f91ef4d7cf646f054216d2.zip
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
Pull parallel filesystem directory handling update from Al Viro. This is the main parallel directory work by Al that makes the vfs layer able to do lookup and readdir in parallel within a single directory. That's a big change, since this used to be all protected by the directory inode mutex. The inode mutex is replaced by an rwsem, and serialization of lookups of a single name is done by a "in-progress" dentry marker. The series begins with xattr cleanups, and then ends with switching filesystems over to actually doing the readdir in parallel (switching to the "iterate_shared()" that only takes the read lock). A more detailed explanation of the process from Al Viro: "The xattr work starts with some acl fixes, then switches ->getxattr to passing inode and dentry separately. This is the point where the things start to get tricky - that got merged into the very beginning of the -rc3-based #work.lookups, to allow untangling the security_d_instantiate() mess. The xattr work itself proceeds to switch a lot of filesystems to generic_...xattr(); no complications there. After that initial xattr work, the series then does the following: - untangle security_d_instantiate() - convert a bunch of open-coded lookup_one_len_unlocked() to calls of that thing; one such place (in overlayfs) actually yields a trivial conflict with overlayfs fixes later in the cycle - overlayfs ended up switching to a variant of lookup_one_len_unlocked() sans the permission checks. I would've dropped that commit (it gets overridden on merge from #ovl-fixes in #for-next; proper resolution is to use the variant in mainline fs/overlayfs/super.c), but I didn't want to rebase the damn thing - it was fairly late in the cycle... - some filesystems had managed to depend on lookup/lookup exclusion for *fs-internal* data structures in a way that would break if we relaxed the VFS exclusion. Fixing hadn't been hard, fortunately. - core of that series - parallel lookup machinery, replacing ->i_mutex with rwsem, making lookup_slow() take it only shared. At that point lookups happen in parallel; lookups on the same name wait for the in-progress one to be done with that dentry. Surprisingly little code, at that - almost all of it is in fs/dcache.c, with fs/namei.c changes limited to lookup_slow() - making it use the new primitive and actually switching to locking shared. - parallel readdir stuff - first of all, we provide the exclusion on per-struct file basis, same as we do for read() vs lseek() for regular files. That takes care of most of the needed exclusion in readdir/readdir; however, these guys are trickier than lookups, so I went for switching them one-by-one. To do that, a new method '->iterate_shared()' is added and filesystems are switched to it as they are either confirmed to be OK with shared lock on directory or fixed to be OK with that. I hope to kill the original method come next cycle (almost all in-tree filesystems are switched already), but it's still not quite finished. - several filesystems get switched to parallel readdir. The interesting part here is dealing with dcache preseeding by readdir; that needs minor adjustment to be safe with directory locked only shared. Most of the filesystems doing that got switched to in those commits. Important exception: NFS. Turns out that NFS folks, with their, er, insistence on VFS getting the fuck out of the way of the Smart Filesystem Code That Knows How And What To Lock(tm) have grown the locking of their own. They had their own homegrown rwsem, with lookup/readdir/atomic_open being *writers* (sillyunlink is the reader there). Of course, with VFS getting the fuck out of the way, as requested, the actual smarts of the smart filesystem code etc. had become exposed... - do_last/lookup_open/atomic_open cleanups. As the result, open() without O_CREAT locks the directory only shared. Including the ->atomic_open() case. Backmerge from #for-linus in the middle of that - atomic_open() fix got brought in. - then comes NFS switch to saner (VFS-based ;-) locking, killing the homegrown "lookup and readdir are writers" kinda-sorta rwsem. All exclusion for sillyunlink/lookup is done by the parallel lookups mechanism. Exclusion between sillyunlink and rmdir is a real rwsem now - rmdir being the writer. Result: NFS lookups/readdirs/O_CREAT-less opens happen in parallel now. - the rest of the series consists of switching a lot of filesystems to parallel readdir; in a lot of cases ->llseek() gets simplified as well. One backmerge in there (again, #for-linus - rockridge fix)" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (74 commits) ext4: switch to ->iterate_shared() hfs: switch to ->iterate_shared() hfsplus: switch to ->iterate_shared() hostfs: switch to ->iterate_shared() hpfs: switch to ->iterate_shared() hpfs: handle allocation failures in hpfs_add_pos() gfs2: switch to ->iterate_shared() f2fs: switch to ->iterate_shared() afs: switch to ->iterate_shared() befs: switch to ->iterate_shared() befs: constify stuff a bit isofs: switch to ->iterate_shared() get_acorn_filename(): deobfuscate a bit btrfs: switch to ->iterate_shared() logfs: no need to lock directory in lseek switch ecryptfs to ->iterate_shared 9p: switch to ->iterate_shared() fat: switch to ->iterate_shared() romfs, squashfs: switch to ->iterate_shared() more trivial ->iterate_shared conversions ...
Diffstat (limited to 'fs/namei.c')
-rw-r--r--fs/namei.c387
1 files changed, 188 insertions, 199 deletions
diff --git a/fs/namei.c b/fs/namei.c
index 42f8ca038254..11f3a18d9d2d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -265,7 +265,7 @@ static int check_acl(struct inode *inode, int mask)
if (!acl)
return -EAGAIN;
/* no ->get_acl() calls in RCU mode... */
- if (acl == ACL_NOT_CACHED)
+ if (is_uncached_acl(acl))
return -ECHILD;
return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
}
@@ -1603,32 +1603,42 @@ static struct dentry *lookup_slow(const struct qstr *name,
struct dentry *dir,
unsigned int flags)
{
- struct dentry *dentry;
- inode_lock(dir->d_inode);
- dentry = d_lookup(dir, name);
- if (unlikely(dentry)) {
+ struct dentry *dentry = ERR_PTR(-ENOENT), *old;
+ struct inode *inode = dir->d_inode;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+ inode_lock_shared(inode);
+ /* Don't go there if it's already dead */
+ if (unlikely(IS_DEADDIR(inode)))
+ goto out;
+again:
+ dentry = d_alloc_parallel(dir, name, &wq);
+ if (IS_ERR(dentry))
+ goto out;
+ if (unlikely(!d_in_lookup(dentry))) {
if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
!(flags & LOOKUP_NO_REVAL)) {
int error = d_revalidate(dentry, flags);
if (unlikely(error <= 0)) {
- if (!error)
+ if (!error) {
d_invalidate(dentry);
+ dput(dentry);
+ goto again;
+ }
dput(dentry);
dentry = ERR_PTR(error);
}
}
- if (dentry) {
- inode_unlock(dir->d_inode);
- return dentry;
+ } else {
+ old = inode->i_op->lookup(inode, dentry, flags);
+ d_lookup_done(dentry);
+ if (unlikely(old)) {
+ dput(dentry);
+ dentry = old;
}
}
- dentry = d_alloc(dir, name);
- if (unlikely(!dentry)) {
- inode_unlock(dir->d_inode);
- return ERR_PTR(-ENOMEM);
- }
- dentry = lookup_real(dir->d_inode, dentry, flags);
- inode_unlock(dir->d_inode);
+out:
+ inode_unlock_shared(inode);
return dentry;
}
@@ -2697,7 +2707,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
return NULL;
}
- mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+ mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
p = d_ancestor(p2, p1);
if (p) {
@@ -2724,7 +2734,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
inode_unlock(p1->d_inode);
if (p1 != p2) {
inode_unlock(p2->d_inode);
- mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
+ mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
}
}
EXPORT_SYMBOL(unlock_rename);
@@ -2856,143 +2866,56 @@ static int may_o_create(struct path *dir, struct dentry *dentry, umode_t mode)
static int atomic_open(struct nameidata *nd, struct dentry *dentry,
struct path *path, struct file *file,
const struct open_flags *op,
- bool got_write, bool need_lookup,
+ int open_flag, umode_t mode,
int *opened)
{
+ struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
struct inode *dir = nd->path.dentry->d_inode;
- unsigned open_flag = open_to_namei_flags(op->open_flag);
- umode_t mode;
int error;
- int acc_mode;
- int create_error = 0;
- struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
- bool excl;
-
- BUG_ON(dentry->d_inode);
-
- /* Don't create child dentry for a dead directory. */
- if (unlikely(IS_DEADDIR(dir))) {
- error = -ENOENT;
- goto out;
- }
- mode = op->mode;
- if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
- mode &= ~current_umask();
-
- excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
- if (excl)
+ if (!(~open_flag & (O_EXCL | O_CREAT))) /* both O_EXCL and O_CREAT */
open_flag &= ~O_TRUNC;
- /*
- * Checking write permission is tricky, bacuse we don't know if we are
- * going to actually need it: O_CREAT opens should work as long as the
- * file exists. But checking existence breaks atomicity. The trick is
- * to check access and if not granted clear O_CREAT from the flags.
- *
- * Another problem is returing the "right" error value (e.g. for an
- * O_EXCL open we want to return EEXIST not EROFS).
- */
- if (((open_flag & (O_CREAT | O_TRUNC)) ||
- (open_flag & O_ACCMODE) != O_RDONLY) && unlikely(!got_write)) {
- if (!(open_flag & O_CREAT)) {
- /*
- * No O_CREATE -> atomicity not a requirement -> fall
- * back to lookup + open
- */
- goto no_open;
- } else if (open_flag & (O_EXCL | O_TRUNC)) {
- /* Fall back and fail with the right error */
- create_error = -EROFS;
- goto no_open;
- } else {
- /* No side effects, safe to clear O_CREAT */
- create_error = -EROFS;
- open_flag &= ~O_CREAT;
- }
- }
-
- if (open_flag & O_CREAT) {
- error = may_o_create(&nd->path, dentry, mode);
- if (error) {
- create_error = error;
- if (open_flag & O_EXCL)
- goto no_open;
- open_flag &= ~O_CREAT;
- }
- }
-
if (nd->flags & LOOKUP_DIRECTORY)
open_flag |= O_DIRECTORY;
file->f_path.dentry = DENTRY_NOT_SET;
file->f_path.mnt = nd->path.mnt;
- error = dir->i_op->atomic_open(dir, dentry, file, open_flag, mode,
- opened);
- if (error < 0) {
- if (create_error && error == -ENOENT)
- error = create_error;
- goto out;
- }
-
- if (error) { /* returned 1, that is */
+ error = dir->i_op->atomic_open(dir, dentry, file,
+ open_to_namei_flags(open_flag),
+ mode, opened);
+ d_lookup_done(dentry);
+ if (!error) {
+ /*
+ * We didn't have the inode before the open, so check open
+ * permission here.
+ */
+ int acc_mode = op->acc_mode;
+ if (*opened & FILE_CREATED) {
+ WARN_ON(!(open_flag & O_CREAT));
+ fsnotify_create(dir, dentry);
+ acc_mode = 0;
+ }
+ error = may_open(&file->f_path, acc_mode, open_flag);
+ if (WARN_ON(error > 0))
+ error = -EINVAL;
+ } else if (error > 0) {
if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
error = -EIO;
- goto out;
- }
- if (file->f_path.dentry) {
- dput(dentry);
- dentry = file->f_path.dentry;
- }
- if (*opened & FILE_CREATED)
- fsnotify_create(dir, dentry);
- if (!dentry->d_inode) {
- WARN_ON(*opened & FILE_CREATED);
- if (create_error) {
- error = create_error;
- goto out;
- }
} else {
- if (excl && !(*opened & FILE_CREATED)) {
- error = -EEXIST;
- goto out;
+ if (file->f_path.dentry) {
+ dput(dentry);
+ dentry = file->f_path.dentry;
}
+ if (*opened & FILE_CREATED)
+ fsnotify_create(dir, dentry);
+ path->dentry = dentry;
+ path->mnt = nd->path.mnt;
+ return 1;
}
- goto looked_up;
- }
-
- /*
- * We didn't have the inode before the open, so check open permission
- * here.
- */
- acc_mode = op->acc_mode;
- if (*opened & FILE_CREATED) {
- WARN_ON(!(open_flag & O_CREAT));
- fsnotify_create(dir, dentry);
- acc_mode = 0;
}
- error = may_open(&file->f_path, acc_mode, open_flag);
- if (error)
- fput(file);
-
-out:
dput(dentry);
return error;
-
-no_open:
- if (need_lookup) {
- dentry = lookup_real(dir, dentry, nd->flags);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
- }
- if (create_error && !dentry->d_inode) {
- error = create_error;
- goto out;
- }
-looked_up:
- path->dentry = dentry;
- path->mnt = nd->path.mnt;
- return 1;
}
/*
@@ -3020,62 +2943,118 @@ static int lookup_open(struct nameidata *nd, struct path *path,
{
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
+ int open_flag = op->open_flag;
struct dentry *dentry;
- int error;
- bool need_lookup = false;
+ int error, create_error = 0;
+ umode_t mode = op->mode;
+ DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+
+ if (unlikely(IS_DEADDIR(dir_inode)))
+ return -ENOENT;
*opened &= ~FILE_CREATED;
- dentry = lookup_dcache(&nd->last, dir, nd->flags);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+ dentry = d_lookup(dir, &nd->last);
+ for (;;) {
+ if (!dentry) {
+ dentry = d_alloc_parallel(dir, &nd->last, &wq);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ }
+ if (d_in_lookup(dentry))
+ break;
- if (!dentry) {
- dentry = d_alloc(dir, &nd->last);
- if (unlikely(!dentry))
- return -ENOMEM;
- need_lookup = true;
- } else if (dentry->d_inode) {
+ if (!(dentry->d_flags & DCACHE_OP_REVALIDATE))
+ break;
+
+ error = d_revalidate(dentry, nd->flags);
+ if (likely(error > 0))
+ break;
+ if (error)
+ goto out_dput;
+ d_invalidate(dentry);
+ dput(dentry);
+ dentry = NULL;
+ }
+ if (dentry->d_inode) {
/* Cached positive dentry: will open in f_op->open */
goto out_no_open;
}
- if ((nd->flags & LOOKUP_OPEN) && dir_inode->i_op->atomic_open) {
- return atomic_open(nd, dentry, path, file, op, got_write,
- need_lookup, opened);
+ /*
+ * Checking write permission is tricky, bacuse we don't know if we are
+ * going to actually need it: O_CREAT opens should work as long as the
+ * file exists. But checking existence breaks atomicity. The trick is
+ * to check access and if not granted clear O_CREAT from the flags.
+ *
+ * Another problem is returing the "right" error value (e.g. for an
+ * O_EXCL open we want to return EEXIST not EROFS).
+ */
+ if (open_flag & O_CREAT) {
+ if (!IS_POSIXACL(dir->d_inode))
+ mode &= ~current_umask();
+ if (unlikely(!got_write)) {
+ create_error = -EROFS;
+ open_flag &= ~O_CREAT;
+ if (open_flag & (O_EXCL | O_TRUNC))
+ goto no_open;
+ /* No side effects, safe to clear O_CREAT */
+ } else {
+ create_error = may_o_create(&nd->path, dentry, mode);
+ if (create_error) {
+ open_flag &= ~O_CREAT;
+ if (open_flag & O_EXCL)
+ goto no_open;
+ }
+ }
+ } else if ((open_flag & (O_TRUNC|O_WRONLY|O_RDWR)) &&
+ unlikely(!got_write)) {
+ /*
+ * No O_CREATE -> atomicity not a requirement -> fall
+ * back to lookup + open
+ */
+ goto no_open;
}
- if (need_lookup) {
- BUG_ON(dentry->d_inode);
+ if (dir_inode->i_op->atomic_open) {
+ error = atomic_open(nd, dentry, path, file, op, open_flag,
+ mode, opened);
+ if (unlikely(error == -ENOENT) && create_error)
+ error = create_error;
+ return error;
+ }
- dentry = lookup_real(dir_inode, dentry, nd->flags);
- if (IS_ERR(dentry))
- return PTR_ERR(dentry);
+no_open:
+ if (d_in_lookup(dentry)) {
+ struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
+ nd->flags);
+ d_lookup_done(dentry);
+ if (unlikely(res)) {
+ if (IS_ERR(res)) {
+ error = PTR_ERR(res);
+ goto out_dput;
+ }
+ dput(dentry);
+ dentry = res;
+ }
}
/* Negative dentry, just create the file */
- if (!dentry->d_inode && (op->open_flag & O_CREAT)) {
- umode_t mode = op->mode;
- if (!IS_POSIXACL(dir->d_inode))
- mode &= ~current_umask();
- /*
- * This write is needed to ensure that a
- * rw->ro transition does not occur between
- * the time when the file is created and when
- * a permanent write count is taken through
- * the 'struct file' in finish_open().
- */
- if (!got_write) {
- error = -EROFS;
- goto out_dput;
- }
+ if (!dentry->d_inode && (open_flag & O_CREAT)) {
*opened |= FILE_CREATED;
- error = security_path_mknod(&nd->path, dentry, mode, 0);
- if (error)
+ audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
+ if (!dir_inode->i_op->create) {
+ error = -EACCES;
goto out_dput;
- error = vfs_create(dir->d_inode, dentry, mode,
- nd->flags & LOOKUP_EXCL);
+ }
+ error = dir_inode->i_op->create(dir_inode, dentry, mode,
+ open_flag & O_EXCL);
if (error)
goto out_dput;
+ fsnotify_create(dir_inode, dentry);
+ }
+ if (unlikely(create_error) && !dentry->d_inode) {
+ error = create_error;
+ goto out_dput;
}
out_no_open:
path->dentry = dentry;
@@ -3147,7 +3126,7 @@ static int do_last(struct nameidata *nd,
}
retry_lookup:
- if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
+ if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
error = mnt_want_write(nd->path.mnt);
if (!error)
got_write = true;
@@ -3157,9 +3136,15 @@ retry_lookup:
* dropping this one anyway.
*/
}
- inode_lock(dir->d_inode);
+ if (open_flag & O_CREAT)
+ inode_lock(dir->d_inode);
+ else
+ inode_lock_shared(dir->d_inode);
error = lookup_open(nd, &path, file, op, got_write, opened);
- inode_unlock(dir->d_inode);
+ if (open_flag & O_CREAT)
+ inode_unlock(dir->d_inode);
+ else
+ inode_unlock_shared(dir->d_inode);
if (error <= 0) {
if (error)
@@ -3239,10 +3224,6 @@ finish_open:
return error;
}
audit_inode(nd->name, nd->path.dentry, 0);
- if (unlikely(d_is_symlink(nd->path.dentry)) && !(open_flag & O_PATH)) {
- error = -ELOOP;
- goto out;
- }
error = -EISDIR;
if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
@@ -3259,11 +3240,9 @@ finish_open:
got_write = true;
}
finish_open_created:
- if (likely(!(open_flag & O_PATH))) {
- error = may_open(&nd->path, acc_mode, open_flag);
- if (error)
- goto out;
- }
+ error = may_open(&nd->path, acc_mode, open_flag);
+ if (error)
+ goto out;
BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
error = vfs_open(&nd->path, file, current_cred());
if (!error) {
@@ -3275,18 +3254,13 @@ finish_open_created:
}
opened:
error = open_check_o_direct(file);
- if (error)
- goto exit_fput;
- error = ima_file_check(file, op->acc_mode, *opened);
- if (error)
- goto exit_fput;
-
- if (will_truncate) {
+ if (!error)
+ error = ima_file_check(file, op->acc_mode, *opened);
+ if (!error && will_truncate)
error = handle_truncate(file);
- if (error)
- goto exit_fput;
- }
out:
+ if (unlikely(error) && (*opened & FILE_OPENED))
+ fput(file);
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
@@ -3296,10 +3270,6 @@ out:
path_put(&save_parent);
return error;
-exit_fput:
- fput(file);
- goto out;
-
stale_open:
/* If no saved parent or already retried then can't retry */
if (!save_parent.dentry || retried)
@@ -3377,6 +3347,18 @@ out:
return error;
}
+static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
+{
+ struct path path;
+ int error = path_lookupat(nd, flags, &path);
+ if (!error) {
+ audit_inode(nd->name, path.dentry, 0);
+ error = vfs_open(&path, file, current_cred());
+ path_put(&path);
+ }
+ return error;
+}
+
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
@@ -3396,6 +3378,13 @@ static struct file *path_openat(struct nameidata *nd,
goto out2;
}
+ if (unlikely(file->f_flags & O_PATH)) {
+ error = do_o_path(nd, flags, file);
+ if (!error)
+ opened |= FILE_OPENED;
+ goto out2;
+ }
+
s = path_init(nd, flags);
if (IS_ERR(s)) {
put_filp(file);