From 00b0c9b82663ac42e5a09f58ce960f81f29d64ee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 14 Dec 2017 21:27:55 -0500
Subject: Add primitives for manipulating bitfields both in host- and
 fixed-endian.

The following primitives are defined in linux/bitfield.h:

* u32 le32_get_bits(__le32 val, u32 field) extracts the contents of the
  bitfield specified by @field in little-endian 32bit object @val and
  converts it to host-endian.

* void le32p_replace_bits(__le32 *p, u32 v, u32 field) replaces
  the contents of the bitfield specified by @field in little-endian
  32bit object pointed to by @p with the value of @v.  New value is
  given in host-endian and stored as little-endian.

* __le32 le32_replace_bits(__le32 old, u32 v, u32 field) is equivalent to
  ({__le32 tmp = old; le32p_replace_bits(&tmp, v, field); tmp;})
  In other words, instead of modifying an object in memory, it takes
  the initial value and returns the modified one.

* __le32 le32_encode_bits(u32 v, u32 field) is equivalent to
  le32_replace_bits(0, v, field).  In other words, it returns a little-endian
  32bit object with the bitfield specified by @field containing the
  value of @v and all bits outside that bitfield being zero.

Such set of helpers is defined for each of little-, big- and host-endian
types; e.g. u64_get_bits(val, field) will return the contents of the bitfield
specified by @field in host-endian 64bit object @val, etc.  Of course, for
host-endian no conversion is involved.

Fields to access are specified as GENMASK() values - an N-bit field
starting at bit #M is encoded as GENMASK(M + N - 1, M).  Note that
bit numbers refer to endianness of the object we are working with -
e.g. GENMASK(11, 0) in __be16 refers to the second byte and the lower
4 bits of the first byte.  In __le16 it would refer to the first byte
and the lower 4 bits of the second byte, etc.

Field specification must be a constant; __builtin_constant_p() doesn't
have to be true for it, but compiler must be able to evaluate it at
build time.  If it cannot or if the value does not encode any bitfield,
the build will fail.

If the value being stored in a bitfield is a constant that does not fit
into that bitfield, a warning will be generated at compile time.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/bitfield.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'include')

diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 1030651f8309..cf2588d81148 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -16,6 +16,7 @@
 #define _LINUX_BITFIELD_H
 
 #include <linux/build_bug.h>
+#include <asm/byteorder.h>
 
 /*
  * Bitfield access macros
@@ -103,4 +104,49 @@
 		(typeof(_mask))(((_reg) & (_mask)) >> __bf_shf(_mask));	\
 	})
 
+extern void __compiletime_warning("value doesn't fit into mask")
+__field_overflow(void);
+extern void __compiletime_error("bad bitfield mask")
+__bad_mask(void);
+static __always_inline u64 field_multiplier(u64 field)
+{
+	if ((field | (field - 1)) & ((field | (field - 1)) + 1))
+		__bad_mask();
+	return field & -field;
+}
+static __always_inline u64 field_mask(u64 field)
+{
+	return field / field_multiplier(field);
+}
+#define ____MAKE_OP(type,base,to,from)					\
+static __always_inline __##type type##_encode_bits(base v, base field)	\
+{									\
+        if (__builtin_constant_p(v) &&	(v & ~field_multiplier(field)))	\
+			    __field_overflow();				\
+	return to((v & field_mask(field)) * field_multiplier(field));	\
+}									\
+static __always_inline __##type type##_replace_bits(__##type old,	\
+					base val, base field)		\
+{									\
+	return (old & ~to(field)) | type##_encode_bits(val, field);	\
+}									\
+static __always_inline void type##p_replace_bits(__##type *p,		\
+					base val, base field)		\
+{									\
+	*p = (*p & ~to(field)) | type##_encode_bits(val, field);	\
+}									\
+static __always_inline base type##_get_bits(__##type v, base field)	\
+{									\
+	return (from(v) & field)/field_multiplier(field);		\
+}
+#define __MAKE_OP(size)							\
+	____MAKE_OP(le##size,u##size,cpu_to_le##size,le##size##_to_cpu)	\
+	____MAKE_OP(be##size,u##size,cpu_to_be##size,be##size##_to_cpu)	\
+	____MAKE_OP(u##size,u##size,,)
+__MAKE_OP(16)
+__MAKE_OP(32)
+__MAKE_OP(64)
+#undef __MAKE_OP
+#undef ____MAKE_OP
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From f1ee616214cb22410e939d963bbb2349c2570f02 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 21 Dec 2017 09:45:40 +1100
Subject: VFS: don't keep disconnected dentries on d_anon

The original purpose of the per-superblock d_anon list was to
keep disconnected dentries in the cache between consecutive
requests to the NFS server.  Dentries can be disconnected if
a client holds a file open and repeatedly performs IO on it,
and if the server drops the dentry, whether due to memory
pressure, server restart, or "echo 3 > /proc/sys/vm/drop_caches".

This purpose was thwarted by commit 75a6f82a0d10 ("freeing unlinked
file indefinitely delayed") which caused disconnected dentries
to be freed as soon as their refcount reached zero.

This means that, when a dentry being used by nfsd gets disconnected, a
new one needs to be allocated for every request (unless requests
overlap).  As the dentry has no name, no parent, and no children,
there is little of value to cache.  As small memory allocations are
typically fast (from per-cpu free lists) this likely has little cost.

This means that the original purpose of s_anon is no longer relevant:
there is no longer any need to keep disconnected dentries on a list so
they appear to be hashed.

However, s_anon now has a new use.  When you mount an NFS filesystem,
the dentry stored in s_root is just a placebo.  The "real" root dentry
is allocated using d_obtain_root() and so it kept on the s_anon list.
I don't know the reason for this, but suspect it related to NFSv4
where a mount of "server:/some/path" require NFS to look up the root
filehandle on the server, then walk down "/some" and "/path" to get
the filehandle to mount.

Whatever the reason, NFS depends on the s_anon list and on
shrink_dcache_for_umount() pruning all dentries on this list.  So we
cannot simply remove s_anon.

We could just leave the code unchanged, but apart from that being
potentially confusing, the (unfair) bit-spin-lock which protects
s_anon can become a bottle neck when lots of disconnected dentries are
being created.

So this patch renames s_anon to s_roots, and stops storing
disconnected dentries on the list.  Only dentries obtained with
d_obtain_root() are now stored on this list.  There are many fewer of
these (only NFS and NILFS2 use the call, and only during filesystem
mount) so contention on the bit-lock will not be a problem.

Possibly an alternate solution should be found for NFS and NILFS2, but
that would require understanding their needs first.

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/nfs/Exporting            | 27 +++++++++++++++-------
 .../staging/lustre/lustre/llite/llite_internal.h   | 10 +-------
 fs/dcache.c                                        | 22 ++++++++++--------
 fs/super.c                                         |  2 +-
 include/linux/fs.h                                 |  2 +-
 5 files changed, 34 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index 520a4becb75c..63889149f532 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -56,13 +56,25 @@ a/ A dentry flag DCACHE_DISCONNECTED which is set on
    any dentry that might not be part of the proper prefix.
    This is set when anonymous dentries are created, and cleared when a
    dentry is noticed to be a child of a dentry which is in the proper
-   prefix. 
-
-b/ A per-superblock list "s_anon" of dentries which are the roots of
-   subtrees that are not in the proper prefix.  These dentries, as
-   well as the proper prefix, need to be released at unmount time.  As
-   these dentries will not be hashed, they are linked together on the
-   d_hash list_head.
+   prefix.  If the refcount on a dentry with this flag set
+   becomes zero, the dentry is immediately discarded, rather than being
+   kept in the dcache.  If a dentry that is not already in the dcache
+   is repeatedly accessed by filehandle (as NFSD might do), an new dentry
+   will be a allocated for each access, and discarded at the end of
+   the access.
+
+   Note that such a dentry can acquire children, name, ancestors, etc.
+   without losing DCACHE_DISCONNECTED - that flag is only cleared when
+   subtree is successfully reconnected to root.  Until then dentries
+   in such subtree are retained only as long as there are references;
+   refcount reaching zero means immediate eviction, same as for unhashed
+   dentries.  That guarantees that we won't need to hunt them down upon
+   umount.
+
+b/ A primitive for creation of secondary roots - d_obtain_root(inode).
+   Those do _not_ bear DCACHE_DISCONNECTED.  They are placed on the
+   per-superblock list (->s_roots), so they can be located at umount
+   time for eviction purposes.
 
 c/ Helper routines to allocate anonymous dentries, and to help attach
    loose directory dentries at lookup time. They are:
@@ -77,7 +89,6 @@ c/ Helper routines to allocate anonymous dentries, and to help attach
       (such as an anonymous one created by d_obtain_alias), if appropriate.
       It returns NULL when the passed-in dentry is used, following the calling
       convention of ->lookup.
-
  
 Filesystem Issues
 -----------------
diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h
index b133fd00c08c..0d62fcf016dc 100644
--- a/drivers/staging/lustre/lustre/llite/llite_internal.h
+++ b/drivers/staging/lustre/lustre/llite/llite_internal.h
@@ -1296,15 +1296,7 @@ static inline void d_lustre_invalidate(struct dentry *dentry, int nested)
 	spin_lock_nested(&dentry->d_lock,
 			 nested ? DENTRY_D_LOCK_NESTED : DENTRY_D_LOCK_NORMAL);
 	ll_d2d(dentry)->lld_invalid = 1;
-	/*
-	 * We should be careful about dentries created by d_obtain_alias().
-	 * These dentries are not put in the dentry tree, instead they are
-	 * linked to sb->s_anon through dentry->d_hash.
-	 * shrink_dcache_for_umount() shrinks the tree and sb->s_anon list.
-	 * If we unhashed such a dentry, unmount would not be able to find
-	 * it and busy inodes would be reported.
-	 */
-	if (d_count(dentry) == 0 && !(dentry->d_flags & DCACHE_DISCONNECTED))
+	if (d_count(dentry) == 0)
 		__d_drop(dentry);
 	spin_unlock(&dentry->d_lock);
 }
diff --git a/fs/dcache.c b/fs/dcache.c
index b99a39206930..17e6b84b9656 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -48,8 +48,8 @@
  *   - i_dentry, d_u.d_alias, d_inode of aliases
  * dcache_hash_bucket lock protects:
  *   - the dcache hash table
- * s_anon bl list spinlock protects:
- *   - the s_anon list (see __d_drop)
+ * s_roots bl list spinlock protects:
+ *   - the s_roots list (see __d_drop)
  * dentry->d_sb->s_dentry_lru_lock protects:
  *   - the dcache lru lists and counters
  * d_lock protects:
@@ -67,7 +67,7 @@
  *   dentry->d_lock
  *     dentry->d_sb->s_dentry_lru_lock
  *     dcache_hash_bucket lock
- *     s_anon lock
+ *     s_roots lock
  *
  * If there is an ancestor relationship:
  * dentry->d_parent->...->d_parent->d_lock
@@ -476,10 +476,10 @@ void __d_drop(struct dentry *dentry)
 		/*
 		 * Hashed dentries are normally on the dentry hashtable,
 		 * with the exception of those newly allocated by
-		 * d_obtain_alias, which are always IS_ROOT:
+		 * d_obtain_root, which are always IS_ROOT:
 		 */
 		if (unlikely(IS_ROOT(dentry)))
-			b = &dentry->d_sb->s_anon;
+			b = &dentry->d_sb->s_roots;
 		else
 			b = d_hash(dentry->d_name.hash);
 
@@ -1499,8 +1499,8 @@ void shrink_dcache_for_umount(struct super_block *sb)
 	sb->s_root = NULL;
 	do_one_tree(dentry);
 
-	while (!hlist_bl_empty(&sb->s_anon)) {
-		dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
+	while (!hlist_bl_empty(&sb->s_roots)) {
+		dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash));
 		do_one_tree(dentry);
 	}
 }
@@ -1964,9 +1964,11 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
 	spin_lock(&tmp->d_lock);
 	__d_set_inode_and_type(tmp, inode, add_flags);
 	hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry);
-	hlist_bl_lock(&tmp->d_sb->s_anon);
-	hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
-	hlist_bl_unlock(&tmp->d_sb->s_anon);
+	if (!disconnected) {
+		hlist_bl_lock(&tmp->d_sb->s_roots);
+		hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_roots);
+		hlist_bl_unlock(&tmp->d_sb->s_roots);
+	}
 	spin_unlock(&tmp->d_lock);
 	spin_unlock(&inode->i_lock);
 
diff --git a/fs/super.c b/fs/super.c
index d4e33e8f1e6f..9ea66601d664 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -207,7 +207,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	if (s->s_user_ns != &init_user_ns)
 		s->s_iflags |= SB_I_NODEV;
 	INIT_HLIST_NODE(&s->s_instances);
-	INIT_HLIST_BL_HEAD(&s->s_anon);
+	INIT_HLIST_BL_HEAD(&s->s_roots);
 	mutex_init(&s->s_sync_lock);
 	INIT_LIST_HEAD(&s->s_inodes);
 	spin_lock_init(&s->s_inode_list_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2995a271ec46..6276f8315e5b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1359,7 +1359,7 @@ struct super_block {
 
 	const struct fscrypt_operations	*s_cop;
 
-	struct hlist_bl_head	s_anon;		/* anonymous dentries for (nfs) exporting */
+	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;
 	struct backing_dev_info *s_bdi;
-- 
cgit v1.2.3-59-g8ed1b


From 7d815165c1a64da9fd1b0f4ac8d97ba938ff1d71 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 6 Jan 2018 09:45:42 -0800
Subject: eventfd: convert to use anon_inode_getfd()

Nothing actually calls eventfd_file_create() besides the eventfd2()
system call itself.  So simplify things by folding it into the system
call and using anon_inode_getfd() instead of anon_inode_getfile().  This
removes over 40 lines with no change in functionality.

(eventfd_file_create() was apparently added years ago for KVM irqfd's,
but was never used.)

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/eventfd.c            | 53 +++++++------------------------------------------
 include/linux/eventfd.h |  5 -----
 2 files changed, 7 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2fb4eadaa118..4167e670ed4d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -412,72 +412,33 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
 }
 EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 
-/**
- * eventfd_file_create - Creates an eventfd file pointer.
- * @count: Initial eventfd counter value.
- * @flags: Flags for the eventfd file.
- *
- * This function creates an eventfd file pointer, w/out installing it into
- * the fd table. This is useful when the eventfd file is used during the
- * initialization of data structures that require extra setup after the eventfd
- * creation. So the eventfd creation is split into the file pointer creation
- * phase, and the file descriptor installation phase.
- * In this way races with userspace closing the newly installed file descriptor
- * can be avoided.
- * Returns an eventfd file pointer, or a proper error pointer.
- */
-struct file *eventfd_file_create(unsigned int count, int flags)
+SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {
-	struct file *file;
 	struct eventfd_ctx *ctx;
+	int fd;
 
 	/* Check the EFD_* constants for consistency.  */
 	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
 
 	if (flags & ~EFD_FLAGS_SET)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	kref_init(&ctx->kref);
 	init_waitqueue_head(&ctx->wqh);
 	ctx->count = count;
 	ctx->flags = flags;
 
-	file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
-				  O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
-	if (IS_ERR(file))
+	fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
+			      O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
+	if (fd < 0)
 		eventfd_free_ctx(ctx);
 
-	return file;
-}
-
-SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
-{
-	int fd, error;
-	struct file *file;
-
-	error = get_unused_fd_flags(flags & EFD_SHARED_FCNTL_FLAGS);
-	if (error < 0)
-		return error;
-	fd = error;
-
-	file = eventfd_file_create(count, flags);
-	if (IS_ERR(file)) {
-		error = PTR_ERR(file);
-		goto err_put_unused_fd;
-	}
-	fd_install(fd, file);
-
 	return fd;
-
-err_put_unused_fd:
-	put_unused_fd(fd);
-
-	return error;
 }
 
 SYSCALL_DEFINE1(eventfd, unsigned int, count)
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 60b2985e8a18..15826192cc23 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -30,7 +30,6 @@ struct file;
 
 #ifdef CONFIG_EVENTFD
 
-struct file *eventfd_file_create(unsigned int count, int flags);
 struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
 void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
@@ -47,10 +46,6 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w
  * Ugly ugly ugly error layer to support modules that uses eventfd but
  * pretend to work in !CONFIG_EVENTFD configurations. Namely, AIO.
  */
-static inline struct file *eventfd_file_create(unsigned int count, int flags)
-{
-	return ERR_PTR(-ENOSYS);
-}
 
 static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
 {
-- 
cgit v1.2.3-59-g8ed1b


From b6364572d641c8eba9eab9bcc31d8962f96ddf15 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 6 Jan 2018 09:45:43 -0800
Subject: eventfd: fold eventfd_ctx_read() into eventfd_read()

eventfd_ctx_read() is not used outside of eventfd.c, so unexport it and
fold it into eventfd_read().  This slightly simplifies the code and
makes it more analogous to eventfd_write().

(eventfd_ctx_read() was apparently added years ago for KVM irqfd's, but
was never used.)

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/eventfd.c            | 53 ++++++++++++++-----------------------------------
 include/linux/eventfd.h |  7 -------
 2 files changed, 15 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 4167e670ed4d..6138d2b5cdeb 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -207,36 +207,27 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w
 }
 EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
 
-/**
- * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
- * @ctx: [in] Pointer to eventfd context.
- * @no_wait: [in] Different from zero if the operation should not block.
- * @cnt: [out] Pointer to the 64-bit counter value.
- *
- * Returns %0 if successful, or the following error codes:
- *
- *  - -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
- *  - -ERESTARTSYS : A signal interrupted the wait operation.
- *
- * If @no_wait is zero, the function might sleep until the eventfd internal
- * counter becomes greater than zero.
- */
-ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
+static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
 {
+	struct eventfd_ctx *ctx = file->private_data;
 	ssize_t res;
+	__u64 ucnt = 0;
 	DECLARE_WAITQUEUE(wait, current);
 
+	if (count < sizeof(ucnt))
+		return -EINVAL;
+
 	spin_lock_irq(&ctx->wqh.lock);
-	*cnt = 0;
 	res = -EAGAIN;
 	if (ctx->count > 0)
-		res = 0;
-	else if (!no_wait) {
+		res = sizeof(ucnt);
+	else if (!(file->f_flags & O_NONBLOCK)) {
 		__add_wait_queue(&ctx->wqh, &wait);
 		for (;;) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (ctx->count > 0) {
-				res = 0;
+				res = sizeof(ucnt);
 				break;
 			}
 			if (signal_pending(current)) {
@@ -250,31 +241,17 @@ ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt)
 		__remove_wait_queue(&ctx->wqh, &wait);
 		__set_current_state(TASK_RUNNING);
 	}
-	if (likely(res == 0)) {
-		eventfd_ctx_do_read(ctx, cnt);
+	if (likely(res > 0)) {
+		eventfd_ctx_do_read(ctx, &ucnt);
 		if (waitqueue_active(&ctx->wqh))
 			wake_up_locked_poll(&ctx->wqh, POLLOUT);
 	}
 	spin_unlock_irq(&ctx->wqh.lock);
 
-	return res;
-}
-EXPORT_SYMBOL_GPL(eventfd_ctx_read);
-
-static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
-			    loff_t *ppos)
-{
-	struct eventfd_ctx *ctx = file->private_data;
-	ssize_t res;
-	__u64 cnt;
-
-	if (count < sizeof(cnt))
-		return -EINVAL;
-	res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt);
-	if (res < 0)
-		return res;
+	if (res > 0 && put_user(ucnt, (__u64 __user *)buf))
+		return -EFAULT;
 
-	return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt);
+	return res;
 }
 
 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 15826192cc23..566fef14d0a6 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -36,7 +36,6 @@ struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n);
-ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt);
 int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
 				  __u64 *cnt);
 
@@ -62,12 +61,6 @@ static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
 
 }
 
-static inline ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait,
-				       __u64 *cnt)
-{
-	return -ENOSYS;
-}
-
 static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
 						wait_queue_entry_t *wait, __u64 *cnt)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 105f2b7096075eacb6d2c83a6e00b652c2951063 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Sat, 6 Jan 2018 09:45:44 -0800
Subject: eventfd: fold eventfd_ctx_get() into eventfd_ctx_fileget()

eventfd_ctx_get() is not used outside of eventfd.c, so unexport it and
fold it into eventfd_ctx_fileget().

(eventfd_ctx_get() was apparently added years ago for KVM irqfd's, but
was never used.)

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/eventfd.c            | 21 ++++++---------------
 include/linux/eventfd.h |  2 +-
 2 files changed, 7 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 6138d2b5cdeb..bc0105ae253f 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -79,25 +79,12 @@ static void eventfd_free(struct kref *kref)
 	eventfd_free_ctx(ctx);
 }
 
-/**
- * eventfd_ctx_get - Acquires a reference to the internal eventfd context.
- * @ctx: [in] Pointer to the eventfd context.
- *
- * Returns: In case of success, returns a pointer to the eventfd context.
- */
-struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx)
-{
-	kref_get(&ctx->kref);
-	return ctx;
-}
-EXPORT_SYMBOL_GPL(eventfd_ctx_get);
-
 /**
  * eventfd_ctx_put - Releases a reference to the internal eventfd context.
  * @ctx: [in] Pointer to eventfd context.
  *
  * The eventfd context reference must have been previously acquired either
- * with eventfd_ctx_get() or eventfd_ctx_fdget().
+ * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
  */
 void eventfd_ctx_put(struct eventfd_ctx *ctx)
 {
@@ -382,10 +369,14 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
  */
 struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
 {
+	struct eventfd_ctx *ctx;
+
 	if (file->f_op != &eventfd_fops)
 		return ERR_PTR(-EINVAL);
 
-	return eventfd_ctx_get(file->private_data);
+	ctx = file->private_data;
+	kref_get(&ctx->kref);
+	return ctx;
 }
 EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);
 
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index 566fef14d0a6..7094718b653b 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -26,11 +26,11 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+struct eventfd_ctx;
 struct file;
 
 #ifdef CONFIG_EVENTFD
 
-struct eventfd_ctx *eventfd_ctx_get(struct eventfd_ctx *ctx);
 void eventfd_ctx_put(struct eventfd_ctx *ctx);
 struct file *eventfd_fget(int fd);
 struct eventfd_ctx *eventfd_ctx_fdget(int fd);
-- 
cgit v1.2.3-59-g8ed1b


From 50fd2f298bef9d1f69ac755f1fdf70cd98746be2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jan 2018 13:06:15 -0500
Subject: new primitive: vmemdup_user()

similar to memdup_user(), but does *not* guarantee that result will
be physically contiguous; use only in cases where that's not a requirement
and free it with kvfree().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/string.h |  1 +
 mm/util.c              | 29 ++++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/string.h b/include/linux/string.h
index 410ecf17de3c..12d5429de0c8 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -11,6 +11,7 @@
 
 extern char *strndup_user(const char __user *, long);
 extern void *memdup_user(const void __user *, size_t);
+extern void *vmemdup_user(const void __user *, size_t);
 extern void *memdup_user_nul(const void __user *, size_t);
 
 /*
diff --git a/mm/util.c b/mm/util.c
index 4b93ffa6df96..c1250501364f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -150,7 +150,8 @@ EXPORT_SYMBOL(kmemdup_nul);
  * @src: source address in user space
  * @len: number of bytes to copy
  *
- * Returns an ERR_PTR() on failure.
+ * Returns an ERR_PTR() on failure.  Result is physically
+ * contiguous, to be freed by kfree().
  */
 void *memdup_user(const void __user *src, size_t len)
 {
@@ -169,6 +170,32 @@ void *memdup_user(const void __user *src, size_t len)
 }
 EXPORT_SYMBOL(memdup_user);
 
+/**
+ * vmemdup_user - duplicate memory region from user space
+ *
+ * @src: source address in user space
+ * @len: number of bytes to copy
+ *
+ * Returns an ERR_PTR() on failure.  Result may be not
+ * physically contiguous.  Use kvfree() to free.
+ */
+void *vmemdup_user(const void __user *src, size_t len)
+{
+	void *p;
+
+	p = kvmalloc(len, GFP_USER);
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(p, src, len)) {
+		kvfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return p;
+}
+EXPORT_SYMBOL(vmemdup_user);
+
 /*
  * strndup_user - duplicate an existing string from user space
  * @s: The string to duplicate
-- 
cgit v1.2.3-59-g8ed1b


From e1fc742e14e01d84d9693c4aca4ab23da65811fb Mon Sep 17 00:00:00 2001
From: Jürg Billeter <j@bitron.ch>
Date: Fri, 29 Sep 2017 14:07:17 +0200
Subject: fs: add RWF_APPEND
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the per-I/O equivalent of O_APPEND to support atomic append
operations on any open file.

If a file is opened with O_APPEND, pwrite() ignores the offset and
always appends data to the end of the file. RWF_APPEND enables atomic
append and pwrite() with offset on a single file descriptor.

Signed-off-by: Jürg Billeter <j@bitron.ch>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h      | 2 ++
 include/uapi/linux/fs.h | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6276f8315e5b..85c8ddc55760 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3224,6 +3224,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 		ki->ki_flags |= IOCB_DSYNC;
 	if (flags & RWF_SYNC)
 		ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+	if (flags & RWF_APPEND)
+		ki->ki_flags |= IOCB_APPEND;
 	return 0;
 }
 
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 4199f8acbce5..d2a8313fabd7 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -377,7 +377,11 @@ typedef int __bitwise __kernel_rwf_t;
 /* per-IO, return -EAGAIN if operation would block */
 #define RWF_NOWAIT	((__force __kernel_rwf_t)0x00000008)
 
+/* per-IO O_APPEND */
+#define RWF_APPEND	((__force __kernel_rwf_t)0x00000010)
+
 /* mask of flags supported by the kernel */
-#define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT)
+#define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
+			 RWF_APPEND)
 
 #endif /* _UAPI_LINUX_FS_H */
-- 
cgit v1.2.3-59-g8ed1b


From 4bfd054ae11ea061685c4a2a6234fdc8e92fad41 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 16 Jan 2018 21:44:24 -0800
Subject: fs: fold __inode_permission() into inode_permission()

Since commit 9c630ebefeee ("ovl: simplify permission checking"),
overlayfs doesn't call __inode_permission() anymore, which leaves no
users other than inode_permission().  So just fold it back into
inode_permission().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c         | 71 ++++++++++++++++++++----------------------------------
 include/linux/fs.h |  1 -
 2 files changed, 26 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/fs/namei.c b/fs/namei.c
index f0c7a7b9b6ca..29b044022e9c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -390,50 +390,6 @@ static inline int do_inode_permission(struct inode *inode, int mask)
 	return generic_permission(inode, mask);
 }
 
-/**
- * __inode_permission - Check for access rights to a given inode
- * @inode: Inode to check permission on
- * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- *
- * Check for read/write/execute permissions on an inode.
- *
- * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
- *
- * This does not check for a read-only file system.  You probably want
- * inode_permission().
- */
-int __inode_permission(struct inode *inode, int mask)
-{
-	int retval;
-
-	if (unlikely(mask & MAY_WRITE)) {
-		/*
-		 * Nobody gets write access to an immutable file.
-		 */
-		if (IS_IMMUTABLE(inode))
-			return -EPERM;
-
-		/*
-		 * Updating mtime will likely cause i_uid and i_gid to be
-		 * written back improperly if their true value is unknown
-		 * to the vfs.
-		 */
-		if (HAS_UNMAPPED_ID(inode))
-			return -EACCES;
-	}
-
-	retval = do_inode_permission(inode, mask);
-	if (retval)
-		return retval;
-
-	retval = devcgroup_inode_permission(inode, mask);
-	if (retval)
-		return retval;
-
-	return security_inode_permission(inode, mask);
-}
-EXPORT_SYMBOL(__inode_permission);
-
 /**
  * sb_permission - Check superblock-level permissions
  * @sb: Superblock of inode to check permission on
@@ -472,7 +428,32 @@ int inode_permission(struct inode *inode, int mask)
 	retval = sb_permission(inode->i_sb, inode, mask);
 	if (retval)
 		return retval;
-	return __inode_permission(inode, mask);
+
+	if (unlikely(mask & MAY_WRITE)) {
+		/*
+		 * Nobody gets write access to an immutable file.
+		 */
+		if (IS_IMMUTABLE(inode))
+			return -EPERM;
+
+		/*
+		 * Updating mtime will likely cause i_uid and i_gid to be
+		 * written back improperly if their true value is unknown
+		 * to the vfs.
+		 */
+		if (HAS_UNMAPPED_ID(inode))
+			return -EACCES;
+	}
+
+	retval = do_inode_permission(inode, mask);
+	if (retval)
+		return retval;
+
+	retval = devcgroup_inode_permission(inode, mask);
+	if (retval)
+		return retval;
+
+	return security_inode_permission(inode, mask);
 }
 EXPORT_SYMBOL(inode_permission);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 85c8ddc55760..b49251112add 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2699,7 +2699,6 @@ extern sector_t bmap(struct inode *, sector_t);
 #endif
 extern int notify_change(struct dentry *, struct iattr *, struct inode **);
 extern int inode_permission(struct inode *, int);
-extern int __inode_permission(struct inode *, int);
 extern int generic_permission(struct inode *, int);
 extern int __check_sticky(struct inode *dir, struct inode *inode);
 
-- 
cgit v1.2.3-59-g8ed1b


From 01950a349ec254f28bf9ad06e74a166521d213e1 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Tue, 16 Jan 2018 22:25:12 -0800
Subject: fs/buffer.c: fold init_buffer() into init_page_buffers()

Since commit e76004093db1 ("fs/buffer.c: remove unnecessary init
operation after allocating buffer_head"), there are no callers of
init_buffer() outside of init_page_buffers().  So just fold it into
init_page_buffers().

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c                 | 10 ++--------
 include/linux/buffer_head.h |  1 -
 2 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 0736a6a2e2f0..3091801169ce 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -53,13 +53,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
-void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
-{
-	bh->b_end_io = handler;
-	bh->b_private = private;
-}
-EXPORT_SYMBOL(init_buffer);
-
 inline void touch_buffer(struct buffer_head *bh)
 {
 	trace_block_touch_buffer(bh);
@@ -922,7 +915,8 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 
 	do {
 		if (!buffer_mapped(bh)) {
-			init_buffer(bh, NULL, NULL);
+			bh->b_end_io = NULL;
+			bh->b_private = NULL;
 			bh->b_bdev = bdev;
 			bh->b_blocknr = block;
 			if (uptodate)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 8b1bf8d3d4a2..58a82f58e44e 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -151,7 +151,6 @@ void buffer_check_dirty_writeback(struct page *page,
 
 void mark_buffer_dirty(struct buffer_head *bh);
 void mark_buffer_write_io_error(struct buffer_head *bh);
-void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
 void touch_buffer(struct buffer_head *bh);
 void set_bh_page(struct buffer_head *bh,
 		struct page *page, unsigned long offset);
-- 
cgit v1.2.3-59-g8ed1b