aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/vfs_addr.c2
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/file.c21
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c212
-rw-r--r--fs/bfs/inode.c1
-rw-r--r--fs/block_dev.c1
-rw-r--r--fs/btrfs/ctree.c8
-rw-r--r--fs/btrfs/ctree.h5
-rw-r--r--fs/btrfs/disk-io.c2
-rw-r--r--fs/btrfs/extent-tree.c51
-rw-r--r--fs/btrfs/extent_io.c6
-rw-r--r--fs/btrfs/file.c89
-rw-r--r--fs/btrfs/inode.c115
-rw-r--r--fs/btrfs/ordered-data.c7
-rw-r--r--fs/btrfs/qgroup.c2
-rw-r--r--fs/btrfs/send.c171
-rw-r--r--fs/btrfs/tests/inode-tests.c197
-rw-r--r--fs/btrfs/transaction.c42
-rw-r--r--fs/btrfs/tree-log.c2
-rw-r--r--fs/btrfs/volumes.c9
-rw-r--r--fs/btrfs/xattr.c8
-rw-r--r--fs/ceph/file.c3
-rw-r--r--fs/cifs/cifsencrypt.c6
-rw-r--r--fs/cifs/connect.c13
-rw-r--r--fs/cifs/file.c1
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/smb2misc.c2
-rw-r--r--fs/cifs/smb2ops.c3
-rw-r--r--fs/cifs/smb2pdu.c17
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/direct-io.c5
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h4
-rw-r--r--fs/ecryptfs/file.c41
-rw-r--r--fs/ecryptfs/keystore.c2
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inode.c1
-rw-r--r--fs/ext4/page-io.c1
-rw-r--r--fs/f2fs/data.c2
-rw-r--r--fs/fat/inode.c1
-rw-r--r--fs/fs-writeback.c93
-rw-r--r--fs/fuse/cuse.c2
-rw-r--r--fs/fuse/dev.c20
-rw-r--r--fs/fuse/file.c55
-rw-r--r--fs/fuse/fuse_i.h1
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/file.c1
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/brec.c20
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jfs/inode.c2
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/kernfs/file.c1
-rw-r--r--fs/locks.c10
-rw-r--r--fs/namei.c168
-rw-r--r--fs/nfs/client.c2
-rw-r--r--fs/nfs/delegation.c45
-rw-r--r--fs/nfs/dir.c22
-rw-r--r--fs/nfs/direct.c4
-rw-r--r--fs/nfs/file.c12
-rw-r--r--fs/nfs/inode.c111
-rw-r--r--fs/nfs/internal.h1
-rw-r--r--fs/nfs/nfs3proc.c4
-rw-r--r--fs/nfs/nfs3xdr.c5
-rw-r--r--fs/nfs/nfs4client.c9
-rw-r--r--fs/nfs/nfs4proc.c31
-rw-r--r--fs/nfs/nfs4session.h1
-rw-r--r--fs/nfs/nfs4state.c18
-rw-r--r--fs/nfs/proc.c6
-rw-r--r--fs/nfs/write.c30
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/nfsd/blocklayoutxdr.c6
-rw-r--r--fs/nfsd/nfs4layouts.c12
-rw-r--r--fs/nfsd/nfs4proc.c2
-rw-r--r--fs/nfsd/nfs4state.c6
-rw-r--r--fs/nfsd/nfs4xdr.c20
-rw-r--r--fs/nfsd/nfscache.c6
-rw-r--r--fs/nilfs2/btree.c47
-rw-r--r--fs/nilfs2/inode.c2
-rw-r--r--fs/nilfs2/segment.c7
-rw-r--r--fs/notify/fanotify/fanotify.c3
-rw-r--r--fs/ntfs/Makefile2
-rw-r--r--fs/ntfs/file.c784
-rw-r--r--fs/ntfs/inode.c1
-rw-r--r--fs/ocfs2/aops.c1
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/file.c25
-rw-r--r--fs/ocfs2/ocfs2.h2
-rw-r--r--fs/ocfs2/ocfs2_fs.h15
-rw-r--r--fs/open.c5
-rw-r--r--fs/overlayfs/super.c33
-rw-r--r--fs/pipe.c1
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/pstore/ram.c3
-rw-r--r--fs/read_write.c117
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/splice.c28
-rw-r--r--fs/stat.c2
-rw-r--r--fs/sysfs/group.c11
-rw-r--r--fs/tracefs/Makefile4
-rw-r--r--fs/tracefs/inode.c650
-rw-r--r--fs/ubifs/file.c1
-rw-r--r--fs/udf/file.c4
-rw-r--r--fs/udf/inode.c2
-rw-r--r--fs/xfs/xfs_aops.c1
-rw-r--r--fs/xfs/xfs_file.c15
-rw-r--r--fs/xfs/xfs_inode.c4
-rw-r--r--fs/xfs/xfs_inode.h9
-rw-r--r--fs/xfs/xfs_iops.c36
-rw-r--r--fs/xfs/xfs_pnfs.c4
-rw-r--r--fs/xfs/xfs_qm.c5
117 files changed, 2497 insertions, 1144 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index eb14e055ea83..ff1a5bac4200 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -33,7 +33,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
diff --git a/fs/Makefile b/fs/Makefile
index a88ac4838c9e..cb92fd4c3172 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -118,6 +118,7 @@ obj-$(CONFIG_HOSTFS) += hostfs/
obj-$(CONFIG_HPPFS) += hppfs/
obj-$(CONFIG_CACHEFILES) += cachefiles/
obj-$(CONFIG_DEBUG_FS) += debugfs/
+obj-$(CONFIG_TRACING) += tracefs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index d2468bf95669..3aa7eb66547e 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -12,7 +12,7 @@
* affs regular file handling primitives
*/
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "affs.h"
static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext);
@@ -699,8 +699,10 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
boff = tmp % bsize;
if (boff) {
bh = affs_bread_ino(inode, bidx, 0);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
+ if (IS_ERR(bh)) {
+ written = PTR_ERR(bh);
+ goto err_first_bh;
+ }
tmp = min(bsize - boff, to - from);
BUG_ON(boff + tmp > bsize || tmp > bsize);
memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
@@ -712,14 +714,16 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
bidx++;
} else if (bidx) {
bh = affs_bread_ino(inode, bidx - 1, 0);
- if (IS_ERR(bh))
- return PTR_ERR(bh);
+ if (IS_ERR(bh)) {
+ written = PTR_ERR(bh);
+ goto err_first_bh;
+ }
}
while (from + bsize <= to) {
prev_bh = bh;
bh = affs_getemptyblk_ino(inode, bidx);
if (IS_ERR(bh))
- goto out;
+ goto err_bh;
memcpy(AFFS_DATA(bh), data + from, bsize);
if (buffer_new(bh)) {
AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
@@ -751,7 +755,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
prev_bh = bh;
bh = affs_bread_ino(inode, bidx, 1);
if (IS_ERR(bh))
- goto out;
+ goto err_bh;
tmp = min(bsize, to - from);
BUG_ON(tmp > bsize);
memcpy(AFFS_DATA(bh), data + from, tmp);
@@ -790,12 +794,13 @@ done:
if (tmp > inode->i_size)
inode->i_size = AFFS_I(inode)->mmu_private = tmp;
+err_first_bh:
unlock_page(page);
page_cache_release(page);
return written;
-out:
+err_bh:
bh = prev_bh;
if (!written)
written = PTR_ERR(bh);
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c13cb08964ed..0714abcd7f32 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -14,7 +14,6 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
-#include <linux/aio.h>
#include "internal.h"
static int afs_write_back_from_locked_page(struct afs_writeback *wb,
diff --git a/fs/aio.c b/fs/aio.c
index f8e52a1854c1..1ab60010cf6c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -151,6 +151,38 @@ struct kioctx {
unsigned id;
};
+/*
+ * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
+ * cancelled or completed (this makes a certain amount of sense because
+ * successful cancellation - io_cancel() - does deliver the completion to
+ * userspace).
+ *
+ * And since most things don't implement kiocb cancellation and we'd really like
+ * kiocb completion to be lockless when possible, we use ki_cancel to
+ * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
+ * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
+ */
+#define KIOCB_CANCELLED ((void *) (~0ULL))
+
+struct aio_kiocb {
+ struct kiocb common;
+
+ struct kioctx *ki_ctx;
+ kiocb_cancel_fn *ki_cancel;
+
+ struct iocb __user *ki_user_iocb; /* user's aiocb */
+ __u64 ki_user_data; /* user's data for completion */
+
+ struct list_head ki_list; /* the aio core uses this
+ * for cancellation */
+
+ /*
+ * If the aio_resfd field of the userspace iocb is not zero,
+ * this is the underlying eventfd context to deliver events to.
+ */
+ struct eventfd_ctx *ki_eventfd;
+};
+
/*------ sysctl variables----*/
static DEFINE_SPINLOCK(aio_nr_lock);
unsigned long aio_nr; /* current system wide number of aio requests */
@@ -220,7 +252,7 @@ static int __init aio_setup(void)
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
- kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page));
@@ -278,11 +310,11 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
+static int aio_ring_remap(struct file *file, struct vm_area_struct *vma)
{
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
- int i;
+ int i, res = -EINVAL;
spin_lock(&mm->ioctx_lock);
rcu_read_lock();
@@ -292,13 +324,17 @@ static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
ctx = table->table[i];
if (ctx && ctx->aio_ring_file == file) {
- ctx->user_id = ctx->mmap_base = vma->vm_start;
+ if (!atomic_read(&ctx->dead)) {
+ ctx->user_id = ctx->mmap_base = vma->vm_start;
+ res = 0;
+ }
break;
}
}
rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
+ return res;
}
static const struct file_operations aio_ring_fops = {
@@ -480,8 +516,9 @@ static int aio_setup_ring(struct kioctx *ctx)
#define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
#define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
-void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
+void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
+ struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common);
struct kioctx *ctx = req->ki_ctx;
unsigned long flags;
@@ -496,7 +533,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);
-static int kiocb_cancel(struct kiocb *kiocb)
+static int kiocb_cancel(struct aio_kiocb *kiocb)
{
kiocb_cancel_fn *old, *cancel;
@@ -514,7 +551,7 @@ static int kiocb_cancel(struct kiocb *kiocb)
cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
} while (cancel != old);
- return cancel(kiocb);
+ return cancel(&kiocb->common);
}
static void free_ioctx(struct work_struct *work)
@@ -550,13 +587,13 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
static void free_ioctx_users(struct percpu_ref *ref)
{
struct kioctx *ctx = container_of(ref, struct kioctx, users);
- struct kiocb *req;
+ struct aio_kiocb *req;
spin_lock_irq(&ctx->ctx_lock);
while (!list_empty(&ctx->active_reqs)) {
req = list_first_entry(&ctx->active_reqs,
- struct kiocb, ki_list);
+ struct aio_kiocb, ki_list);
list_del_init(&req->ki_list);
kiocb_cancel(req);
@@ -727,6 +764,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
err_cleanup:
aio_nr_sub(ctx->max_reqs);
err_ctx:
+ atomic_set(&ctx->dead, 1);
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
aio_free_ring(ctx);
err:
mutex_unlock(&ctx->ring_lock);
@@ -748,11 +788,12 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
{
struct kioctx_table *table;
- if (atomic_xchg(&ctx->dead, 1))
+ spin_lock(&mm->ioctx_lock);
+ if (atomic_xchg(&ctx->dead, 1)) {
+ spin_unlock(&mm->ioctx_lock);
return -EINVAL;
+ }
-
- spin_lock(&mm->ioctx_lock);
table = rcu_dereference_raw(mm->ioctx_table);
WARN_ON(ctx != table->table[ctx->id]);
table->table[ctx->id] = NULL;
@@ -778,22 +819,6 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
return 0;
}
-/* wait_on_sync_kiocb:
- * Waits on the given sync kiocb to complete.
- */
-ssize_t wait_on_sync_kiocb(struct kiocb *req)
-{
- while (!req->ki_ctx) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (req->ki_ctx)
- break;
- io_schedule();
- }
- __set_current_state(TASK_RUNNING);
- return req->ki_user_data;
-}
-EXPORT_SYMBOL(wait_on_sync_kiocb);
-
/*
* exit_aio: called when the last user of mm goes away. At this point, there is
* no way for any new requests to be submited or any of the io_* syscalls to be
@@ -948,9 +973,9 @@ static void user_refill_reqs_available(struct kioctx *ctx)
* Allocate a slot for an aio request.
* Returns NULL if no requests are free.
*/
-static inline struct kiocb *aio_get_req(struct kioctx *ctx)
+static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx)
{
- struct kiocb *req;
+ struct aio_kiocb *req;
if (!get_reqs_available(ctx)) {
user_refill_reqs_available(ctx);
@@ -971,10 +996,10 @@ out_put:
return NULL;
}
-static void kiocb_free(struct kiocb *req)
+static void kiocb_free(struct aio_kiocb *req)
{
- if (req->ki_filp)
- fput(req->ki_filp);
+ if (req->common.ki_filp)
+ fput(req->common.ki_filp);
if (req->ki_eventfd != NULL)
eventfd_ctx_put(req->ki_eventfd);
kmem_cache_free(kiocb_cachep, req);
@@ -1010,8 +1035,9 @@ out:
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
-void aio_complete(struct kiocb *iocb, long res, long res2)
+static void aio_complete(struct kiocb *kiocb, long res, long res2)
{
+ struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common);
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
@@ -1025,13 +1051,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
* ref, no other paths have a way to get another ref
* - the sync task helpfully left a reference to itself in the iocb
*/
- if (is_sync_kiocb(iocb)) {
- iocb->ki_user_data = res;
- smp_wmb();
- iocb->ki_ctx = ERR_PTR(-EXDEV);
- wake_up_process(iocb->ki_obj.tsk);
- return;
- }
+ BUG_ON(is_sync_kiocb(kiocb));
if (iocb->ki_list.next) {
unsigned long flags;
@@ -1057,7 +1077,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
event = ev_page + pos % AIO_EVENTS_PER_PAGE;
- event->obj = (u64)(unsigned long)iocb->ki_obj.user;
+ event->obj = (u64)(unsigned long)iocb->ki_user_iocb;
event->data = iocb->ki_user_data;
event->res = res;
event->res2 = res2;
@@ -1066,7 +1086,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]);
pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n",
- ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data,
+ ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data,
res, res2);
/* after flagging the request as done, we
@@ -1113,7 +1133,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
percpu_ref_put(&ctx->reqs);
}
-EXPORT_SYMBOL(aio_complete);
/* aio_read_events_ring
* Pull an event off of the ioctx's event ring. Returns the number of
@@ -1341,46 +1360,19 @@ typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
-static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
- int rw, char __user *buf,
- unsigned long *nr_segs,
- struct iovec **iovec,
- bool compat)
+static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len,
+ struct iovec **iovec,
+ bool compat,
+ struct iov_iter *iter)
{
- ssize_t ret;
-
- *nr_segs = kiocb->ki_nbytes;
-
#ifdef CONFIG_COMPAT
if (compat)
- ret = compat_rw_copy_check_uvector(rw,
+ return compat_import_iovec(rw,
(struct compat_iovec __user *)buf,
- *nr_segs, UIO_FASTIOV, *iovec, iovec);
- else
+ len, UIO_FASTIOV, iovec, iter);
#endif
- ret = rw_copy_check_uvector(rw,
- (struct iovec __user *)buf,
- *nr_segs, UIO_FASTIOV, *iovec, iovec);
- if (ret < 0)
- return ret;
-
- /* ki_nbytes now reflect bytes instead of segs */
- kiocb->ki_nbytes = ret;
- return 0;
-}
-
-static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
- int rw, char __user *buf,
- unsigned long *nr_segs,
- struct iovec *iovec)
-{
- if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
- return -EFAULT;
-
- iovec->iov_base = buf;
- iovec->iov_len = kiocb->ki_nbytes;
- *nr_segs = 1;
- return 0;
+ return import_iovec(rw, (struct iovec __user *)buf,
+ len, UIO_FASTIOV, iovec, iter);
}
/*
@@ -1388,11 +1380,10 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
* Performs the initial checks and io submission.
*/
static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
- char __user *buf, bool compat)
+ char __user *buf, size_t len, bool compat)
{
struct file *file = req->ki_filp;
ssize_t ret;
- unsigned long nr_segs;
int rw;
fmode_t mode;
aio_rw_op *rw_op;
@@ -1423,21 +1414,22 @@ rw_common:
if (!rw_op && !iter_op)
return -EINVAL;
- ret = (opcode == IOCB_CMD_PREADV ||
- opcode == IOCB_CMD_PWRITEV)
- ? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
- &iovec, compat)
- : aio_setup_single_vector(req, rw, buf, &nr_segs,
- iovec);
+ if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV)
+ ret = aio_setup_vectored_rw(rw, buf, len,
+ &iovec, compat, &iter);
+ else {
+ ret = import_single_range(rw, buf, len, iovec, &iter);
+ iovec = NULL;
+ }
if (!ret)
- ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ ret = rw_verify_area(rw, file, &req->ki_pos,
+ iov_iter_count(&iter));
if (ret < 0) {
- if (iovec != inline_vecs)
- kfree(iovec);
+ kfree(iovec);
return ret;
}
- req->ki_nbytes = ret;
+ len = ret;
/* XXX: move/kill - rw_verify_area()? */
/* This matches the pread()/pwrite() logic */
@@ -1450,14 +1442,14 @@ rw_common:
file_start_write(file);
if (iter_op) {
- iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
ret = iter_op(req, &iter);
} else {
- ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+ ret = rw_op(req, iter.iov, iter.nr_segs, req->ki_pos);
}
if (rw == WRITE)
file_end_write(file);
+ kfree(iovec);
break;
case IOCB_CMD_FDSYNC:
@@ -1479,9 +1471,6 @@ rw_common:
return -EINVAL;
}
- if (iovec != inline_vecs)
- kfree(iovec);
-
if (ret != -EIOCBQUEUED) {
/*
* There's no easy way to restart the syscall since other AIO's
@@ -1500,7 +1489,7 @@ rw_common:
static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
struct iocb *iocb, bool compat)
{
- struct kiocb *req;
+ struct aio_kiocb *req;
ssize_t ret;
/* enforce forwards compatibility on users */
@@ -1523,11 +1512,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
if (unlikely(!req))
return -EAGAIN;
- req->ki_filp = fget(iocb->aio_fildes);
- if (unlikely(!req->ki_filp)) {
+ req->common.ki_filp = fget(iocb->aio_fildes);
+ if (unlikely(!req->common.ki_filp)) {
ret = -EBADF;
goto out_put_req;
}
+ req->common.ki_pos = iocb->aio_offset;
+ req->common.ki_complete = aio_complete;
+ req->common.ki_flags = 0;
if (iocb->aio_flags & IOCB_FLAG_RESFD) {
/*
@@ -1542,6 +1534,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
req->ki_eventfd = NULL;
goto out_put_req;
}
+
+ req->common.ki_flags |= IOCB_EVENTFD;
}
ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
@@ -1550,13 +1544,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
goto out_put_req;
}
- req->ki_obj.user = user_iocb;
+ req->ki_user_iocb = user_iocb;
req->ki_user_data = iocb->aio_data;
- req->ki_pos = iocb->aio_offset;
- req->ki_nbytes = iocb->aio_nbytes;
- ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+ ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode,
(char __user *)(unsigned long)iocb->aio_buf,
+ iocb->aio_nbytes,
compat);
if (ret)
goto out_put_req;
@@ -1643,10 +1636,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
/* lookup_kiocb
* Finds a given iocb for cancellation.
*/
-static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
- u32 key)
+static struct aio_kiocb *
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
{
- struct list_head *pos;
+ struct aio_kiocb *kiocb;
assert_spin_locked(&ctx->ctx_lock);
@@ -1654,9 +1647,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
return NULL;
/* TODO: use a hash or array, this sucks. */
- list_for_each(pos, &ctx->active_reqs) {
- struct kiocb *kiocb = list_kiocb(pos);
- if (kiocb->ki_obj.user == iocb)
+ list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
+ if (kiocb->ki_user_iocb == iocb)
return kiocb;
}
return NULL;
@@ -1676,7 +1668,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
struct io_event __user *, result)
{
struct kioctx *ctx;
- struct kiocb *kiocb;
+ struct aio_kiocb *kiocb;
u32 key;
int ret;
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 90bc079d9982..fdcb4d69f430 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -15,6 +15,7 @@
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
+#include <linux/uio.h>
#include <asm/uaccess.h>
#include "bfs.h"
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266be67d3..2e522aed6584 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -27,7 +27,6 @@
#include <linux/namei.h>
#include <linux/log2.h>
#include <linux/cleancache.h>
-#include <linux/aio.h>
#include <asm/uaccess.h>
#include "internal.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 993642199326..6d67f32e648d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1645,14 +1645,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
parent_nritems = btrfs_header_nritems(parent);
blocksize = root->nodesize;
- end_slot = parent_nritems;
+ end_slot = parent_nritems - 1;
- if (parent_nritems == 1)
+ if (parent_nritems <= 1)
return 0;
btrfs_set_lock_blocking(parent);
- for (i = start_slot; i < end_slot; i++) {
+ for (i = start_slot; i <= end_slot; i++) {
int close = 1;
btrfs_node_key(parent, &disk_key, i);
@@ -1669,7 +1669,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
other = btrfs_node_blockptr(parent, i - 1);
close = close_blocks(blocknr, other, blocksize);
}
- if (!close && i < end_slot - 2) {
+ if (!close && i < end_slot) {
other = btrfs_node_blockptr(parent, i + 1);
close = close_blocks(blocknr, other, blocksize);
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84c3b00f3de8..f9c89cae39ee 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3387,6 +3387,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_read_block_groups(struct btrfs_root *root);
@@ -3909,6 +3911,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
loff_t actual_len, u64 *alloc_hint);
int btrfs_inode_check_errors(struct inode *inode);
extern const struct dentry_operations btrfs_dentry_operations;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
+#endif
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f79f38542a73..639f2663ed3f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3921,7 +3921,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
}
if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ sizeof(struct btrfs_chunk)) {
- printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
+ printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
btrfs_super_sys_array_size(sb),
sizeof(struct btrfs_disk_key)
+ sizeof(struct btrfs_chunk));
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 571f402d3fc4..8b353ad02f03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3208,6 +3208,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
return 0;
}
+ if (trans->aborted)
+ return 0;
again:
inode = lookup_free_space_inode(root, block_group, path);
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
@@ -3243,6 +3245,20 @@ again:
*/
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ /*
+ * So theoretically we could recover from this, simply set the
+ * super cache generation to 0 so we know to invalidate the
+ * cache, but then we'd have to keep track of the block groups
+ * that fail this way so we know we _have_ to reset this cache
+ * before the next commit or risk reading stale cache. So to
+ * limit our exposure to horrible edge cases let's just abort the
+ * transaction; this only happens in really bad situations
+ * anyway.
+ */
+ btrfs_abort_transaction(trans, root, ret);
+ goto out_put;
+ }
WARN_ON(ret);
if (i_size_read(inode) > 0) {
@@ -3309,6 +3325,32 @@ out:
return ret;
}
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_block_group_cache *cache, *tmp;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ struct btrfs_path *path;
+
+ if (list_empty(&cur_trans->dirty_bgs) ||
+ !btrfs_test_opt(root, SPACE_CACHE))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ /* Could add new block groups, use _safe just in case */
+ list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+ dirty_list) {
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ cache_save_setup(cache, trans, path);
+ }
+
+ btrfs_free_path(path);
+ return 0;
+}
+
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -5094,7 +5136,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
+ nr_extents = (unsigned)div64_u64(num_bytes +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ BTRFS_I(inode)->outstanding_extents += nr_extents;
+ nr_extents = 0;
if (BTRFS_I(inode)->outstanding_extents >
BTRFS_I(inode)->reserved_extents)
@@ -5239,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_free, 0);
if (root->fs_info->quota_enabled) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c7233ff1d533..d688cfe5d496 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4968,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb)
/* Should be safe to release our pages at this point */
btrfs_release_extent_buffer_page(eb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
+ __free_extent_buffer(eb);
+ return 1;
+ }
+#endif
call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
return 1;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b78bbbac900d..aee18f84e315 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,7 +24,6 @@
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
-#include <linux/aio.h>
#include <linux/falloc.h>
#include <linux/swap.h>
#include <linux/writeback.h>
@@ -32,6 +31,7 @@
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
+#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -1811,22 +1811,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
mutex_unlock(&inode->i_mutex);
/*
- * we want to make sure fsync finds this change
- * but we haven't joined a transaction running right now.
- *
- * Later on, someone is sure to update the inode and get the
- * real transid recorded.
- *
- * We set last_trans now to the fs_info generation + 1,
- * this will either be one more than the running transaction
- * or the generation used for the next transaction if there isn't
- * one running right now.
- *
* We also have to set last_sub_trans to the current log transid,
* otherwise subsequent syncs to a file that's been synced in this
* transaction will appear to have already occured.
*/
- BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
@@ -1959,25 +1947,37 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
/*
- * check the transaction that last modified this inode
- * and see if its already been committed
- */
- if (!BTRFS_I(inode)->last_trans) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
-
- /*
- * if the last transaction that changed this file was before
- * the current transaction, we can bail out now without any
- * syncing
+ * If the last transaction that changed this file was before the current
+ * transaction and we have the full sync flag set in our inode, we can
+ * bail out now without any syncing.
+ *
+ * Note that we can't bail out if the full sync flag isn't set. This is
+ * because when the full sync flag is set we start all ordered extents
+ * and wait for them to fully complete - when they complete they update
+ * the inode's last_trans field through:
+ *
+ * btrfs_finish_ordered_io() ->
+ * btrfs_update_inode_fallback() ->
+ * btrfs_update_inode() ->
+ * btrfs_set_inode_last_trans()
+ *
+ * So we are sure that last_trans is up to date and can do this check to
+ * bail out safely. For the fast path, when the full sync flag is not
+ * set in our inode, we can not do it because we start only our ordered
+ * extents and don't wait for them to complete (that is when
+ * btrfs_finish_ordered_io runs), so here at this point their last_trans
+ * value might be less than or equal to fs_info->last_trans_committed,
+ * and setting a speculative last_trans for an inode when a buffered
+ * write is made (such as fs_info->generation + 1 for example) would not
+ * be reliable since after setting the value and before fsync is called
+ * any number of transactions can start and commit (transaction kthread
+ * commits the current transaction periodically), and a transaction
+ * commit neither starts nor waits for ordered extents to complete.
*/
smp_mb();
if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
- BTRFS_I(inode)->last_trans <=
- root->fs_info->last_trans_committed) {
- BTRFS_I(inode)->last_trans = 0;
-
+ (full_sync && BTRFS_I(inode)->last_trans <=
+ root->fs_info->last_trans_committed)) {
/*
* We'v had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2275,6 +2275,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
+ bool truncated_page = false;
+ bool updated_inode = false;
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
@@ -2306,13 +2308,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* entire page.
*/
if (same_page && len < PAGE_CACHE_SIZE) {
- if (offset < ino_size)
+ if (offset < ino_size) {
+ truncated_page = true;
ret = btrfs_truncate_page(inode, offset, len, 0);
+ } else {
+ ret = 0;
+ }
goto out_only_mutex;
}
/* zero back part of the first page */
if (offset < ino_size) {
+ truncated_page = true;
ret = btrfs_truncate_page(inode, offset, 0, 0);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2348,6 +2355,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (!ret) {
/* zero the front end of the last page */
if (tail_start + tail_len < ino_size) {
+ truncated_page = true;
ret = btrfs_truncate_page(inode,
tail_start + tail_len, 0, 1);
if (ret)
@@ -2357,8 +2365,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
if (lockend < lockstart) {
- mutex_unlock(&inode->i_mutex);
- return 0;
+ ret = 0;
+ goto out_only_mutex;
}
while (1) {
@@ -2506,6 +2514,7 @@ out_trans:
trans->block_rsv = &root->fs_info->trans_block_rsv;
ret = btrfs_update_inode(trans, root, inode);
+ updated_inode = true;
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
out_free:
@@ -2515,6 +2524,22 @@ out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
out_only_mutex:
+ if (!updated_inode && truncated_page && !ret && !err) {
+ /*
+ * If we only end up zeroing part of a page, we still need to
+ * update the inode item, so that all the time fields are
+ * updated as well as the necessary btrfs inode in memory fields
+ * for detecting, at fsync time, if the inode isn't yet in the
+ * log tree or it's there but not up to date.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ } else {
+ err = btrfs_update_inode(trans, root, inode);
+ ret = btrfs_end_transaction(trans, root);
+ }
+ }
mutex_unlock(&inode->i_mutex);
if (ret && !err)
err = ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a85c23dfcddb..686331f22b15 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,7 +32,6 @@
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
-#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
@@ -43,6 +42,7 @@
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
+#include <linux/uio.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
static int btrfs_dirty_inode(struct inode *inode);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode)
+{
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+}
+#endif
+
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir,
const struct qstr *qstr)
@@ -1542,30 +1549,17 @@ static void btrfs_split_extent_hook(struct inode *inode,
u64 new_size;
/*
- * We need the largest size of the remaining extent to see if we
- * need to add a new outstanding extent. Think of the following
- * case
- *
- * [MEAX_EXTENT_SIZEx2 - 4k][4k]
- *
- * The new_size would just be 4k and we'd think we had enough
- * outstanding extents for this if we only took one side of the
- * split, same goes for the other direction. We need to see if
- * the larger size still is the same amount of extents as the
- * original size, because if it is we need to add a new
- * outstanding extent. But if we split up and the larger size
- * is less than the original then we are good to go since we've
- * already accounted for the extra extent in our original
- * accounting.
+ * See the explanation in btrfs_merge_extent_hook, the same
+ * applies here, just in reverse.
*/
new_size = orig->end - split + 1;
- if ((split - orig->start) > new_size)
- new_size = split - orig->start;
-
- num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+ num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
BTRFS_MAX_EXTENT_SIZE);
- if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
- BTRFS_MAX_EXTENT_SIZE) < num_extents)
+ new_size = split - orig->start;
+ num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) >= num_extents)
return;
}
@@ -1591,8 +1585,10 @@ static void btrfs_merge_extent_hook(struct inode *inode,
if (!(other->state & EXTENT_DELALLOC))
return;
- old_size = other->end - other->start + 1;
- new_size = old_size + (new->end - new->start + 1);
+ if (new->start > other->start)
+ new_size = new->end - other->start + 1;
+ else
+ new_size = other->end - new->start + 1;
/* we're not bigger than the max, unreserve the space and go */
if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
@@ -1603,13 +1599,32 @@ static void btrfs_merge_extent_hook(struct inode *inode,
}
/*
- * If we grew by another max_extent, just return, we want to keep that
- * reserved amount.
+ * We have to add up either side to figure out how many extents were
+ * accounted for before we merged into one big extent. If the number of
+ * extents we accounted for is <= the amount we need for the new range
+ * then we can return, otherwise drop. Think of it like this
+ *
+ * [ 4k][MAX_SIZE]
+ *
+ * So we've grown the extent by a MAX_SIZE extent, this would mean we
+ * need 2 outstanding extents, on one side we have 1 and the other side
+ * we have 1 so they are == and we can return. But in this case
+ *
+ * [MAX_SIZE+4k][MAX_SIZE+4k]
+ *
+ * Each range on its own accounts for 2 extents, but merged together
+ * they are only 3 extents worth of accounting, so we need to drop in
+ * this case.
*/
+ old_size = other->end - other->start + 1;
num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
BTRFS_MAX_EXTENT_SIZE);
+ old_size = new->end - new->start + 1;
+ num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+
if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
- BTRFS_MAX_EXTENT_SIZE) > num_extents)
+ BTRFS_MAX_EXTENT_SIZE) >= num_extents)
return;
spin_lock(&BTRFS_I(inode)->lock);
@@ -1686,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode,
spin_unlock(&BTRFS_I(inode)->lock);
}
+ /* For sanity tests */
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
root->fs_info->delalloc_batch);
spin_lock(&BTRFS_I(inode)->lock);
@@ -1741,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode,
root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len);
+ /* For sanity tests. */
+ if (btrfs_test_is_dummy_root(root))
+ return;
+
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& do_list && !(state->state & EXTENT_NORESERVE))
btrfs_free_reserved_data_space(inode, len);
@@ -7213,7 +7236,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
u64 len = bh_result->b_size;
- u64 orig_len = len;
+ u64 *outstanding_extents = NULL;
int unlock_bits = EXTENT_LOCKED;
int ret = 0;
@@ -7225,6 +7248,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
lockstart = start;
lockend = start + len - 1;
+ if (current->journal_info) {
+ /*
+ * Need to pull our outstanding extents and set journal_info to NULL so
+ * that anything that needs to check if there's a transaction doesn't get
+ * confused.
+ */
+ outstanding_extents = current->journal_info;
+ current->journal_info = NULL;
+ }
+
/*
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
@@ -7285,7 +7318,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
int type;
- int ret;
u64 block_start, orig_start, orig_block_len, ram_bytes;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
@@ -7349,11 +7381,20 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- if (len < orig_len) {
+ /*
+ * If we have an outstanding_extents count still set then we're
+ * within our reservation, otherwise we need to adjust our inode
+ * counter appropriately.
+ */
+ if (*outstanding_extents) {
+ (*outstanding_extents)--;
+ } else {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
+
+ current->journal_info = outstanding_extents;
btrfs_free_reserved_data_space(inode, len);
}
@@ -7377,6 +7418,8 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+ if (outstanding_extents)
+ current->journal_info = outstanding_extents;
return ret;
}
@@ -8076,6 +8119,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ u64 outstanding_extents = 0;
size_t count = 0;
int flags = 0;
bool wakeup = true;
@@ -8113,6 +8157,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = btrfs_delalloc_reserve_space(inode, count);
if (ret)
goto out;
+ outstanding_extents = div64_u64(count +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+
+ /*
+ * We need to know how many extents we reserved so that we can
+ * do the accounting properly if we go over the number we
+ * originally calculated. Abuse current->journal_info for this.
+ */
+ current->journal_info = &outstanding_extents;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
inode_dio_done(inode);
@@ -8125,6 +8179,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
iter, offset, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (rw & WRITE) {
+ current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED)
btrfs_delalloc_release_space(inode, count);
else if (ret >= 0 && (size_t)ret < count)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 534544e08f76..157cc54fc634 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -452,9 +452,7 @@ void btrfs_get_logged_extents(struct inode *inode,
continue;
if (entry_end(ordered) <= start)
break;
- if (!list_empty(&ordered->log_list))
- continue;
- if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+ if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
continue;
list_add(&ordered->log_list, logged_list);
atomic_inc(&ordered->refs);
@@ -511,8 +509,7 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
- if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
- list_add_tail(&ordered->trans_list, &trans->ordered);
+ list_add_tail(&ordered->trans_list, &trans->ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 97159a8e91d4..058c79eecbfb 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
if (oper1->seq < oper2->seq)
return -1;
if (oper1->seq > oper2->seq)
- return -1;
+ return 1;
if (oper1->ref_root < oper2->ref_root)
return -1;
if (oper1->ref_root > oper2->ref_root)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fe5857223515..d6033f540cc7 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -230,6 +230,7 @@ struct pending_dir_move {
u64 parent_ino;
u64 ino;
u64 gen;
+ bool is_orphan;
struct list_head update_refs;
};
@@ -2984,7 +2985,8 @@ static int add_pending_dir_move(struct send_ctx *sctx,
u64 ino_gen,
u64 parent_ino,
struct list_head *new_refs,
- struct list_head *deleted_refs)
+ struct list_head *deleted_refs,
+ const bool is_orphan)
{
struct rb_node **p = &sctx->pending_dir_moves.rb_node;
struct rb_node *parent = NULL;
@@ -2999,6 +3001,7 @@ static int add_pending_dir_move(struct send_ctx *sctx,
pm->parent_ino = parent_ino;
pm->ino = ino;
pm->gen = ino_gen;
+ pm->is_orphan = is_orphan;
INIT_LIST_HEAD(&pm->list);
INIT_LIST_HEAD(&pm->update_refs);
RB_CLEAR_NODE(&pm->node);
@@ -3131,16 +3134,20 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
rmdir_ino = dm->rmdir_ino;
free_waiting_dir_move(sctx, dm);
- ret = get_first_ref(sctx->parent_root, pm->ino,
- &parent_ino, &parent_gen, name);
- if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, parent_ino, parent_gen,
- from_path);
- if (ret < 0)
- goto out;
- ret = fs_path_add_path(from_path, name);
+ if (pm->is_orphan) {
+ ret = gen_unique_name(sctx, pm->ino,
+ pm->gen, from_path);
+ } else {
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
+ if (ret < 0)
+ goto out;
+ ret = get_cur_path(sctx, parent_ino, parent_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ }
if (ret < 0)
goto out;
@@ -3150,7 +3157,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
LIST_HEAD(deleted_refs);
ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
- &pm->update_refs, &deleted_refs);
+ &pm->update_refs, &deleted_refs,
+ pm->is_orphan);
if (ret < 0)
goto out;
if (rmdir_ino) {
@@ -3283,6 +3291,127 @@ out:
return ret;
}
+/*
+ * We might need to delay a directory rename even when no ancestor directory
+ * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
+ * renamed. This happens when we rename a directory to the old name (the name
+ * in the parent root) of some other unrelated directory that got its rename
+ * delayed due to some ancestor with higher number that got renamed.
+ *
+ * Example:
+ *
+ * Parent snapshot:
+ * . (ino 256)
+ * |---- a/ (ino 257)
+ * | |---- file (ino 260)
+ * |
+ * |---- b/ (ino 258)
+ * |---- c/ (ino 259)
+ *
+ * Send snapshot:
+ * . (ino 256)
+ * |---- a/ (ino 258)
+ * |---- x/ (ino 259)
+ * |---- y/ (ino 257)
+ * |----- file (ino 260)
+ *
+ * Here we cannot rename 258 from 'b' to 'a' without the rename of inode 257
+ * from 'a' to 'x/y' happening first, which in turn depends on the rename of
+ * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
+ * must issue is:
+ *
+ * 1 - rename 259 from 'c' to 'x'
+ * 2 - rename 257 from 'a' to 'x/y'
+ * 3 - rename 258 from 'b' to 'a'
+ *
+ * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
+ * be done right away and < 0 on error.
+ */
+static int wait_for_dest_dir_move(struct send_ctx *sctx,
+ struct recorded_ref *parent_ref,
+ const bool is_orphan)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ struct btrfs_key di_key;
+ struct btrfs_dir_item *di;
+ u64 left_gen;
+ u64 right_gen;
+ int ret = 0;
+
+ if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
+ return 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = parent_ref->dir;
+ key.type = BTRFS_DIR_ITEM_KEY;
+ key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
+
+ ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ goto out;
+ }
+
+ di = btrfs_match_dir_item_name(sctx->parent_root, path,
+ parent_ref->name, parent_ref->name_len);
+ if (!di) {
+ ret = 0;
+ goto out;
+ }
+ /*
+ * di_key.objectid has the number of the inode that has a dentry in the
+ * parent directory with the same name that sctx->cur_ino is being
+ * renamed to. We need to check if that inode is in the send root as
+ * well and if it is currently marked as an inode with a pending rename,
+ * if it is, we need to delay the rename of sctx->cur_ino as well, so
+ * that it happens after that other inode is renamed.
+ */
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
+ if (di_key.type != BTRFS_INODE_ITEM_KEY) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
+ &left_gen, NULL, NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
+ &right_gen, NULL, NULL, NULL, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT)
+ ret = 0;
+ goto out;
+ }
+
+ /* Different inode, no need to delay the rename of sctx->cur_ino */
+ if (right_gen != left_gen) {
+ ret = 0;
+ goto out;
+ }
+
+ if (is_waiting_for_move(sctx, di_key.objectid)) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ di_key.objectid,
+ &sctx->new_refs,
+ &sctx->deleted_refs,
+ is_orphan);
+ if (!ret)
+ ret = 1;
+ }
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int wait_for_parent_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref)
{
@@ -3349,7 +3478,8 @@ out:
sctx->cur_inode_gen,
ino,
&sctx->new_refs,
- &sctx->deleted_refs);
+ &sctx->deleted_refs,
+ false);
if (!ret)
ret = 1;
}
@@ -3372,6 +3502,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
int did_overwrite = 0;
int is_orphan = 0;
u64 last_dir_ino_rm = 0;
+ bool can_rename = true;
verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
@@ -3490,12 +3621,22 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
}
}
+ if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
+ ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ can_rename = false;
+ *pending_move = 1;
+ }
+ }
+
/*
* link/move the ref to the new place. If we have an orphan
* inode, move it and update valid_path. If not, link or move
* it depending on the inode mode.
*/
- if (is_orphan) {
+ if (is_orphan && can_rename) {
ret = send_rename(sctx, valid_path, cur->full_path);
if (ret < 0)
goto out;
@@ -3503,7 +3644,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = fs_path_copy(valid_path, cur->full_path);
if (ret < 0)
goto out;
- } else {
+ } else if (can_rename) {
if (S_ISDIR(sctx->cur_inode_mode)) {
/*
* Dirs can't be linked, so move it. For moved
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index a116b55ce788..054fc0d97131 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -911,6 +911,197 @@ out:
return ret;
}
+static int test_extent_accounting(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ BTRFS_I(inode)->root = root;
+ btrfs_test_inode_set_ops(inode);
+
+ /* [BTRFS_MAX_EXTENT_SIZE] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 1) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 1, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE][4k] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+ BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ EXTENT_DELALLOC | EXTENT_DIRTY |
+ EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE][4K] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+ (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 2) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 2, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
+ *
+ * I'm artificially adding 2 to outstanding_extents because in the
+ * buffered IO case we'd add things up as we go, but I don't feel like
+ * doing that here, this isn't the interesting case we want to test.
+ */
+ BTRFS_I(inode)->outstanding_extents += 2;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
+ (BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
+ NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 4) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 4, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 3) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 3, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+ BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 4) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 4, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /*
+ * Refill the hole again just for good measure, because I thought it
+ * might fail and I'd rather satisfy my paranoia at this point.
+ */
+ BTRFS_I(inode)->outstanding_extents++;
+ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+ BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+ if (ret) {
+ test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents != 3) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 3, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+
+ /* Empty */
+ ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ if (ret) {
+ test_msg("clear_extent_bit returned %d\n", ret);
+ goto out;
+ }
+ if (BTRFS_I(inode)->outstanding_extents) {
+ ret = -EINVAL;
+ test_msg("Miscount, wanted 0, got %u\n",
+ BTRFS_I(inode)->outstanding_extents);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (ret)
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+ NULL, GFP_NOFS);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
int btrfs_test_inodes(void)
{
int ret;
@@ -924,5 +1115,9 @@ int btrfs_test_inodes(void)
if (ret)
return ret;
test_msg("Running hole first btrfs_get_extent test\n");
- return test_hole_first();
+ ret = test_hole_first();
+ if (ret)
+ return ret;
+ test_msg("Running outstanding_extents tests\n");
+ return test_extent_accounting();
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7e80f32550a6..8be4278e25e8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1023,17 +1023,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
u64 old_root_bytenr;
u64 old_root_used;
struct btrfs_root *tree_root = root->fs_info->tree_root;
- bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
old_root_used = btrfs_root_used(&root->root_item);
- btrfs_write_dirty_block_groups(trans, root);
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
if (old_root_bytenr == root->node->start &&
- old_root_used == btrfs_root_used(&root->root_item) &&
- (!extent_root ||
- list_empty(&trans->transaction->dirty_bgs)))
+ old_root_used == btrfs_root_used(&root->root_item))
break;
btrfs_set_root_node(&root->root_item, root->node);
@@ -1044,17 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
return ret;
old_root_used = btrfs_root_used(&root->root_item);
- if (extent_root) {
- ret = btrfs_write_dirty_block_groups(trans, root);
- if (ret)
- return ret;
- }
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret)
- return ret;
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret)
- return ret;
}
return 0;
@@ -1071,6 +1056,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
struct list_head *next;
struct extent_buffer *eb;
int ret;
@@ -1098,11 +1084,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
if (ret)
return ret;
+ ret = btrfs_setup_space_cache(trans, root);
+ if (ret)
+ return ret;
+
/* run_qgroups might have added some more refs */
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret)
return ret;
-
+again:
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
@@ -1115,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
}
+ while (!list_empty(dirty_bgs)) {
+ ret = btrfs_write_dirty_block_groups(trans, root);
+ if (ret)
+ return ret;
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+ }
+
+ if (!list_empty(&fs_info->dirty_cowonly_roots))
+ goto again;
+
list_add_tail(&fs_info->extent_root->dirty_list,
&trans->transaction->switch_commits);
btrfs_after_dev_replace_commit(fs_info);
@@ -1814,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wait_for_commit(root, cur_trans);
+ if (unlikely(cur_trans->aborted))
+ ret = cur_trans->aborted;
+
btrfs_put_transaction(cur_trans);
return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9a37f8b39bae..c5b8ba37f88e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1012,7 +1012,7 @@ again:
base = btrfs_item_ptr_offset(leaf, path->slots[0]);
while (cur_offset < item_size) {
- extref = (struct btrfs_inode_extref *)base + cur_offset;
+ extref = (struct btrfs_inode_extref *)(base + cur_offset);
victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cd4d1315aaa9..8222f6f74147 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4903,10 +4903,17 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
{
struct btrfs_bio *bbio = kzalloc(
+ /* the size of the btrfs_bio */
sizeof(struct btrfs_bio) +
+ /* plus the variable array for the stripes */
sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+ /* plus the variable array for the tgt dev */
sizeof(int) * (real_stripes) +
- sizeof(u64) * (real_stripes),
+ /*
+ * plus the raid_map, which includes both the tgt dev
+ * and the stripes
+ */
+ sizeof(u64) * (total_stripes),
GFP_NOFS);
if (!bbio)
return NULL;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 47b19465f0dc..883b93623bc5 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -111,6 +111,8 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
name, name_len, -1);
if (!di && (flags & XATTR_REPLACE))
ret = -ENODATA;
+ else if (IS_ERR(di))
+ ret = PTR_ERR(di);
else if (di)
ret = btrfs_delete_one_dir_name(trans, root, path, di);
goto out;
@@ -127,10 +129,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
ASSERT(mutex_is_locked(&inode->i_mutex));
di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
name, name_len, 0);
- if (!di) {
+ if (!di)
ret = -ENODATA;
+ else if (IS_ERR(di))
+ ret = PTR_ERR(di);
+ if (ret)
goto out;
- }
btrfs_release_path(path);
di = NULL;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d533075a823d..139f2fea91a0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -7,7 +7,6 @@
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
#include <linux/falloc.h>
#include "super.h"
@@ -808,7 +807,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *filp = iocb->ki_filp;
struct ceph_file_info *fi = filp->private_data;
- size_t len = iocb->ki_nbytes;
+ size_t len = iov_iter_count(to);
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct page *pinned_page = NULL;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 4ac7445e6ec7..aa0dc2573374 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -1,6 +1,9 @@
/*
* fs/cifs/cifsencrypt.c
*
+ * Encryption and hashing operations relating to NTLM, NTLMv2. See MS-NLMP
+ * for more detailed information
+ *
* Copyright (C) International Business Machines Corp., 2005,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
@@ -515,7 +518,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
__func__);
return rc;
}
- } else if (ses->serverName) {
+ } else {
+ /* We use ses->serverName if no domain name available */
len = strlen(ses->serverName);
server = kmalloc(2 + (len * 2), GFP_KERNEL);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d3aa999ab785..480cf9c81d50 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1599,6 +1599,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
pr_warn("CIFS: username too long\n");
goto cifs_parse_mount_err;
}
+
+ kfree(vol->username);
vol->username = kstrdup(string, GFP_KERNEL);
if (!vol->username)
goto cifs_parse_mount_err;
@@ -1700,6 +1702,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
+ kfree(vol->domainname);
vol->domainname = kstrdup(string, GFP_KERNEL);
if (!vol->domainname) {
pr_warn("CIFS: no memory for domainname\n");
@@ -1731,6 +1734,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
if (strncasecmp(string, "default", 7) != 0) {
+ kfree(vol->iocharset);
vol->iocharset = kstrdup(string,
GFP_KERNEL);
if (!vol->iocharset) {
@@ -2913,8 +2917,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
* calling name ends in null (byte 16) from old smb
* convention.
*/
- if (server->workstation_RFC1001_name &&
- server->workstation_RFC1001_name[0] != 0)
+ if (server->workstation_RFC1001_name[0] != 0)
rfc1002mangle(ses_init_buf->trailer.
session_req.calling_name,
server->workstation_RFC1001_name,
@@ -3692,6 +3695,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
#endif /* CIFS_WEAK_PW_HASH */
rc = SMBNTencrypt(tcon->password, ses->server->cryptkey,
bcc_ptr, nls_codepage);
+ if (rc) {
+ cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n",
+ __func__, rc);
+ cifs_buf_release(smb_buffer);
+ return rc;
+ }
bcc_ptr += CIFS_AUTH_RESP_SIZE;
if (ses->capabilities & CAP_UNICODE) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a94b3e673182..ca30c391a894 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1823,6 +1823,7 @@ refind_writable:
cifsFileInfo_put(inv_file);
spin_lock(&cifs_file_list_lock);
++refind;
+ inv_file = NULL;
goto refind_writable;
}
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 2d4f37235ed0..3e126d7bb2ea 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -771,6 +771,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
cifs_buf_release(srchinf->ntwrk_buf_start);
}
kfree(srchinf);
+ if (rc)
+ goto cgii_exit;
} else
goto cgii_exit;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 689f035915cf..22dfdf17d065 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -322,7 +322,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
/* return pointer to beginning of data area, ie offset from SMB start */
if ((*off != 0) && (*len != 0))
- return hdr->ProtocolId + *off;
+ return (char *)(&hdr->ProtocolId[0]) + *off;
else
return NULL;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 96b5d40a2ece..eab05e1aa587 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -684,7 +684,8 @@ smb2_clone_range(const unsigned int xid,
/* No need to change MaxChunks since already set to 1 */
chunk_sizes_updated = true;
- }
+ } else
+ goto cchunk_out;
}
cchunk_out:
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 3417340bf89e..65cd7a84c8bc 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1218,7 +1218,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
struct smb2_ioctl_req *req;
struct smb2_ioctl_rsp *rsp;
struct TCP_Server_Info *server;
- struct cifs_ses *ses = tcon->ses;
+ struct cifs_ses *ses;
struct kvec iov[2];
int resp_buftype;
int num_iovecs;
@@ -1233,6 +1233,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (plen)
*plen = 0;
+ if (tcon)
+ ses = tcon->ses;
+ else
+ return -EIO;
+
if (ses && (ses->server))
server = ses->server;
else
@@ -1296,14 +1301,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
if ((rc != 0) && (rc != -EINVAL)) {
- if (tcon)
- cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+ cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
goto ioctl_exit;
} else if (rc == -EINVAL) {
if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
(opcode != FSCTL_SRV_COPYCHUNK)) {
- if (tcon)
- cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+ cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
goto ioctl_exit;
}
}
@@ -1629,7 +1632,7 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
- if ((rc != 0) && tcon)
+ if (rc != 0)
cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
free_rsp_buf(resp_buftype, iov[0].iov_base);
@@ -2114,7 +2117,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec iov[2];
int rc = 0;
int len;
- int resp_buftype;
+ int resp_buftype = CIFS_NO_BUFFER;
unsigned char *bufptr;
struct TCP_Server_Info *server;
struct cifs_ses *ses = tcon->ses;
diff --git a/fs/dcache.c b/fs/dcache.c
index c71e3732e53b..d99736a63e3c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2690,7 +2690,7 @@ static int __d_unalias(struct inode *inode,
struct dentry *dentry, struct dentry *alias)
{
struct mutex *m1 = NULL, *m2 = NULL;
- int ret = -EBUSY;
+ int ret = -ESTALE;
/* If alias and dentry share a parent, then no extra locks required */
if (alias->d_parent == dentry->d_parent)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 96400ab42d13..61e72d44cf94 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -254,6 +254,9 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
pr_debug("debugfs: creating file '%s'\n",name);
+ if (IS_ERR(parent))
+ return parent;
+
error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
&debugfs_mount_count);
if (error)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b2e297..6fb00e3f1059 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -37,7 +37,6 @@
#include <linux/uio.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
-#include <linux/aio.h>
/*
* How many user pages to map in one call to get_user_pages(). This determines
@@ -265,7 +264,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
ret = err;
}
- aio_complete(dio->iocb, ret, 0);
+ dio->iocb->ki_complete(dio->iocb, ret, 0);
}
kmem_cache_free(dio_cache, dio);
@@ -1056,7 +1055,7 @@ static inline int drop_refcount(struct dio *dio)
* operation. AIO can if it was a broken operation described above or
* in fact if all the bios race to complete before we get here. In
* that case dio_complete() translates the EIOCBQUEUED into the proper
- * return code that the caller will hand to aio_complete().
+ * return code that the caller will hand to ->ki_complete().
*
* This is managed by the bio_lock instead of being an atomic_t so that
* completion paths can drop their ref and use the remaining count to
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 90d1882b306f..5ba029e627cc 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -124,7 +124,7 @@ ecryptfs_get_key_payload_data(struct key *key)
}
#define ECRYPTFS_MAX_KEYSET_SIZE 1024
-#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 32
+#define ECRYPTFS_MAX_CIPHER_NAME_SIZE 31
#define ECRYPTFS_MAX_NUM_ENC_KEYS 64
#define ECRYPTFS_MAX_IV_BYTES 16 /* 128 bits */
#define ECRYPTFS_SALT_BYTES 2
@@ -237,7 +237,7 @@ struct ecryptfs_crypt_stat {
struct crypto_ablkcipher *tfm;
struct crypto_hash *hash_tfm; /* Crypto context for generating
* the initialization vectors */
- unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+ unsigned char cipher[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
unsigned char key[ECRYPTFS_MAX_KEY_BYTES];
unsigned char root_iv[ECRYPTFS_MAX_IV_BYTES];
struct list_head keysig_list;
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index b07731e68c0b..79675089443d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -31,7 +31,6 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
-#include <linux/aio.h>
#include "ecryptfs_kernel.h"
/**
@@ -52,12 +51,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
struct file *file = iocb->ki_filp;
rc = generic_file_read_iter(iocb, to);
- /*
- * Even though this is a async interface, we need to wait
- * for IO to finish to update atime
- */
- if (-EIOCBQUEUED == rc)
- rc = wait_on_sync_kiocb(iocb);
if (rc >= 0) {
path = ecryptfs_dentry_to_lower_path(file->f_path.dentry);
touch_atime(path);
@@ -303,9 +296,22 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct file *lower_file = ecryptfs_file_to_lower(file);
long rc = -ENOTTY;
- if (lower_file->f_op->unlocked_ioctl)
+ if (!lower_file->f_op->unlocked_ioctl)
+ return rc;
+
+ switch (cmd) {
+ case FITRIM:
+ case FS_IOC_GETFLAGS:
+ case FS_IOC_SETFLAGS:
+ case FS_IOC_GETVERSION:
+ case FS_IOC_SETVERSION:
rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
- return rc;
+ fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
+
+ return rc;
+ default:
+ return rc;
+ }
}
#ifdef CONFIG_COMPAT
@@ -315,9 +321,22 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct file *lower_file = ecryptfs_file_to_lower(file);
long rc = -ENOIOCTLCMD;
- if (lower_file->f_op->compat_ioctl)
+ if (!lower_file->f_op->compat_ioctl)
+ return rc;
+
+ switch (cmd) {
+ case FITRIM:
+ case FS_IOC32_GETFLAGS:
+ case FS_IOC32_SETFLAGS:
+ case FS_IOC32_GETVERSION:
+ case FS_IOC32_SETVERSION:
rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
- return rc;
+ fsstack_copy_attr_all(file_inode(file), file_inode(lower_file));
+
+ return rc;
+ default:
+ return rc;
+ }
}
#endif
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 917bd5c9776a..6bd67e2011f0 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -891,7 +891,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack {
struct blkcipher_desc desc;
char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1];
char iv[ECRYPTFS_MAX_IV_BYTES];
- char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE];
+ char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1];
};
/**
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 1895d60f4122..c095d3264259 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -407,7 +407,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options,
if (!cipher_name_set) {
int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER);
- BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE);
+ BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE);
strcpy(mount_crypt_stat->global_default_cipher_name,
ECRYPTFS_DEFAULT_CIPHER);
}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6434bc000125..df9d6afbc5d5 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -31,7 +31,7 @@
#include <linux/mpage.h>
#include <linux/fiemap.h>
#include <linux/namei.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "ext2.h"
#include "acl.h"
#include "xattr.h"
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2c6ccc49ba27..db07ffbe7c85 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -27,7 +27,7 @@
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/namei.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "ext3.h"
#include "xattr.h"
#include "acl.h"
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 33a09da16c9c..598abbbe6786 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -23,9 +23,9 @@
#include <linux/jbd2.h>
#include <linux/mount.h>
#include <linux/path.h>
-#include <linux/aio.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
+#include <linux/uio.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 45fe924f82bc..740c7871c117 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -20,9 +20,9 @@
* (sct@redhat.com), 1993, 1998
*/
-#include <linux/aio.h>
#include "ext4_jbd2.h"
#include "truncate.h"
+#include <linux/uio.h>
#include <trace/events/ext4.h>
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5cb9a212b86f..a3f451370bef 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -37,7 +37,6 @@
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
-#include <linux/aio.h>
#include <linux/bitops.h>
#include "ext4_jbd2.h"
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b24a2541a9ba..464984261e69 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -18,7 +18,6 @@
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/namei.h>
-#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 985ed023a750..497f8515d205 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -12,12 +12,12 @@
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
-#include <linux/aio.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/prefetch.h>
+#include <linux/uio.h>
#include "f2fs.h"
#include "node.h"
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 497c7c5263c7..8521207de229 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -19,7 +19,6 @@
#include <linux/mpage.h>
#include <linux/buffer_head.h>
#include <linux/mount.h>
-#include <linux/aio.h>
#include <linux/vfs.h>
#include <linux/parser.h>
#include <linux/uio.h>
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e907052eeadb..32a8bbd7a9ad 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -53,6 +53,18 @@ struct wb_writeback_work {
struct completion *done; /* set if the caller waits */
};
+/*
+ * If an inode is constantly having its pages dirtied, but then the
+ * updates stop dirtytime_expire_interval seconds in the past, it's
+ * possible for the worst case time between when an inode has its
+ * timestamps updated and when they finally get written out to be two
+ * dirtytime_expire_intervals. We set the default to 12 hours (in
+ * seconds), which means most of the time inodes will have their
+ * timestamps written to disk after 12 hours, but in the worst case a
+ * few inodes might not have their timestamps updated for 24 hours.
+ */
+unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+
/**
* writeback_in_progress - determine whether there is writeback in progress
* @bdi: the device's backing_dev_info structure.
@@ -275,8 +287,8 @@ static int move_expired_inodes(struct list_head *delaying_queue,
if ((flags & EXPIRE_DIRTY_ATIME) == 0)
older_than_this = work->older_than_this;
- else if ((work->reason == WB_REASON_SYNC) == 0) {
- expire_time = jiffies - (HZ * 86400);
+ else if (!work->for_sync) {
+ expire_time = jiffies - (dirtytime_expire_interval * HZ);
older_than_this = &expire_time;
}
while (!list_empty(delaying_queue)) {
@@ -458,6 +470,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
*/
redirty_tail(inode, wb);
} else if (inode->i_state & I_DIRTY_TIME) {
+ inode->dirtied_when = jiffies;
list_move(&inode->i_wb_list, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
@@ -505,12 +518,17 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
- if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
- (inode->i_state & I_DIRTY_TIME)) ||
- (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
- dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
- trace_writeback_lazytime(inode);
- }
+ if (inode->i_state & I_DIRTY_TIME) {
+ if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
+ unlikely(time_after(jiffies,
+ (inode->dirtied_time_when +
+ dirtytime_expire_interval * HZ)))) {
+ dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+ trace_writeback_lazytime(inode);
+ }
+ } else
+ inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
inode->i_state &= ~dirty;
/*
@@ -1131,6 +1149,56 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
rcu_read_unlock();
}
+/*
+ * Wake up bdi's periodically to make sure dirtytime inodes get
+ * written back periodically. We deliberately do *not* check the
+ * b_dirty_time list in wb_has_dirty_io(), since this would cause the
+ * kernel to be constantly waking up once there are any dirtytime
+ * inodes on the system. So instead we define a separate delayed work
+ * function which gets called much more rarely. (By default, only
+ * once every 12 hours.)
+ *
+ * If there is any other write activity going on in the file system,
+ * this function won't be necessary. But if the only thing that has
+ * happened on the file system is a dirtytime inode caused by an atime
+ * update, we need this infrastructure below to make sure that inode
+ * eventually gets pushed out to disk.
+ */
+static void wakeup_dirtytime_writeback(struct work_struct *w);
+static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
+
+static void wakeup_dirtytime_writeback(struct work_struct *w)
+{
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+ if (list_empty(&bdi->wb.b_dirty_time))
+ continue;
+ bdi_wakeup_thread(bdi);
+ }
+ rcu_read_unlock();
+ schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+}
+
+static int __init start_dirtytime_writeback(void)
+{
+ schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+ return 0;
+}
+__initcall(start_dirtytime_writeback);
+
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret;
+
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret == 0 && write)
+ mod_delayed_work(system_wq, &dirtytime_work, 0);
+ return ret;
+}
+
static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
@@ -1269,8 +1337,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
inode->dirtied_when = jiffies;
- list_move(&inode->i_wb_list, dirtytime ?
- &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
+ if (dirtytime)
+ inode->dirtied_time_when = jiffies;
+ if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
+ list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+ else
+ list_move(&inode->i_wb_list,
+ &bdi->wb.b_dirty_time);
spin_unlock(&bdi->wb.list_lock);
trace_writeback_dirty_inode_enqueue(inode);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 28d0c7abba1c..b3fa05032234 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -38,7 +38,6 @@
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
-#include <linux/aio.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/list.h>
@@ -48,6 +47,7 @@
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/module.h>
+#include <linux/uio.h>
#include "fuse_i.h"
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ed19a7d622fa..95a2797eef66 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -19,7 +19,6 @@
#include <linux/pipe_fs_i.h>
#include <linux/swap.h>
#include <linux/splice.h>
-#include <linux/aio.h>
MODULE_ALIAS_MISCDEV(FUSE_MINOR);
MODULE_ALIAS("devname:fuse");
@@ -890,8 +889,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
newpage = buf->page;
- if (WARN_ON(!PageUptodate(newpage)))
- return -EIO;
+ if (!PageUptodate(newpage))
+ SetPageUptodate(newpage);
ClearPageMappedToDisk(newpage);
@@ -1353,6 +1352,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
return err;
}
+static int fuse_dev_open(struct inode *inode, struct file *file)
+{
+ /*
+ * The fuse device's file's private_data is used to hold
+ * the fuse_conn(ection) when it is mounted, and is used to
+ * keep track of whether the file has been mounted already.
+ */
+ file->private_data = NULL;
+ return 0;
+}
+
static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
@@ -1797,6 +1807,9 @@ copy_finish:
static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
unsigned int size, struct fuse_copy_state *cs)
{
+ /* Don't try to move pages (yet) */
+ cs->move_pages = 0;
+
switch (code) {
case FUSE_NOTIFY_POLL:
return fuse_notify_poll(fc, size, cs);
@@ -2217,6 +2230,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
const struct file_operations fuse_dev_operations = {
.owner = THIS_MODULE,
+ .open = fuse_dev_open,
.llseek = no_llseek,
.read = do_sync_read,
.aio_read = fuse_dev_read,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index c01ec3bdcfd8..ff102cbf16ea 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -15,8 +15,8 @@
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
-#include <linux/aio.h>
#include <linux/falloc.h>
+#include <linux/uio.h>
static const struct file_operations fuse_direct_io_file_operations;
@@ -528,6 +528,17 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
}
}
+static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
+{
+ if (io->err)
+ return io->err;
+
+ if (io->bytes >= 0 && io->write)
+ return -EIO;
+
+ return io->bytes < 0 ? io->size : io->bytes;
+}
+
/**
* In case of short read, the caller sets 'pos' to the position of
* actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -546,6 +557,7 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
*/
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
+ bool is_sync = is_sync_kiocb(io->iocb);
int left;
spin_lock(&io->lock);
@@ -555,30 +567,24 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
io->bytes = pos;
left = --io->reqs;
+ if (!left && is_sync)
+ complete(io->done);
spin_unlock(&io->lock);
- if (!left) {
- long res;
+ if (!left && !is_sync) {
+ ssize_t res = fuse_get_res_by_io(io);
- if (io->err)
- res = io->err;
- else if (io->bytes >= 0 && io->write)
- res = -EIO;
- else {
- res = io->bytes < 0 ? io->size : io->bytes;
+ if (res >= 0) {
+ struct inode *inode = file_inode(io->iocb->ki_filp);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
- if (!is_sync_kiocb(io->iocb)) {
- struct inode *inode = file_inode(io->iocb->ki_filp);
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_inode *fi = get_fuse_inode(inode);
-
- spin_lock(&fc->lock);
- fi->attr_version = ++fc->attr_version;
- spin_unlock(&fc->lock);
- }
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ spin_unlock(&fc->lock);
}
- aio_complete(io->iocb, res, 0);
+ io->iocb->ki_complete(io->iocb, res, 0);
kfree(io);
}
}
@@ -2801,6 +2807,7 @@ static ssize_t
fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
loff_t offset)
{
+ DECLARE_COMPLETION_ONSTACK(wait);
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
struct fuse_file *ff = file->private_data;
@@ -2852,6 +2859,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
io->async = false;
+ if (io->async && is_sync_kiocb(iocb))
+ io->done = &wait;
+
if (rw == WRITE)
ret = __fuse_direct_write(io, iter, &pos);
else
@@ -2864,11 +2874,12 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
if (!is_sync_kiocb(iocb))
return -EIOCBQUEUED;
- ret = wait_on_sync_kiocb(iocb);
- } else {
- kfree(io);
+ wait_for_completion(&wait);
+ ret = fuse_get_res_by_io(io);
}
+ kfree(io);
+
if (rw == WRITE) {
if (ret > 0)
fuse_write_update_size(inode, pos);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1cdfb07c1376..7354dc142a50 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -263,6 +263,7 @@ struct fuse_io_priv {
int err;
struct kiocb *iocb;
struct file *file;
+ struct completion *done;
};
/**
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7bc5c82423ea..a6e6990aea39 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,7 +20,7 @@
#include <linux/swap.h>
#include <linux/gfs2_ondisk.h>
#include <linux/backing-dev.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include <trace/events/writeback.h>
#include "gfs2.h"
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4d3108792172..8ec43ab5babf 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,7 +25,6 @@
#include <asm/uaccess.h>
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
-#include <linux/aio.h>
#include <linux/delay.h>
#include "gfs2.h"
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index d0929bc81782..98d4ea45bb70 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -14,7 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "hfs_fs.h"
#include "btree.h"
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 6e560d56094b..754fdf8c6356 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -131,13 +131,16 @@ skip:
hfs_bnode_write(node, entry, data_off + key_len, entry_len);
hfs_bnode_dump(node);
- if (new_node) {
- /* update parent key if we inserted a key
- * at the start of the first node
- */
- if (!rec && new_node != node)
- hfs_brec_update_parent(fd);
+ /*
+ * update parent key if we inserted a key
+ * at the start of the node and it is not the new node
+ */
+ if (!rec && new_node != node) {
+ hfs_bnode_read_key(node, fd->search_key, data_off + size);
+ hfs_brec_update_parent(fd);
+ }
+ if (new_node) {
hfs_bnode_put(fd->bnode);
if (!new_node->parent) {
hfs_btree_inc_height(tree);
@@ -168,9 +171,6 @@ skip:
goto again;
}
- if (!rec)
- hfs_brec_update_parent(fd);
-
return 0;
}
@@ -370,6 +370,8 @@ again:
if (IS_ERR(parent))
return PTR_ERR(parent);
__hfs_brec_find(parent, fd, hfs_find_rec_by_key);
+ if (fd->record < 0)
+ return -ENOENT;
hfs_bnode_dump(parent);
rec = fd->record;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 0cf786f2d046..f541196d4ee9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -14,7 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mpage.h>
#include <linux/sched.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index d72817ac51f6..762c7a3cf43d 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -195,7 +195,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat
/* unchecked xdatum is chained with c->xattr_unchecked */
list_del_init(&xd->xindex);
- dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n",
+ dbg_xattr("success on verifying xdatum (xid=%u, version=%u)\n",
xd->xid, xd->version);
return 0;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index bd3df1ca3c9b..3197aed10614 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -22,8 +22,8 @@
#include <linux/buffer_head.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
+#include <linux/uio.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 5d30c56ae075..4cd9798f4948 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -102,7 +102,7 @@ void jfs_error(struct super_block *sb, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- pr_err("ERROR: (device %s): %pf: %pV\n",
+ pr_err("ERROR: (device %s): %ps: %pV\n",
sb->s_id, __builtin_return_address(0), &vaf);
va_end(args);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index b684e8a132e6..2bacb9988566 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -207,6 +207,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
goto out_free;
}
+ of->event = atomic_read(&of->kn->attr.open->event);
ops = kernfs_ops(of->kn);
if (ops->read)
len = ops->read(of, buf, len, *ppos);
diff --git a/fs/locks.c b/fs/locks.c
index 365c82e1b3a9..40bc384728c0 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1388,9 +1388,8 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
- struct file_lock *new_fl;
struct file_lock_context *ctx = inode->i_flctx;
- struct file_lock *fl;
+ struct file_lock *new_fl, *fl, *tmp;
unsigned long break_time;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
LIST_HEAD(dispose);
@@ -1420,7 +1419,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
break_time++; /* so that 0 means no break time */
}
- list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
if (!leases_conflict(fl, new_fl))
continue;
if (want_write) {
@@ -1665,7 +1664,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
}
if (my_fl != NULL) {
- error = lease->fl_lmops->lm_change(my_fl, arg, &dispose);
+ lease = my_fl;
+ error = lease->fl_lmops->lm_change(lease, arg, &dispose);
if (error)
goto out;
goto out_setup;
@@ -1727,7 +1727,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
break;
}
}
- trace_generic_delete_lease(inode, fl);
+ trace_generic_delete_lease(inode, victim);
if (victim)
error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
spin_unlock(&ctx->flc_lock);
diff --git a/fs/namei.c b/fs/namei.c
index c83145af4bfc..76fb76a0818b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -119,15 +119,14 @@
* PATH_MAX includes the nul terminator --RR.
*/
-#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
+#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
struct filename *
getname_flags(const char __user *filename, int flags, int *empty)
{
- struct filename *result, *err;
- int len;
- long max;
+ struct filename *result;
char *kname;
+ int len;
result = audit_reusename(filename);
if (result)
@@ -136,22 +135,18 @@ getname_flags(const char __user *filename, int flags, int *empty)
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
- result->refcnt = 1;
/*
* First, try to embed the struct filename inside the names_cache
* allocation
*/
- kname = (char *)result + sizeof(*result);
+ kname = (char *)result->iname;
result->name = kname;
- result->separate = false;
- max = EMBEDDED_NAME_MAX;
-recopy:
- len = strncpy_from_user(kname, filename, max);
+ len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
if (unlikely(len < 0)) {
- err = ERR_PTR(len);
- goto error;
+ __putname(result);
+ return ERR_PTR(len);
}
/*
@@ -160,43 +155,49 @@ recopy:
* names_cache allocation for the pathname, and re-do the copy from
* userland.
*/
- if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) {
+ if (unlikely(len == EMBEDDED_NAME_MAX)) {
+ const size_t size = offsetof(struct filename, iname[1]);
kname = (char *)result;
- result = kzalloc(sizeof(*result), GFP_KERNEL);
- if (!result) {
- err = ERR_PTR(-ENOMEM);
- result = (struct filename *)kname;
- goto error;
+ /*
+ * size is chosen this way to guarantee that
+ * result->iname[0] is within the same object and that
+ * kname can't be equal to result->iname, no matter what.
+ */
+ result = kzalloc(size, GFP_KERNEL);
+ if (unlikely(!result)) {
+ __putname(kname);
+ return ERR_PTR(-ENOMEM);
}
result->name = kname;
- result->separate = true;
- result->refcnt = 1;
- max = PATH_MAX;
- goto recopy;
+ len = strncpy_from_user(kname, filename, PATH_MAX);
+ if (unlikely(len < 0)) {
+ __putname(kname);
+ kfree(result);
+ return ERR_PTR(len);
+ }
+ if (unlikely(len == PATH_MAX)) {
+ __putname(kname);
+ kfree(result);
+ return ERR_PTR(-ENAMETOOLONG);
+ }
}
+ result->refcnt = 1;
/* The empty path is special. */
if (unlikely(!len)) {
if (empty)
*empty = 1;
- err = ERR_PTR(-ENOENT);
- if (!(flags & LOOKUP_EMPTY))
- goto error;
+ if (!(flags & LOOKUP_EMPTY)) {
+ putname(result);
+ return ERR_PTR(-ENOENT);
+ }
}
- err = ERR_PTR(-ENAMETOOLONG);
- if (unlikely(len >= PATH_MAX))
- goto error;
-
result->uptr = filename;
result->aname = NULL;
audit_getname(result);
return result;
-
-error:
- putname(result);
- return err;
}
struct filename *
@@ -216,8 +217,7 @@ getname_kernel(const char * filename)
return ERR_PTR(-ENOMEM);
if (len <= EMBEDDED_NAME_MAX) {
- result->name = (char *)(result) + sizeof(*result);
- result->separate = false;
+ result->name = (char *)result->iname;
} else if (len <= PATH_MAX) {
struct filename *tmp;
@@ -227,7 +227,6 @@ getname_kernel(const char * filename)
return ERR_PTR(-ENOMEM);
}
tmp->name = (char *)result;
- tmp->separate = true;
result = tmp;
} else {
__putname(result);
@@ -249,7 +248,7 @@ void putname(struct filename *name)
if (--name->refcnt > 0)
return;
- if (name->separate) {
+ if (name->name != name->iname) {
__putname(name->name);
kfree(name);
} else
@@ -1851,10 +1850,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
return err;
}
-static int path_init(int dfd, const char *name, unsigned int flags,
+static int path_init(int dfd, const struct filename *name, unsigned int flags,
struct nameidata *nd)
{
int retval = 0;
+ const char *s = name->name;
nd->last_type = LAST_ROOT; /* if there are only slashes... */
nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT;
@@ -1863,7 +1863,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
if (flags & LOOKUP_ROOT) {
struct dentry *root = nd->root.dentry;
struct inode *inode = root->d_inode;
- if (*name) {
+ if (*s) {
if (!d_can_lookup(root))
return -ENOTDIR;
retval = inode_permission(inode, MAY_EXEC);
@@ -1885,7 +1885,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->root.mnt = NULL;
nd->m_seq = read_seqbegin(&mount_lock);
- if (*name=='/') {
+ if (*s == '/') {
if (flags & LOOKUP_RCU) {
rcu_read_lock();
nd->seq = set_root_rcu(nd);
@@ -1919,7 +1919,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
dentry = f.file->f_path.dentry;
- if (*name) {
+ if (*s) {
if (!d_can_lookup(dentry)) {
fdput(f);
return -ENOTDIR;
@@ -1949,7 +1949,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
return -ECHILD;
done:
current->total_link_count = 0;
- return link_path_walk(name, nd);
+ return link_path_walk(s, nd);
}
static void path_cleanup(struct nameidata *nd)
@@ -1972,7 +1972,7 @@ static inline int lookup_last(struct nameidata *nd, struct path *path)
}
/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int path_lookupat(int dfd, const char *name,
+static int path_lookupat(int dfd, const struct filename *name,
unsigned int flags, struct nameidata *nd)
{
struct path path;
@@ -2027,31 +2027,17 @@ static int path_lookupat(int dfd, const char *name,
static int filename_lookup(int dfd, struct filename *name,
unsigned int flags, struct nameidata *nd)
{
- int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd);
+ int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
if (unlikely(retval == -ECHILD))
- retval = path_lookupat(dfd, name->name, flags, nd);
+ retval = path_lookupat(dfd, name, flags, nd);
if (unlikely(retval == -ESTALE))
- retval = path_lookupat(dfd, name->name,
- flags | LOOKUP_REVAL, nd);
+ retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
if (likely(!retval))
audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT);
return retval;
}
-static int do_path_lookup(int dfd, const char *name,
- unsigned int flags, struct nameidata *nd)
-{
- struct filename *filename = getname_kernel(name);
- int retval = PTR_ERR(filename);
-
- if (!IS_ERR(filename)) {
- retval = filename_lookup(dfd, filename, flags, nd);
- putname(filename);
- }
- return retval;
-}
-
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
@@ -2089,9 +2075,15 @@ out:
int kern_path(const char *name, unsigned int flags, struct path *path)
{
struct nameidata nd;
- int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
- if (!res)
- *path = nd.path;
+ struct filename *filename = getname_kernel(name);
+ int res = PTR_ERR(filename);
+
+ if (!IS_ERR(filename)) {
+ res = filename_lookup(AT_FDCWD, filename, flags, &nd);
+ putname(filename);
+ if (!res)
+ *path = nd.path;
+ }
return res;
}
EXPORT_SYMBOL(kern_path);
@@ -2108,15 +2100,22 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
const char *name, unsigned int flags,
struct path *path)
{
- struct nameidata nd;
- int err;
- nd.root.dentry = dentry;
- nd.root.mnt = mnt;
+ struct filename *filename = getname_kernel(name);
+ int err = PTR_ERR(filename);
+
BUG_ON(flags & LOOKUP_PARENT);
- /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
- err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd);
- if (!err)
- *path = nd.path;
+
+ /* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */
+ if (!IS_ERR(filename)) {
+ struct nameidata nd;
+ nd.root.dentry = dentry;
+ nd.root.mnt = mnt;
+ err = filename_lookup(AT_FDCWD, filename,
+ flags | LOOKUP_ROOT, &nd);
+ if (!err)
+ *path = nd.path;
+ putname(filename);
+ }
return err;
}
EXPORT_SYMBOL(vfs_path_lookup);
@@ -2138,9 +2137,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
* @len: maximum length @len should be interpreted to
*
* Note that this routine is purely a helper for filesystem usage and should
- * not be called by generic code. Also note that by using this function the
- * nameidata argument is passed to the filesystem methods and a filesystem
- * using this helper needs to be prepared for that.
+ * not be called by generic code.
*/
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
@@ -2341,7 +2338,8 @@ out:
* Returns 0 and "path" will be valid on success; Returns error otherwise.
*/
static int
-path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
+path_mountpoint(int dfd, const struct filename *name, struct path *path,
+ unsigned int flags)
{
struct nameidata nd;
int err;
@@ -2370,20 +2368,20 @@ out:
}
static int
-filename_mountpoint(int dfd, struct filename *s, struct path *path,
+filename_mountpoint(int dfd, struct filename *name, struct path *path,
unsigned int flags)
{
int error;
- if (IS_ERR(s))
- return PTR_ERR(s);
- error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+ error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU);
if (unlikely(error == -ECHILD))
- error = path_mountpoint(dfd, s->name, path, flags);
+ error = path_mountpoint(dfd, name, path, flags);
if (unlikely(error == -ESTALE))
- error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
+ error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL);
if (likely(!error))
- audit_inode(s, path->dentry, 0);
- putname(s);
+ audit_inode(name, path->dentry, 0);
+ putname(name);
return error;
}
@@ -3156,7 +3154,7 @@ static int do_tmpfile(int dfd, struct filename *pathname,
static const struct qstr name = QSTR_INIT("/", 1);
struct dentry *dentry, *child;
struct inode *dir;
- int error = path_lookupat(dfd, pathname->name,
+ int error = path_lookupat(dfd, pathname,
flags | LOOKUP_DIRECTORY, nd);
if (unlikely(error))
return error;
@@ -3229,7 +3227,7 @@ static struct file *path_openat(int dfd, struct filename *pathname,
goto out;
}
- error = path_init(dfd, pathname->name, flags, nd);
+ error = path_init(dfd, pathname, flags, nd);
if (unlikely(error))
goto out;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index f9f4845db989..19874151e95c 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -433,7 +433,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
static bool nfs_client_init_is_complete(const struct nfs_client *clp)
{
- return clp->cl_cons_state != NFS_CS_INITING;
+ return clp->cl_cons_state <= NFS_CS_READY;
}
int nfs_wait_client_init_complete(const struct nfs_client *clp)
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index a1f0685b42ff..a6ad68865880 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -181,8 +181,8 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
clear_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags);
spin_unlock(&delegation->lock);
- put_rpccred(oldcred);
rcu_read_unlock();
+ put_rpccred(oldcred);
trace_nfs4_reclaim_delegation(inode, res->delegation_type);
} else {
/* We appear to have raced with a delegation return. */
@@ -370,7 +370,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
delegation = NULL;
goto out;
}
- freeme = nfs_detach_delegation_locked(nfsi,
+ if (test_and_set_bit(NFS_DELEGATION_RETURNING,
+ &old_delegation->flags))
+ goto out;
+ freeme = nfs_detach_delegation_locked(nfsi,
old_delegation, clp);
if (freeme == NULL)
goto out;
@@ -433,6 +436,8 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
{
bool ret = false;
+ if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+ goto out;
if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
ret = true;
if (test_and_clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags) && !ret) {
@@ -444,6 +449,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
ret = true;
spin_unlock(&delegation->lock);
}
+out:
return ret;
}
@@ -471,14 +477,20 @@ restart:
super_list) {
if (!nfs_delegation_need_return(delegation))
continue;
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL)
+ if (!nfs_sb_active(server->super))
continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
+ rcu_read_unlock();
+ nfs_sb_deactive(server->super);
+ goto restart;
+ }
delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
err = nfs_end_delegation_return(inode, delegation, 0);
iput(inode);
+ nfs_sb_deactive(server->super);
if (!err)
goto restart;
set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
@@ -809,19 +821,30 @@ restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry_rcu(delegation, &server->delegations,
super_list) {
+ if (test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags))
+ continue;
if (test_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags) == 0)
continue;
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL)
+ if (!nfs_sb_active(server->super))
continue;
- delegation = nfs_detach_delegation(NFS_I(inode),
- delegation, server);
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
+ rcu_read_unlock();
+ nfs_sb_deactive(server->super);
+ goto restart;
+ }
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
rcu_read_unlock();
-
- if (delegation != NULL)
- nfs_free_delegation(delegation);
+ if (delegation != NULL) {
+ delegation = nfs_detach_delegation(NFS_I(inode),
+ delegation, server);
+ if (delegation != NULL)
+ nfs_free_delegation(delegation);
+ }
iput(inode);
+ nfs_sb_deactive(server->super);
goto restart;
}
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9b0c55cb2a2e..c19e16f0b2d0 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -408,14 +408,22 @@ static int xdr_decode(nfs_readdir_descriptor_t *desc,
return 0;
}
+/* Match file and dirent using either filehandle or fileid
+ * Note: caller is responsible for checking the fsid
+ */
static
int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
{
+ struct nfs_inode *nfsi;
+
if (dentry->d_inode == NULL)
goto different;
- if (nfs_compare_fh(entry->fh, NFS_FH(dentry->d_inode)) != 0)
- goto different;
- return 1;
+
+ nfsi = NFS_I(dentry->d_inode);
+ if (entry->fattr->fileid == nfsi->fileid)
+ return 1;
+ if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
+ return 1;
different:
return 0;
}
@@ -469,6 +477,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
struct inode *inode;
int status;
+ if (!(entry->fattr->valid & NFS_ATTR_FATTR_FILEID))
+ return;
+ if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
+ return;
if (filename.name[0] == '.') {
if (filename.len == 1)
return;
@@ -479,6 +491,10 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
dentry = d_lookup(parent, &filename);
if (dentry != NULL) {
+ /* Is there a mountpoint here? If so, just exit */
+ if (!nfs_fsid_equal(&NFS_SB(dentry->d_sb)->fsid,
+ &entry->fattr->fsid))
+ goto out;
if (nfs_same_file(dentry, entry)) {
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
status = nfs_refresh_inode(dentry->d_inode, entry->fattr);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index e907c8cf732e..c3929fb2ab26 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -265,7 +265,7 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t
return -EINVAL;
#else
- VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
+ VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (rw == READ)
return nfs_file_direct_read(iocb, iter, pos);
@@ -393,7 +393,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
long res = (long) dreq->error;
if (!res)
res = (long) dreq->count;
- aio_complete(dreq->iocb, res, 0);
+ dreq->iocb->ki_complete(dreq->iocb, res, 0);
}
complete_all(&dreq->completion);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 94712fc781fa..37b15582e0de 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -26,7 +26,6 @@
#include <linux/nfs_mount.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
-#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/swap.h>
@@ -178,7 +177,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
iocb->ki_filp,
iov_iter_count(to), (unsigned long) iocb->ki_pos);
- result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
+ result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
if (!result) {
result = generic_file_read_iter(iocb, to);
if (result > 0)
@@ -199,7 +198,7 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
filp, (unsigned long) count, (unsigned long long) *ppos);
- res = nfs_revalidate_mapping(inode, filp->f_mapping);
+ res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
if (!res) {
res = generic_file_splice_read(filp, ppos, pipe, count, flags);
if (res > 0)
@@ -372,6 +371,10 @@ start:
nfs_wait_bit_killable, TASK_KILLABLE);
if (ret)
return ret;
+ /*
+ * Wait for O_DIRECT to complete
+ */
+ nfs_inode_dio_wait(mapping->host);
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
@@ -619,6 +622,9 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
/* make sure the cache has finished storing the page */
nfs_fscache_wait_on_page_write(NFS_I(inode), page);
+ wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+
lock_page(page);
mapping = page_file_mapping(page);
if (mapping != inode->i_mapping)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 83107be3dd01..d42dff6d5e98 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -556,6 +556,7 @@ EXPORT_SYMBOL_GPL(nfs_setattr);
* This is a copy of the common vmtruncate, but with the locking
* corrected to take into account the fact that NFS requires
* inode->i_size to be updated under the inode->i_lock.
+ * Note: must be called with inode->i_lock held!
*/
static int nfs_vmtruncate(struct inode * inode, loff_t offset)
{
@@ -565,14 +566,14 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
if (err)
goto out;
- spin_lock(&inode->i_lock);
i_size_write(inode, offset);
/* Optimisation */
if (offset == 0)
NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&inode->i_lock);
truncate_pagecache(inode, offset);
+ spin_lock(&inode->i_lock);
out:
return err;
}
@@ -585,10 +586,15 @@ out:
* Note: we do this in the *proc.c in order to ensure that
* it works for things like exclusive creates too.
*/
-void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
+void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr,
+ struct nfs_fattr *fattr)
{
+ /* Barrier: bump the attribute generation count. */
+ nfs_fattr_set_barrier(fattr);
+
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->attr_gencount = fattr->gencount;
if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
- spin_lock(&inode->i_lock);
if ((attr->ia_valid & ATTR_MODE) != 0) {
int mode = attr->ia_mode & S_IALLUGO;
mode |= inode->i_mode & ~S_IALLUGO;
@@ -600,12 +606,13 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
inode->i_gid = attr->ia_gid;
nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL);
- spin_unlock(&inode->i_lock);
}
if ((attr->ia_valid & ATTR_SIZE) != 0) {
nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC);
nfs_vmtruncate(inode, attr->ia_size);
}
+ nfs_update_inode(inode, fattr);
+ spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
@@ -1028,6 +1035,7 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
if (mapping->nrpages != 0) {
if (S_ISREG(inode->i_mode)) {
+ unmap_mapping_range(mapping, 0, 0, 0);
ret = nfs_sync_mapping(mapping);
if (ret < 0)
return ret;
@@ -1060,11 +1068,14 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
}
/**
- * nfs_revalidate_mapping - Revalidate the pagecache
+ * __nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
* @mapping - pointer to mapping
+ * @may_lock - take inode->i_mutex?
*/
-int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+static int __nfs_revalidate_mapping(struct inode *inode,
+ struct address_space *mapping,
+ bool may_lock)
{
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags;
@@ -1113,7 +1124,12 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
+ if (may_lock) {
+ mutex_lock(&inode->i_mutex);
+ ret = nfs_invalidate_mapping(inode, mapping);
+ mutex_unlock(&inode->i_mutex);
+ } else
+ ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@@ -1123,6 +1139,29 @@ out:
return ret;
}
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ */
+int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+ return __nfs_revalidate_mapping(inode, mapping, false);
+}
+
+/**
+ * nfs_revalidate_mapping_protected - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ *
+ * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
+ * while invalidating the mapping.
+ */
+int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
+{
+ return __nfs_revalidate_mapping(inode, mapping, true);
+}
+
static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
{
struct nfs_inode *nfsi = NFS_I(inode);
@@ -1231,13 +1270,6 @@ static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fat
return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
}
-static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
-{
- if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
- return 0;
- return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
-}
-
static atomic_long_t nfs_attr_generation_counter;
static unsigned long nfs_read_attr_generation_counter(void)
@@ -1249,6 +1281,7 @@ unsigned long nfs_inc_attr_generation_counter(void)
{
return atomic_long_inc_return(&nfs_attr_generation_counter);
}
+EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter);
void nfs_fattr_init(struct nfs_fattr *fattr)
{
@@ -1260,6 +1293,22 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
}
EXPORT_SYMBOL_GPL(nfs_fattr_init);
+/**
+ * nfs_fattr_set_barrier
+ * @fattr: attributes
+ *
+ * Used to set a barrier after an attribute was updated. This
+ * barrier ensures that older attributes from RPC calls that may
+ * have raced with our update cannot clobber these new values.
+ * Note that you are still responsible for ensuring that other
+ * operations which change the attribute on the server do not
+ * collide.
+ */
+void nfs_fattr_set_barrier(struct nfs_fattr *fattr)
+{
+ fattr->gencount = nfs_inc_attr_generation_counter();
+}
+
struct nfs_fattr *nfs_alloc_fattr(void)
{
struct nfs_fattr *fattr;
@@ -1370,7 +1419,6 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 ||
nfs_ctime_need_update(inode, fattr) ||
- nfs_size_need_update(inode, fattr) ||
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
@@ -1460,6 +1508,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
int status;
spin_lock(&inode->i_lock);
+ nfs_fattr_set_barrier(fattr);
status = nfs_post_op_update_inode_locked(inode, fattr);
spin_unlock(&inode->i_lock);
@@ -1468,7 +1517,7 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr)
EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
/**
- * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache
* @inode - pointer to inode
* @fattr - updated attributes
*
@@ -1478,11 +1527,10 @@ EXPORT_SYMBOL_GPL(nfs_post_op_update_inode);
*
* This function is mainly designed to be used by the ->write_done() functions.
*/
-int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr)
{
int status;
- spin_lock(&inode->i_lock);
/* Don't do a WCC update if these attributes are already stale */
if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
!nfs_inode_attrs_need_update(inode, fattr)) {
@@ -1514,6 +1562,27 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
}
out_noforce:
status = nfs_post_op_update_inode_locked(inode, fattr);
+ return status;
+}
+
+/**
+ * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache
+ * @inode - pointer to inode
+ * @fattr - updated attributes
+ *
+ * After an operation that has changed the inode metadata, mark the
+ * attribute cache as being invalid, then try to update it. Fake up
+ * weak cache consistency data, if none exist.
+ *
+ * This function is mainly designed to be used by the ->write_done() functions.
+ */
+int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr)
+{
+ int status;
+
+ spin_lock(&inode->i_lock);
+ nfs_fattr_set_barrier(fattr);
+ status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
spin_unlock(&inode->i_lock);
return status;
}
@@ -1715,6 +1784,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
+ /* Set barrier to be more recent than all outstanding updates */
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
@@ -1722,6 +1792,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
}
+ /* Set the barrier to be more recent than this fattr */
+ if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+ nfsi->attr_gencount = fattr->gencount;
}
invalid &= ~NFS_INO_INVALID_ATTR;
/* Don't invalidate the data if we were to blame */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index b802fb3a2d99..9e6475bc5ba2 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -459,6 +459,7 @@ void nfs_mark_request_commit(struct nfs_page *req,
struct nfs_commit_info *cinfo,
u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 78e557c3ab87..1f11d2533ee4 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -138,7 +138,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
- nfs_setattr_update_inode(inode, sattr);
+ nfs_setattr_update_inode(inode, sattr, fattr);
dprintk("NFS reply setattr: %d\n", status);
return status;
}
@@ -834,7 +834,7 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
if (task->tk_status >= 0)
- nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
+ nfs_writeback_update_inode(hdr);
return 0;
}
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 2a932fdc57cb..53852a4bd88b 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1987,6 +1987,11 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
+ if (entry->fattr->fileid != entry->ino) {
+ entry->fattr->mounted_on_fileid = entry->ino;
+ entry->fattr->valid |= NFS_ATTR_FATTR_MOUNTED_ON_FILEID;
+ }
+
/* In fact, a post_op_fh3: */
p = xdr_inline_decode(xdr, 4);
if (unlikely(p == NULL))
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8646af9b11d2..86d6214ea022 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -621,6 +621,9 @@ int nfs41_walk_client_list(struct nfs_client *new,
spin_lock(&nn->nfs_client_lock);
list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+ if (pos == new)
+ goto found;
+
if (pos->rpc_ops != new->rpc_ops)
continue;
@@ -639,10 +642,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
prev = pos;
status = nfs_wait_client_init_complete(pos);
- if (pos->cl_cons_state == NFS_CS_SESSION_INITING) {
- nfs4_schedule_lease_recovery(pos);
- status = nfs4_wait_clnt_recover(pos);
- }
spin_lock(&nn->nfs_client_lock);
if (status < 0)
break;
@@ -668,7 +667,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
*/
if (!nfs4_match_client_owner_id(pos, new))
continue;
-
+found:
atomic_inc(&pos->cl_count);
*result = pos;
status = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 88180ac5ea0e..627f37c44456 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -901,6 +901,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
if (!cinfo->atomic || cinfo->before != dir->i_version)
nfs_force_lookup_revalidate(dir);
dir->i_version = cinfo->after;
+ nfsi->attr_gencount = nfs_inc_attr_generation_counter();
nfs_fscache_invalidate(dir);
spin_unlock(&dir->i_lock);
}
@@ -1552,6 +1553,9 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
+ opendata->o_arg.share_access = nfs4_map_atomic_open_share(
+ NFS_SB(opendata->dentry->d_sb),
+ fmode, 0);
memset(&opendata->o_res, 0, sizeof(opendata->o_res));
memset(&opendata->c_res, 0, sizeof(opendata->c_res));
nfs4_init_opendata_res(opendata);
@@ -2413,8 +2417,8 @@ static int _nfs4_do_open(struct inode *dir,
opendata->o_res.f_attr, sattr,
state, label, olabel);
if (status == 0) {
- nfs_setattr_update_inode(state->inode, sattr);
- nfs_post_op_update_inode(state->inode, opendata->o_res.f_attr);
+ nfs_setattr_update_inode(state->inode, sattr,
+ opendata->o_res.f_attr);
nfs_setsecurity(state->inode, opendata->o_res.f_attr, olabel);
}
}
@@ -2651,7 +2655,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_EXPIRED:
if (!nfs4_stateid_match(&calldata->arg.stateid,
- &state->stateid)) {
+ &state->open_stateid)) {
rpc_restart_call_prepare(task);
goto out_release;
}
@@ -2687,7 +2691,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
- nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
+ nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid);
/* Calculate the change in open mode */
calldata->arg.fmode = 0;
if (state->n_rdwr == 0) {
@@ -3288,7 +3292,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label);
if (status == 0) {
- nfs_setattr_update_inode(inode, sattr);
+ nfs_setattr_update_inode(inode, sattr, fattr);
nfs_setsecurity(inode, fattr, label);
}
nfs4_label_free(label);
@@ -4234,7 +4238,7 @@ static int nfs4_write_done_cb(struct rpc_task *task,
}
if (task->tk_status >= 0) {
renew_lease(NFS_SERVER(inode), hdr->timestamp);
- nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
+ nfs_writeback_update_inode(hdr);
}
return 0;
}
@@ -6893,9 +6897,13 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
if (status == 0) {
clp->cl_clientid = res.clientid;
- clp->cl_exchange_flags = (res.flags & ~EXCHGID4_FLAG_CONFIRMED_R);
- if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R))
+ clp->cl_exchange_flags = res.flags;
+ /* Client ID is not confirmed */
+ if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+ clear_bit(NFS4_SESSION_ESTABLISHED,
+ &clp->cl_session->session_state);
clp->cl_seqid = res.seqid;
+ }
kfree(clp->cl_serverowner);
clp->cl_serverowner = res.server_owner;
@@ -7227,6 +7235,9 @@ static void nfs4_update_session(struct nfs4_session *session,
struct nfs41_create_session_res *res)
{
nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+ /* Mark client id and session as being confirmed */
+ session->clp->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+ set_bit(NFS4_SESSION_ESTABLISHED, &session->session_state);
session->flags = res->flags;
memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
if (res->flags & SESSION4_BACK_CHAN)
@@ -7322,8 +7333,8 @@ int nfs4_proc_destroy_session(struct nfs4_session *session,
dprintk("--> nfs4_proc_destroy_session\n");
/* session is still being setup */
- if (session->clp->cl_cons_state != NFS_CS_READY)
- return status;
+ if (!test_and_clear_bit(NFS4_SESSION_ESTABLISHED, &session->session_state))
+ return 0;
status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
trace_nfs4_destroy_session(session->clp, status);
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index fc46c7455898..e3ea2c5324d6 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -70,6 +70,7 @@ struct nfs4_session {
enum nfs4_session_state {
NFS4_SESSION_INITING,
+ NFS4_SESSION_ESTABLISHED,
};
extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5ad908e9ce9c..f95e3b58bbc3 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -346,9 +346,23 @@ int nfs41_discover_server_trunking(struct nfs_client *clp,
status = nfs4_proc_exchange_id(clp, cred);
if (status != NFS4_OK)
return status;
- set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
- return nfs41_walk_client_list(clp, result, cred);
+ status = nfs41_walk_client_list(clp, result, cred);
+ if (status < 0)
+ return status;
+ if (clp != *result)
+ return 0;
+
+ /* Purge state if the client id was established in a prior instance */
+ if (clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R)
+ set_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state);
+ else
+ set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
+ status = nfs_wait_client_init_complete(clp);
+ if (status < 0)
+ nfs_put_client(clp);
+ return status;
}
#endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index b09cc23d6f43..c63189acd052 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -139,7 +139,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
nfs_fattr_init(fattr);
status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
if (status == 0)
- nfs_setattr_update_inode(inode, sattr);
+ nfs_setattr_update_inode(inode, sattr, fattr);
dprintk("NFS reply setattr: %d\n", status);
return status;
}
@@ -609,10 +609,8 @@ static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
- struct inode *inode = hdr->inode;
-
if (task->tk_status >= 0)
- nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
+ nfs_writeback_update_inode(hdr);
return 0;
}
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 595d81e354d1..849ed784d6ac 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1377,6 +1377,36 @@ static int nfs_should_remove_suid(const struct inode *inode)
return 0;
}
+static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr,
+ struct nfs_fattr *fattr)
+{
+ struct nfs_pgio_args *argp = &hdr->args;
+ struct nfs_pgio_res *resp = &hdr->res;
+
+ if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+ return;
+ if (argp->offset + resp->count != fattr->size)
+ return;
+ if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode))
+ return;
+ /* Set attribute barrier */
+ nfs_fattr_set_barrier(fattr);
+}
+
+void nfs_writeback_update_inode(struct nfs_pgio_header *hdr)
+{
+ struct nfs_fattr *fattr = hdr->res.fattr;
+ struct inode *inode = hdr->inode;
+
+ if (fattr == NULL)
+ return;
+ spin_lock(&inode->i_lock);
+ nfs_writeback_check_extend(hdr, fattr);
+ nfs_post_op_update_inode_force_wcc_locked(inode, fattr);
+ spin_unlock(&inode->i_lock);
+}
+EXPORT_SYMBOL_GPL(nfs_writeback_update_inode);
+
/*
* This function is called when the WRITE call is complete.
*/
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index cdbc78c72542..03d647bf195d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -137,7 +137,7 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
seg->offset = iomap.offset;
seg->length = iomap.length;
- dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
+ dprintk("GET: 0x%llx:0x%llx %d\n", bex->foff, bex->len, bex->es);
return 0;
out_error:
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 9da89fddab33..9aa2796da90d 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -122,19 +122,19 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
p = xdr_decode_hyper(p, &bex.foff);
if (bex.foff & (block_size - 1)) {
- dprintk("%s: unaligned offset %lld\n",
+ dprintk("%s: unaligned offset 0x%llx\n",
__func__, bex.foff);
goto fail;
}
p = xdr_decode_hyper(p, &bex.len);
if (bex.len & (block_size - 1)) {
- dprintk("%s: unaligned length %lld\n",
+ dprintk("%s: unaligned length 0x%llx\n",
__func__, bex.foff);
goto fail;
}
p = xdr_decode_hyper(p, &bex.soff);
if (bex.soff & (block_size - 1)) {
- dprintk("%s: unaligned disk offset %lld\n",
+ dprintk("%s: unaligned disk offset 0x%llx\n",
__func__, bex.soff);
goto fail;
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 3c1bfa155571..6904213a4363 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -118,7 +118,7 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
{
struct super_block *sb = exp->ex_path.mnt->mnt_sb;
- if (exp->ex_flags & NFSEXP_NOPNFS)
+ if (!(exp->ex_flags & NFSEXP_PNFS))
return;
if (sb->s_export_op->get_uuid &&
@@ -440,15 +440,14 @@ nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
list_move_tail(&lp->lo_perstate, reaplist);
return;
}
- end = seg->offset;
+ lo->offset = layout_end(seg);
} else {
/* retain the whole layout segment on a split. */
if (layout_end(seg) < end) {
dprintk("%s: split not supported\n", __func__);
return;
}
-
- lo->offset = layout_end(seg);
+ end = seg->offset;
}
layout_update_len(lo, end);
@@ -513,6 +512,9 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp,
spin_lock(&clp->cl_lock);
list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+ if (ls->ls_layout_type != lrp->lr_layout_type)
+ continue;
+
if (lrp->lr_return_type == RETURN_FSID &&
!fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
&cstate->current_fh.fh_handle))
@@ -587,7 +589,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
- nfsd4_cb_layout_fail(ls);
+ trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
printk(KERN_WARNING
"nfsd: client %s failed to respond to layout recall. "
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index d30bea8d0277..92b9d97aff4f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1237,8 +1237,8 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
gdp->gd_notify_types &= ops->notify_types;
- exp_put(exp);
out:
+ exp_put(exp);
return nfserr;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f6b2a09f793f..8ba1d888f1e6 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1638,7 +1638,7 @@ __destroy_client(struct nfs4_client *clp)
nfs4_put_stid(&dp->dl_stid);
}
while (!list_empty(&clp->cl_revoked)) {
- dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
+ dp = list_entry(clp->cl_revoked.next, struct nfs4_delegation, dl_recall_lru);
list_del_init(&dp->dl_recall_lru);
nfs4_put_stid(&dp->dl_stid);
}
@@ -3221,7 +3221,7 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
} else
nfs4_free_openowner(&oo->oo_owner);
spin_unlock(&clp->cl_lock);
- return oo;
+ return ret;
}
static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
@@ -5062,7 +5062,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
} else
nfs4_free_lockowner(&lo->lo_owner);
spin_unlock(&clp->cl_lock);
- return lo;
+ return ret;
}
static void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index df5e66caf100..5fb7e78169a6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1562,7 +1562,11 @@ nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
p = xdr_decode_hyper(p, &lgp->lg_seg.length);
p = xdr_decode_hyper(p, &lgp->lg_minlength);
- nfsd4_decode_stateid(argp, &lgp->lg_sid);
+
+ status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
+ if (status)
+ return status;
+
READ_BUF(4);
lgp->lg_maxcount = be32_to_cpup(p++);
@@ -1580,7 +1584,11 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
p = xdr_decode_hyper(p, &lcp->lc_seg.length);
lcp->lc_reclaim = be32_to_cpup(p++);
- nfsd4_decode_stateid(argp, &lcp->lc_sid);
+
+ status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
+ if (status)
+ return status;
+
READ_BUF(4);
lcp->lc_newoffset = be32_to_cpup(p++);
if (lcp->lc_newoffset) {
@@ -1628,7 +1636,11 @@ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
READ_BUF(16);
p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
p = xdr_decode_hyper(p, &lrp->lr_seg.length);
- nfsd4_decode_stateid(argp, &lrp->lr_sid);
+
+ status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
+ if (status)
+ return status;
+
READ_BUF(4);
lrp->lrf_body_len = be32_to_cpup(p++);
if (lrp->lrf_body_len > 0) {
@@ -4123,7 +4135,7 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr_resource;
*p++ = cpu_to_be32(lrp->lrs_present);
if (lrp->lrs_present)
- nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+ return nfsd4_encode_stateid(xdr, &lrp->lr_sid);
return nfs_ok;
}
#endif /* CONFIG_NFSD_PNFS */
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 83a9694ec485..46ec934f5dee 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -165,13 +165,17 @@ int nfsd_reply_cache_init(void)
{
unsigned int hashsize;
unsigned int i;
+ int status = 0;
max_drc_entries = nfsd_cache_size_limit();
atomic_set(&num_drc_entries, 0);
hashsize = nfsd_hashsize(max_drc_entries);
maskbits = ilog2(hashsize);
- register_shrinker(&nfsd_reply_cache_shrinker);
+ status = register_shrinker(&nfsd_reply_cache_shrinker);
+ if (status)
+ return status;
+
drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
0, 0, NULL);
if (!drc_slab)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index b2e3ff347620..ecdbae19a766 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -31,6 +31,8 @@
#include "alloc.h"
#include "dat.h"
+static void __nilfs_btree_init(struct nilfs_bmap *bmap);
+
static struct nilfs_btree_path *nilfs_btree_alloc_path(void)
{
struct nilfs_btree_path *path;
@@ -368,6 +370,34 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
return ret;
}
+/**
+ * nilfs_btree_root_broken - verify consistency of btree root node
+ * @node: btree root node to be examined
+ * @ino: inode number
+ *
+ * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
+ */
+static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
+				   unsigned long ino)
+{
+	int level, flags, nchildren;
+	int ret = 0;
+
+	level = nilfs_btree_node_get_level(node);
+	flags = nilfs_btree_node_get_flags(node);
+	nchildren = nilfs_btree_node_get_nchildren(node);
+	/* Valid: NODE_MIN <= level < LEVEL_MAX; LEVEL_MAX itself is out of range. */
+	if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN ||
+		     level >= NILFS_BTREE_LEVEL_MAX ||
+		     nchildren < 0 ||
+		     nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
+		pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
+			ino, level, flags, nchildren);
+		ret = 1;
+	}
+	return ret;
+}
+
int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
int ret;
@@ -1713,7 +1743,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree,
/* convert and insert */
dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL;
- nilfs_btree_init(btree);
+ __nilfs_btree_init(btree);
if (nreq != NULL) {
nilfs_bmap_commit_alloc_ptr(btree, dreq, dat);
nilfs_bmap_commit_alloc_ptr(btree, nreq, dat);
@@ -2294,12 +2324,23 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
.bop_gather_data = NULL,
};
-int nilfs_btree_init(struct nilfs_bmap *bmap)
+static void __nilfs_btree_init(struct nilfs_bmap *bmap)
{
bmap->b_ops = &nilfs_btree_ops;
bmap->b_nchildren_per_block =
NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap));
- return 0;
+}
+
+int nilfs_btree_init(struct nilfs_bmap *bmap)
+{
+ int ret = 0;
+
+ __nilfs_btree_init(bmap);
+
+ if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
+ bmap->b_inode->i_ino))
+ ret = -EIO;
+ return ret;
}
void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8b5969538f39..ab4987bc637f 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -26,7 +26,7 @@
#include <linux/mpage.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 469086b9f99b..0c3f303baf32 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
struct nilfs_inode_info *ii, *n;
+ int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
int defer_iput = false;
spin_lock(&nilfs->ns_inode_lock);
@@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
brelse(ii->i_bh);
ii->i_bh = NULL;
list_del_init(&ii->i_dirty);
- if (!ii->vfs_inode.i_nlink) {
+ if (!ii->vfs_inode.i_nlink || during_mount) {
/*
- * Defer calling iput() to avoid a deadlock
- * over I_SYNC flag for inodes with i_nlink == 0
+ * Defer calling iput() to avoid deadlocks if
+ * i_nlink == 0 or mount is not yet finished.
*/
list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
defer_iput = true;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 9a66ff79ff27..d2f97ecca6a5 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -143,7 +143,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
!(marks_mask & FS_ISDIR & ~marks_ignored_mask))
return false;
- if (event_mask & marks_mask & ~marks_ignored_mask)
+ if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
+ ~marks_ignored_mask)
return true;
return false;
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 36ae529511c4..2ff263e6d363 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-ccflags-y := -DNTFS_VERSION=\"2.1.31\"
+ccflags-y := -DNTFS_VERSION=\"2.1.32\"
ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 1da9b2d184dc..c1da78dad1af 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
/*
* file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
*
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
*
* This program/include file is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published
@@ -28,7 +28,6 @@
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/writeback.h>
-#include <linux/aio.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -329,62 +328,168 @@ err_out:
return err;
}
-/**
- * ntfs_fault_in_pages_readable -
- *
- * Fault a number of userspace pages into pagetables.
- *
- * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes
- * with more than two userspace pages as well as handling the single page case
- * elegantly.
- *
- * If you find this difficult to understand, then think of the while loop being
- * the following code, except that we do without the integer variable ret:
- *
- * do {
- * ret = __get_user(c, uaddr);
- * uaddr += PAGE_SIZE;
- * } while (!ret && uaddr < end);
- *
- * Note, the final __get_user() may well run out-of-bounds of the user buffer,
- * but _not_ out-of-bounds of the page the user buffer belongs to, and since
- * this is only a read and not a write, and since it is still in the same page,
- * it should not matter and this makes the code much simpler.
- */
-static inline void ntfs_fault_in_pages_readable(const char __user *uaddr,
- int bytes)
+static ssize_t ntfs_prepare_file_for_write(struct file *file, loff_t *ppos,
+ size_t *count)
{
- const char __user *end;
- volatile char c;
-
- /* Set @end to the first byte outside the last page we care about. */
- end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes);
-
- while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end))
- ;
-}
-
-/**
- * ntfs_fault_in_pages_readable_iovec -
- *
- * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs.
- */
-static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
- size_t iov_ofs, int bytes)
-{
- do {
- const char __user *buf;
- unsigned len;
+ loff_t pos;
+ s64 end, ll;
+ ssize_t err;
+ unsigned long flags;
+ struct inode *vi = file_inode(file);
+ ntfs_inode *base_ni, *ni = NTFS_I(vi);
+ ntfs_volume *vol = ni->vol;
- buf = iov->iov_base + iov_ofs;
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- ntfs_fault_in_pages_readable(buf, len);
- bytes -= len;
- iov++;
- iov_ofs = 0;
- } while (bytes);
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+ "0x%llx, count 0x%lx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)*ppos, (unsigned long)*count);
+ /* We can write back this queue in page reclaim. */
+ current->backing_dev_info = inode_to_bdi(vi);
+ err = generic_write_checks(file, ppos, count, S_ISBLK(vi->i_mode));
+ if (unlikely(err))
+ goto out;
+ /*
+ * All checks have passed. Before we start doing any writing we want
+ * to abort any totally illegal writes.
+ */
+ BUG_ON(NInoMstProtected(ni));
+ BUG_ON(ni->type != AT_DATA);
+ /* If file is encrypted, deny access, just like NT4. */
+ if (NInoEncrypted(ni)) {
+ /* Only $DATA attributes can be encrypted. */
+ /*
+ * Reminder for later: Encrypted files are _always_
+ * non-resident so that the content can always be encrypted.
+ */
+ ntfs_debug("Denying write access to encrypted file.");
+ err = -EACCES;
+ goto out;
+ }
+ if (NInoCompressed(ni)) {
+ /* Only unnamed $DATA attribute can be compressed. */
+ BUG_ON(ni->name_len);
+ /*
+ * Reminder for later: If resident, the data is not actually
+ * compressed. Only on the switch to non-resident does
+ * compression kick in. This is in contrast to encrypted files
+ * (see above).
+ */
+ ntfs_error(vi->i_sb, "Writing to compressed files is not "
+ "implemented yet. Sorry.");
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ if (*count == 0)
+ goto out;
+ base_ni = ni;
+ if (NInoAttr(ni))
+ base_ni = ni->ext.base_ntfs_ino;
+ err = file_remove_suid(file);
+ if (unlikely(err))
+ goto out;
+ /*
+ * Our ->update_time method always succeeds thus file_update_time()
+ * cannot fail either so there is no need to check the return code.
+ */
+ file_update_time(file);
+ pos = *ppos;
+ /* The first byte after the last cluster being written to. */
+ end = (pos + *count + vol->cluster_size_mask) &
+ ~(u64)vol->cluster_size_mask;
+ /*
+ * If the write goes beyond the allocated size, extend the allocation
+ * to cover the whole of the write, rounded up to the nearest cluster.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (end > ll) {
+ /*
+ * Extend the allocation without changing the data size.
+ *
+ * Note we ensure the allocation is big enough to at least
+ * write some data but we do not require the allocation to be
+ * complete, i.e. it may be partial.
+ */
+ ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
+ if (likely(ll >= 0)) {
+ BUG_ON(pos >= ll);
+ /* If the extension was partial truncate the write. */
+ if (end > ll) {
+ ntfs_debug("Truncating write to inode 0x%lx, "
+ "attribute type 0x%x, because "
+ "the allocation was only "
+ "partially extended.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ *count = ll - pos;
+ }
+ } else {
+ err = ll;
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /* Perform a partial write if possible or fail. */
+ if (pos < ll) {
+ ntfs_debug("Truncating write to inode 0x%lx "
+ "attribute type 0x%x, because "
+ "extending the allocation "
+ "failed (error %d).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type),
+ (int)-err);
+ *count = ll - pos;
+ } else {
+ if (err != -ENOSPC)
+ ntfs_error(vi->i_sb, "Cannot perform "
+ "write to inode "
+ "0x%lx, attribute "
+ "type 0x%x, because "
+ "extending the "
+ "allocation failed "
+ "(error %ld).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type),
+ (long)-err);
+ else
+ ntfs_debug("Cannot perform write to "
+ "inode 0x%lx, "
+ "attribute type 0x%x, "
+ "because there is not "
+ "space left.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ goto out;
+ }
+ }
+ }
+ /*
+ * If the write starts beyond the initialized size, extend it up to the
+ * beginning of the write and initialize all non-sparse space between
+ * the old initialized size and the new one. This automatically also
+ * increments the vfs inode->i_size to keep it above or equal to the
+ * initialized_size.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (pos > ll) {
+ /*
+ * Wait for ongoing direct i/o to complete before proceeding.
+ * New direct i/o cannot start as we hold i_mutex.
+ */
+ inode_dio_wait(vi);
+ err = ntfs_attr_extend_initialized(ni, pos);
+ if (unlikely(err < 0))
+ ntfs_error(vi->i_sb, "Cannot perform write to inode "
+ "0x%lx, attribute type 0x%x, because "
+ "extending the initialized size "
+ "failed (error %d).", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (int)-err);
+ }
+out:
+ return err;
}
/**
@@ -421,8 +526,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
goto err_out;
}
}
- err = add_to_page_cache_lru(*cached_page, mapping, index,
- GFP_KERNEL);
+ err = add_to_page_cache_lru(*cached_page, mapping,
+ index, GFP_KERNEL);
if (unlikely(err)) {
if (err == -EEXIST)
continue;
@@ -1268,180 +1373,6 @@ rl_not_mapped_enoent:
return err;
}
-/*
- * Copy as much as we can into the pages and return the number of bytes which
- * were successfully copied. If a fault is encountered then clear the pages
- * out to (ofs + bytes) and return the number of bytes which were copied.
- */
-static inline size_t ntfs_copy_from_user(struct page **pages,
- unsigned nr_pages, unsigned ofs, const char __user *buf,
- size_t bytes)
-{
- struct page **last_page = pages + nr_pages;
- char *addr;
- size_t total = 0;
- unsigned len;
- int left;
-
- do {
- len = PAGE_CACHE_SIZE - ofs;
- if (len > bytes)
- len = bytes;
- addr = kmap_atomic(*pages);
- left = __copy_from_user_inatomic(addr + ofs, buf, len);
- kunmap_atomic(addr);
- if (unlikely(left)) {
- /* Do it the slow way. */
- addr = kmap(*pages);
- left = __copy_from_user(addr + ofs, buf, len);
- kunmap(*pages);
- if (unlikely(left))
- goto err_out;
- }
- total += len;
- bytes -= len;
- if (!bytes)
- break;
- buf += len;
- ofs = 0;
- } while (++pages < last_page);
-out:
- return total;
-err_out:
- total += len - left;
- /* Zero the rest of the target like __copy_from_user(). */
- while (++pages < last_page) {
- bytes -= len;
- if (!bytes)
- break;
- len = PAGE_CACHE_SIZE;
- if (len > bytes)
- len = bytes;
- zero_user(*pages, 0, len);
- }
- goto out;
-}
-
-static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr,
- const struct iovec *iov, size_t iov_ofs, size_t bytes)
-{
- size_t total = 0;
-
- while (1) {
- const char __user *buf = iov->iov_base + iov_ofs;
- unsigned len;
- size_t left;
-
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- left = __copy_from_user_inatomic(vaddr, buf, len);
- total += len;
- bytes -= len;
- vaddr += len;
- if (unlikely(left)) {
- total -= left;
- break;
- }
- if (!bytes)
- break;
- iov++;
- iov_ofs = 0;
- }
- return total;
-}
-
-static inline void ntfs_set_next_iovec(const struct iovec **iovp,
- size_t *iov_ofsp, size_t bytes)
-{
- const struct iovec *iov = *iovp;
- size_t iov_ofs = *iov_ofsp;
-
- while (bytes) {
- unsigned len;
-
- len = iov->iov_len - iov_ofs;
- if (len > bytes)
- len = bytes;
- bytes -= len;
- iov_ofs += len;
- if (iov->iov_len == iov_ofs) {
- iov++;
- iov_ofs = 0;
- }
- }
- *iovp = iov;
- *iov_ofsp = iov_ofs;
-}
-
-/*
- * This has the same side-effects and return value as ntfs_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s
- * single-segment behaviour.
- *
- * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when
- * atomic and when not atomic. This is ok because it calls
- * __copy_from_user_inatomic() and it is ok to call this when non-atomic. In
- * fact, the only difference between __copy_from_user_inatomic() and
- * __copy_from_user() is that the latter calls might_sleep() and the former
- * should not zero the tail of the buffer on error. And on many architectures
- * __copy_from_user_inatomic() is just defined to __copy_from_user() so it
- * makes no difference at all on those architectures.
- */
-static inline size_t ntfs_copy_from_user_iovec(struct page **pages,
- unsigned nr_pages, unsigned ofs, const struct iovec **iov,
- size_t *iov_ofs, size_t bytes)
-{
- struct page **last_page = pages + nr_pages;
- char *addr;
- size_t copied, len, total = 0;
-
- do {
- len = PAGE_CACHE_SIZE - ofs;
- if (len > bytes)
- len = bytes;
- addr = kmap_atomic(*pages);
- copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs,
- *iov, *iov_ofs, len);
- kunmap_atomic(addr);
- if (unlikely(copied != len)) {
- /* Do it the slow way. */
- addr = kmap(*pages);
- copied = __ntfs_copy_from_user_iovec_inatomic(addr +
- ofs, *iov, *iov_ofs, len);
- if (unlikely(copied != len))
- goto err_out;
- kunmap(*pages);
- }
- total += len;
- ntfs_set_next_iovec(iov, iov_ofs, len);
- bytes -= len;
- if (!bytes)
- break;
- ofs = 0;
- } while (++pages < last_page);
-out:
- return total;
-err_out:
- BUG_ON(copied > len);
- /* Zero the rest of the target like __copy_from_user(). */
- memset(addr + ofs + copied, 0, len - copied);
- kunmap(*pages);
- total += copied;
- ntfs_set_next_iovec(iov, iov_ofs, copied);
- while (++pages < last_page) {
- bytes -= len;
- if (!bytes)
- break;
- len = PAGE_CACHE_SIZE;
- if (len > bytes)
- len = bytes;
- zero_user(*pages, 0, len);
- }
- goto out;
-}
-
static inline void ntfs_flush_dcache_pages(struct page **pages,
unsigned nr_pages)
{
@@ -1762,86 +1693,83 @@ err_out:
return err;
}
-static void ntfs_write_failed(struct address_space *mapping, loff_t to)
+/*
+ * Copy as much as we can into the pages and return the number of bytes which
+ * were successfully copied. If a fault is encountered then clear the pages
+ * out to (ofs + bytes) and return the number of bytes which were copied.
+ */
+static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
+ unsigned ofs, struct iov_iter *i, size_t bytes)
{
- struct inode *inode = mapping->host;
+ struct page **last_page = pages + nr_pages;
+ size_t total = 0;
+ struct iov_iter data = *i;
+ unsigned len, copied;
- if (to > inode->i_size) {
- truncate_pagecache(inode, inode->i_size);
- ntfs_truncate_vfs(inode);
- }
+ do {
+ len = PAGE_CACHE_SIZE - ofs;
+ if (len > bytes)
+ len = bytes;
+ copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs,
+ len);
+ total += copied;
+ bytes -= copied;
+ if (!bytes)
+ break;
+ iov_iter_advance(&data, copied);
+ if (copied < len)
+ goto err;
+ ofs = 0;
+ } while (++pages < last_page);
+out:
+ return total;
+err:
+ /* Zero the rest of the target like __copy_from_user(). */
+ len = PAGE_CACHE_SIZE - copied;
+ do {
+ if (len > bytes)
+ len = bytes;
+ zero_user(*pages, copied, len);
+ bytes -= len;
+ copied = 0;
+ len = PAGE_CACHE_SIZE;
+ } while (++pages < last_page);
+ goto out;
}
/**
- * ntfs_file_buffered_write -
- *
- * Locking: The vfs is holding ->i_mutex on the inode.
+ * ntfs_perform_write - perform buffered write to a file
+ * @file: file to write to
+ * @i: iov_iter with data to write
+ * @pos: byte offset in file at which to begin writing to
*/
-static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs,
- loff_t pos, loff_t *ppos, size_t count)
+static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
+ loff_t pos)
{
- struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *vi = mapping->host;
ntfs_inode *ni = NTFS_I(vi);
ntfs_volume *vol = ni->vol;
struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
struct page *cached_page = NULL;
- char __user *buf = NULL;
- s64 end, ll;
VCN last_vcn;
LCN lcn;
- unsigned long flags;
- size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */
- ssize_t status, written;
+ size_t bytes;
+ ssize_t status, written = 0;
unsigned nr_pages;
- int err;
- ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
- "pos 0x%llx, count 0x%lx.",
- vi->i_ino, (unsigned)le32_to_cpu(ni->type),
- (unsigned long long)pos, (unsigned long)count);
- if (unlikely(!count))
- return 0;
- BUG_ON(NInoMstProtected(ni));
- /*
- * If the attribute is not an index root and it is encrypted or
- * compressed, we cannot write to it yet. Note we need to check for
- * AT_INDEX_ALLOCATION since this is the type of both directory and
- * index inodes.
- */
- if (ni->type != AT_INDEX_ALLOCATION) {
- /* If file is encrypted, deny access, just like NT4. */
- if (NInoEncrypted(ni)) {
- /*
- * Reminder for later: Encrypted files are _always_
- * non-resident so that the content can always be
- * encrypted.
- */
- ntfs_debug("Denying write access to encrypted file.");
- return -EACCES;
- }
- if (NInoCompressed(ni)) {
- /* Only unnamed $DATA attribute can be compressed. */
- BUG_ON(ni->type != AT_DATA);
- BUG_ON(ni->name_len);
- /*
- * Reminder for later: If resident, the data is not
- * actually compressed. Only on the switch to non-
- * resident does compression kick in. This is in
- * contrast to encrypted files (see above).
- */
- ntfs_error(vi->i_sb, "Writing to compressed files is "
- "not implemented yet. Sorry.");
- return -EOPNOTSUPP;
- }
- }
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+ "0x%llx, count 0x%lx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)pos,
+ (unsigned long)iov_iter_count(i));
/*
* If a previous ntfs_truncate() failed, repeat it and abort if it
* fails again.
*/
if (unlikely(NInoTruncateFailed(ni))) {
+ int err;
+
inode_dio_wait(vi);
err = ntfs_truncate(vi);
if (err || NInoTruncateFailed(ni)) {
@@ -1855,81 +1783,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
return err;
}
}
- /* The first byte after the write. */
- end = pos + count;
- /*
- * If the write goes beyond the allocated size, extend the allocation
- * to cover the whole of the write, rounded up to the nearest cluster.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (end > ll) {
- /* Extend the allocation without changing the data size. */
- ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
- if (likely(ll >= 0)) {
- BUG_ON(pos >= ll);
- /* If the extension was partial truncate the write. */
- if (end > ll) {
- ntfs_debug("Truncating write to inode 0x%lx, "
- "attribute type 0x%x, because "
- "the allocation was only "
- "partially extended.",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type));
- end = ll;
- count = ll - pos;
- }
- } else {
- err = ll;
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->allocated_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- /* Perform a partial write if possible or fail. */
- if (pos < ll) {
- ntfs_debug("Truncating write to inode 0x%lx, "
- "attribute type 0x%x, because "
- "extending the allocation "
- "failed (error code %i).",
- vi->i_ino, (unsigned)
- le32_to_cpu(ni->type), err);
- end = ll;
- count = ll - pos;
- } else {
- ntfs_error(vol->sb, "Cannot perform write to "
- "inode 0x%lx, attribute type "
- "0x%x, because extending the "
- "allocation failed (error "
- "code %i).", vi->i_ino,
- (unsigned)
- le32_to_cpu(ni->type), err);
- return err;
- }
- }
- }
- written = 0;
- /*
- * If the write starts beyond the initialized size, extend it up to the
- * beginning of the write and initialize all non-sparse space between
- * the old initialized size and the new one. This automatically also
- * increments the vfs inode->i_size to keep it above or equal to the
- * initialized_size.
- */
- read_lock_irqsave(&ni->size_lock, flags);
- ll = ni->initialized_size;
- read_unlock_irqrestore(&ni->size_lock, flags);
- if (pos > ll) {
- err = ntfs_attr_extend_initialized(ni, pos);
- if (err < 0) {
- ntfs_error(vol->sb, "Cannot perform write to inode "
- "0x%lx, attribute type 0x%x, because "
- "extending the initialized size "
- "failed (error code %i).", vi->i_ino,
- (unsigned)le32_to_cpu(ni->type), err);
- status = err;
- goto err_out;
- }
- }
/*
* Determine the number of pages per cluster for non-resident
* attributes.
@@ -1937,10 +1790,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
nr_pages = 1;
if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni))
nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT;
- /* Finally, perform the actual write. */
last_vcn = -1;
- if (likely(nr_segs == 1))
- buf = iov->iov_base;
do {
VCN vcn;
pgoff_t idx, start_idx;
@@ -1965,10 +1815,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
vol->cluster_size_bits, false);
up_read(&ni->runlist.lock);
if (unlikely(lcn < LCN_HOLE)) {
- status = -EIO;
if (lcn == LCN_ENOMEM)
status = -ENOMEM;
- else
+ else {
+ status = -EIO;
ntfs_error(vol->sb, "Cannot "
"perform write to "
"inode 0x%lx, "
@@ -1977,6 +1827,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
"is corrupt.",
vi->i_ino, (unsigned)
le32_to_cpu(ni->type));
+ }
break;
}
if (lcn == LCN_HOLE) {
@@ -1989,8 +1840,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
}
}
}
- if (bytes > count)
- bytes = count;
+ if (bytes > iov_iter_count(i))
+ bytes = iov_iter_count(i);
+again:
/*
* Bring in the user page(s) that we will copy from _first_.
* Otherwise there is a nasty deadlock on copying from the same
@@ -1999,10 +1851,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
* pages being swapped out between us bringing them into memory
* and doing the actual copying.
*/
- if (likely(nr_segs == 1))
- ntfs_fault_in_pages_readable(buf, bytes);
- else
- ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
+ if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
/* Get and lock @do_pages starting at index @start_idx. */
status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
pages, &cached_page);
@@ -2018,56 +1870,57 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
status = ntfs_prepare_pages_for_non_resident_write(
pages, do_pages, pos, bytes);
if (unlikely(status)) {
- loff_t i_size;
-
do {
unlock_page(pages[--do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
- /*
- * The write preparation may have instantiated
- * allocated space outside i_size. Trim this
- * off again. We can ignore any errors in this
- * case as we will just be waisting a bit of
- * allocated space, which is not a disaster.
- */
- i_size = i_size_read(vi);
- if (pos + bytes > i_size) {
- ntfs_write_failed(mapping, pos + bytes);
- }
break;
}
}
u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index;
- if (likely(nr_segs == 1)) {
- copied = ntfs_copy_from_user(pages + u, do_pages - u,
- ofs, buf, bytes);
- buf += copied;
- } else
- copied = ntfs_copy_from_user_iovec(pages + u,
- do_pages - u, ofs, &iov, &iov_ofs,
- bytes);
+ copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
+ i, bytes);
ntfs_flush_dcache_pages(pages + u, do_pages - u);
- status = ntfs_commit_pages_after_write(pages, do_pages, pos,
- bytes);
- if (likely(!status)) {
- written += copied;
- count -= copied;
- pos += copied;
- if (unlikely(copied != bytes))
- status = -EFAULT;
+ status = 0;
+ if (likely(copied == bytes)) {
+ status = ntfs_commit_pages_after_write(pages, do_pages,
+ pos, bytes);
+ if (!status)
+ status = bytes;
}
do {
unlock_page(pages[--do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
- if (unlikely(status))
+ if (unlikely(status < 0))
break;
- balance_dirty_pages_ratelimited(mapping);
+ copied = status;
cond_resched();
- } while (count);
-err_out:
- *ppos = pos;
+ if (unlikely(!copied)) {
+ size_t sc;
+
+ /*
+ * We failed to copy anything. Fall back to single
+ * segment length write.
+ *
+ * This is needed to avoid possible livelock in the
+ * case that all segments in the iov cannot be copied
+ * at once without a pagefault.
+ */
+ sc = iov_iter_single_seg_count(i);
+ if (bytes > sc)
+ bytes = sc;
+ goto again;
+ }
+ iov_iter_advance(i, copied);
+ pos += copied;
+ written += copied;
+ balance_dirty_pages_ratelimited(mapping);
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
+ } while (iov_iter_count(i));
if (cached_page)
page_cache_release(cached_page);
ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
@@ -2077,59 +1930,56 @@ err_out:
}
/**
- * ntfs_file_aio_write_nolock -
+ * ntfs_file_write_iter_nolock - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ *
+ * Basically the same as __generic_file_write_iter() except that it ends
+ * up calling ntfs_perform_write() instead of generic_perform_write() and that
+ * O_DIRECT is not implemented.
*/
-static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs, loff_t *ppos)
+static ssize_t ntfs_file_write_iter_nolock(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- loff_t pos;
- size_t count; /* after file limit checks */
- ssize_t written, err;
+ loff_t pos = iocb->ki_pos;
+ ssize_t written = 0;
+ ssize_t err;
+ size_t count = iov_iter_count(from);
- count = iov_length(iov, nr_segs);
- pos = *ppos;
- /* We can write back this queue in page reclaim. */
- current->backing_dev_info = inode_to_bdi(inode);
- written = 0;
- err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
- if (err)
- goto out;
- if (!count)
- goto out;
- err = file_remove_suid(file);
- if (err)
- goto out;
- err = file_update_time(file);
- if (err)
- goto out;
- written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
- count);
-out:
+ err = ntfs_prepare_file_for_write(file, &pos, &count);
+ if (count && !err) {
+ iov_iter_truncate(from, count);
+ written = ntfs_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ }
current->backing_dev_info = NULL;
return written ? written : err;
}
/**
- * ntfs_file_aio_write -
+ * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Basically the same as generic_file_write_iter() except that it ends up
+ * calling ntfs_file_write_iter_nolock() instead of
+ * __generic_file_write_iter().
*/
-static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
+ struct inode *vi = file_inode(file);
ssize_t ret;
- BUG_ON(iocb->ki_pos != pos);
-
- mutex_lock(&inode->i_mutex);
- ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
- mutex_unlock(&inode->i_mutex);
+ mutex_lock(&vi->i_mutex);
+ ret = ntfs_file_write_iter_nolock(iocb, from);
+ mutex_unlock(&vi->i_mutex);
if (ret > 0) {
- int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ ssize_t err;
+
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
@@ -2197,37 +2047,17 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
#endif /* NTFS_RW */
const struct file_operations ntfs_file_ops = {
- .llseek = generic_file_llseek, /* Seek inside file. */
- .read = new_sync_read, /* Read from file. */
- .read_iter = generic_file_read_iter, /* Async read from file. */
+ .llseek = generic_file_llseek,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
#ifdef NTFS_RW
- .write = do_sync_write, /* Write to file. */
- .aio_write = ntfs_file_aio_write, /* Async write to file. */
- /*.release = ,*/ /* Last file is closed. See
- fs/ext2/file.c::
- ext2_release_file() for
- how to use this to discard
- preallocated space for
- write opened files. */
- .fsync = ntfs_file_fsync, /* Sync a file to disk. */
- /*.aio_fsync = ,*/ /* Sync all outstanding async
- i/o operations on a
- kiocb. */
+ .write = new_sync_write,
+ .write_iter = ntfs_file_write_iter,
+ .fsync = ntfs_file_fsync,
#endif /* NTFS_RW */
- /*.ioctl = ,*/ /* Perform function on the
- mounted filesystem. */
- .mmap = generic_file_mmap, /* Mmap file. */
- .open = ntfs_file_open, /* Open file. */
- .splice_read = generic_file_splice_read /* Zero-copy data send with
- the data source being on
- the ntfs partition. We do
- not need to care about the
- data destination. */
- /*.sendpage = ,*/ /* Zero-copy data send with
- the data destination being
- on the ntfs partition. We
- do not need to care about
- the data source. */
+ .mmap = generic_file_mmap,
+ .open = ntfs_file_open,
+ .splice_read = generic_file_splice_read,
};
const struct inode_operations ntfs_file_inode_ops = {
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 898b9949d363..1d0c21df0d80 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -28,7 +28,6 @@
#include <linux/quotaops.h>
#include <linux/slab.h>
#include <linux/log2.h>
-#include <linux/aio.h>
#include "aops.h"
#include "attrib.h"
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 44db1808cdb5..e1bf18c5d25e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,6 +29,7 @@
#include <linux/mpage.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/uio.h>
#include <cluster/masklog.h>
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 6cae155d54df..dd59599b022d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -22,7 +22,7 @@
#ifndef OCFS2_AOPS_H
#define OCFS2_AOPS_H
-#include <linux/aio.h>
+#include <linux/fs.h>
handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
struct page *page,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 46e0d4e857c7..91f03ce98108 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2280,7 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
file->f_path.dentry->d_name.name,
(unsigned int)from->nr_segs); /* GRRRRR */
- if (iocb->ki_nbytes == 0)
+ if (count == 0)
return 0;
appending = file->f_flags & O_APPEND ? 1 : 0;
@@ -2330,8 +2330,7 @@ relock:
}
can_do_direct = direct_io;
- ret = ocfs2_prepare_inode_for_write(file, ppos,
- iocb->ki_nbytes, appending,
+ ret = ocfs2_prepare_inode_for_write(file, ppos, count, appending,
&can_do_direct, &has_refcount);
if (ret < 0) {
mlog_errno(ret);
@@ -2339,8 +2338,7 @@ relock:
}
if (direct_io && !is_sync_kiocb(iocb))
- unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
- *ppos);
+ unaligned_dio = ocfs2_is_io_unaligned(inode, count, *ppos);
/*
* We can't complete the direct I/O as requested, fall back to
@@ -2394,7 +2392,6 @@ relock:
/*
* for completing the rest of the request.
*/
- *ppos += written;
count -= written;
written_buffered = generic_perform_write(file, from, *ppos);
/*
@@ -2409,7 +2406,6 @@ relock:
goto out_dio;
}
- iocb->ki_pos = *ppos + written_buffered;
/* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
* semantics.
@@ -2418,6 +2414,7 @@ relock:
ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
endbyte);
if (ret == 0) {
+ iocb->ki_pos = *ppos + written_buffered;
written += written_buffered;
invalidate_mapping_pages(mapping,
*ppos >> PAGE_CACHE_SHIFT,
@@ -2440,10 +2437,14 @@ out_dio:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
+ if (unlikely(written <= 0))
+ goto no_sync;
+
if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
((file->f_flags & O_DIRECT) && !direct_io)) {
- ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
- *ppos + count - 1);
+ ret = filemap_fdatawrite_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
if (ret < 0)
written = ret;
@@ -2454,10 +2455,12 @@ out_dio:
}
if (!ret)
- ret = filemap_fdatawait_range(file->f_mapping, *ppos,
- *ppos + count - 1);
+ ret = filemap_fdatawait_range(file->f_mapping,
+ iocb->ki_pos - written,
+ iocb->ki_pos - 1);
}
+no_sync:
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8490c64d34fe..460c6c37e683 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -502,7 +502,7 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
{
- if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
return 1;
return 0;
}
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 20e37a3ed26f..db64ce2d4667 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -102,11 +102,11 @@
| OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
| OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
| OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG \
- | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
+ | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
+ | OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
- | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
- | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
/*
* Heartbeat-only devices are missing journals and other files. The
@@ -179,6 +179,11 @@
#define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO 0x4000
/*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO 0x8000
+
+/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
*/
@@ -200,10 +205,6 @@
#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
-/*
- * Append Direct IO support
- */
-#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008
/* The byte offset of the first backup block will be 1G.
* The following will be 4G, 16G, 64G, 256G and 1T.
diff --git a/fs/open.c b/fs/open.c
index 33f9cbf2610b..6a83c47d5904 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -570,6 +570,7 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
uid = make_kuid(current_user_ns(), user);
gid = make_kgid(current_user_ns(), group);
+retry_deleg:
newattrs.ia_valid = ATTR_CTIME;
if (user != (uid_t) -1) {
if (!uid_valid(uid))
@@ -586,7 +587,6 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
-retry_deleg:
mutex_lock(&inode->i_mutex);
error = security_path_chown(path, uid, gid);
if (!error)
@@ -988,9 +988,6 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
return ERR_PTR(err);
if (flags & O_CREAT)
return ERR_PTR(-EINVAL);
- if (!filename && (flags & O_DIRECTORY))
- if (!dentry->d_inode->i_op->lookup)
- return ERR_PTR(-ENOTDIR);
return do_file_open_root(dentry, mnt, filename, &op);
}
EXPORT_SYMBOL(file_open_root);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b90952f528b1..5f0d1993e6e3 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -529,8 +529,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data)
{
struct ovl_fs *ufs = sb->s_fs_info;
- if (!(*flags & MS_RDONLY) &&
- (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)))
+ if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
return -EROFS;
return 0;
@@ -615,9 +614,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
break;
default:
+ pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
return -EINVAL;
}
}
+
+ /* Workdir is useless in non-upper mount */
+ if (!config->upperdir && config->workdir) {
+ pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+ config->workdir);
+ kfree(config->workdir);
+ config->workdir = NULL;
+ }
+
return 0;
}
@@ -837,7 +846,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_stack_depth = 0;
if (ufs->config.upperdir) {
- /* FIXME: workdir is not needed for a R/O mount */
if (!ufs->config.workdir) {
pr_err("overlayfs: missing 'workdir'\n");
goto out_free_config;
@@ -847,6 +855,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto out_free_config;
+ /* Upper fs should not be r/o */
+ if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) {
+ pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+ err = -EINVAL;
+ goto out_put_upperpath;
+ }
+
err = ovl_mount_dir(ufs->config.workdir, &workpath);
if (err)
goto out_put_upperpath;
@@ -869,8 +884,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
err = -EINVAL;
stacklen = ovl_split_lowerdirs(lowertmp);
- if (stacklen > OVL_MAX_STACK)
+ if (stacklen > OVL_MAX_STACK) {
+ pr_err("overlayfs: too many lower directries, limit is %d\n",
+ OVL_MAX_STACK);
goto out_free_lowertmp;
+ } else if (!ufs->config.upperdir && stacklen == 1) {
+ pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
+ goto out_free_lowertmp;
+ }
stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
if (!stack)
@@ -932,8 +953,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
ufs->numlower++;
}
- /* If the upper fs is r/o or nonexistent, we mark overlayfs r/o too */
- if (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY))
+ /* If the upper fs is nonexistent, we mark overlayfs r/o too */
+ if (!ufs->upper_mnt)
sb->s_flags |= MS_RDONLY;
sb->s_d_op = &ovl_dentry_operations;
diff --git a/fs/pipe.c b/fs/pipe.c
index 21981e58e2a6..2d084f2d0b83 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,7 +21,6 @@
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
-#include <linux/aio.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 956b75d61809..6dee68d013ff 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1325,6 +1325,9 @@ out:
static int pagemap_open(struct inode *inode, struct file *file)
{
+ /* do not disclose physical addresses: attack vector */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
"to stop being page-shift some time soon. See the "
"linux/Documentation/vm/pagemap.txt for details.\n");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 39d1373128e9..44a549beeafa 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -539,6 +539,9 @@ static int ramoops_probe(struct platform_device *pdev)
mem_address = pdata->mem_address;
record_size = pdata->record_size;
dump_oops = pdata->dump_oops;
+ ramoops_console_size = pdata->console_size;
+ ramoops_pmsg_size = pdata->pmsg_size;
+ ramoops_ftrace_size = pdata->ftrace_size;
pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n",
cxt->size, (unsigned long long)cxt->phys_addr,
diff --git a/fs/read_write.c b/fs/read_write.c
index 8e1b68786d66..69128b378646 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -9,7 +9,6 @@
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
-#include <linux/aio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
@@ -343,13 +342,10 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = iov_iter_count(iter);
iter->type |= READ;
ret = file->f_op->read_iter(&kiocb, iter);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
-
+ BUG_ON(ret == -EIOCBQUEUED);
if (ret > 0)
*ppos = kiocb.ki_pos;
return ret;
@@ -366,13 +362,10 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = iov_iter_count(iter);
iter->type |= WRITE;
ret = file->f_op->write_iter(&kiocb, iter);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
-
+ BUG_ON(ret == -EIOCBQUEUED);
if (ret > 0)
*ppos = kiocb.ki_pos;
return ret;
@@ -426,11 +419,9 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
+ BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
@@ -446,12 +437,10 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
iov_iter_init(&iter, READ, &iov, 1, len);
ret = filp->f_op->read_iter(&kiocb, &iter);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
+ BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
@@ -510,11 +499,9 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
+ BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
@@ -530,12 +517,10 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
iov_iter_init(&iter, WRITE, &iov, 1, len);
ret = filp->f_op->write_iter(&kiocb, &iter);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
+ BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
@@ -710,60 +695,47 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
}
EXPORT_SYMBOL(iov_shorten);
-static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
- unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
+static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
+ loff_t *ppos, iter_fn_t fn)
{
struct kiocb kiocb;
- struct iov_iter iter;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
- iov_iter_init(&iter, rw, iov, nr_segs, len);
- ret = fn(&kiocb, &iter);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
+ ret = fn(&kiocb, iter);
+ BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
-static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
- unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
+static ssize_t do_sync_readv_writev(struct file *filp, struct iov_iter *iter,
+ loff_t *ppos, iov_fn_t fn)
{
struct kiocb kiocb;
ssize_t ret;
init_sync_kiocb(&kiocb, filp);
kiocb.ki_pos = *ppos;
- kiocb.ki_nbytes = len;
- ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
- if (ret == -EIOCBQUEUED)
- ret = wait_on_sync_kiocb(&kiocb);
+ ret = fn(&kiocb, iter->iov, iter->nr_segs, kiocb.ki_pos);
+ BUG_ON(ret == -EIOCBQUEUED);
*ppos = kiocb.ki_pos;
return ret;
}
/* Do it by hand, with file-ops */
-static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
+static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
+ loff_t *ppos, io_fn_t fn)
{
- struct iovec *vector = iov;
ssize_t ret = 0;
- while (nr_segs > 0) {
- void __user *base;
- size_t len;
+ while (iov_iter_count(iter)) {
+ struct iovec iovec = iov_iter_iovec(iter);
ssize_t nr;
- base = vector->iov_base;
- len = vector->iov_len;
- vector++;
- nr_segs--;
-
- nr = fn(filp, base, len, ppos);
+ nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos);
if (nr < 0) {
if (!ret)
@@ -771,8 +743,9 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
break;
}
ret += nr;
- if (nr != len)
+ if (nr != iovec.iov_len)
break;
+ iov_iter_advance(iter, nr);
}
return ret;
@@ -863,17 +836,20 @@ static ssize_t do_readv_writev(int type, struct file *file,
size_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
+ struct iov_iter iter;
ssize_t ret;
io_fn_t fn;
iov_fn_t fnv;
iter_fn_t iter_fn;
- ret = rw_copy_check_uvector(type, uvector, nr_segs,
- ARRAY_SIZE(iovstack), iovstack, &iov);
- if (ret <= 0)
- goto out;
+ ret = import_iovec(type, uvector, nr_segs,
+ ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ return ret;
- tot_len = ret;
+ tot_len = iov_iter_count(&iter);
+ if (!tot_len)
+ goto out;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
@@ -891,20 +867,17 @@ static ssize_t do_readv_writev(int type, struct file *file,
}
if (iter_fn)
- ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
- pos, iter_fn);
+ ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
else if (fnv)
- ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
- pos, fnv);
+ ret = do_sync_readv_writev(file, &iter, pos, fnv);
else
- ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+ ret = do_loop_readv_writev(file, &iter, pos, fn);
if (type != READ)
file_end_write(file);
out:
- if (iov != iovstack)
- kfree(iov);
+ kfree(iov);
if ((ret + (type == READ)) > 0) {
if (type == READ)
fsnotify_access(file);
@@ -1043,17 +1016,20 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
compat_ssize_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
+ struct iov_iter iter;
ssize_t ret;
io_fn_t fn;
iov_fn_t fnv;
iter_fn_t iter_fn;
- ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
- UIO_FASTIOV, iovstack, &iov);
- if (ret <= 0)
- goto out;
+ ret = compat_import_iovec(type, uvector, nr_segs,
+ UIO_FASTIOV, &iov, &iter);
+ if (ret < 0)
+ return ret;
- tot_len = ret;
+ tot_len = iov_iter_count(&iter);
+ if (!tot_len)
+ goto out;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
@@ -1071,20 +1047,17 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
}
if (iter_fn)
- ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
- pos, iter_fn);
+ ret = do_iter_readv_writev(file, &iter, pos, iter_fn);
else if (fnv)
- ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
- pos, fnv);
+ ret = do_sync_readv_writev(file, &iter, pos, fnv);
else
- ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
+ ret = do_loop_readv_writev(file, &iter, pos, fn);
if (type != READ)
file_end_write(file);
out:
- if (iov != iovstack)
- kfree(iov);
+ kfree(iov);
if ((ret + (type == READ)) > 0) {
if (type == READ)
fsnotify_access(file);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index e72401e1f995..9312b7842e03 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,7 +18,7 @@
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/swap.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
diff --git a/fs/splice.c b/fs/splice.c
index 7968da96bebb..41cbb16299e0 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -32,7 +32,6 @@
#include <linux/gfp.h>
#include <linux/socket.h>
#include <linux/compat.h>
-#include <linux/aio.h>
#include "internal.h"
/*
@@ -1534,34 +1533,29 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
- ssize_t count;
pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
- ret = rw_copy_check_uvector(READ, uiov, nr_segs,
- ARRAY_SIZE(iovstack), iovstack, &iov);
- if (ret <= 0)
- goto out;
-
- count = ret;
- iov_iter_init(&iter, READ, iov, nr_segs, count);
+ ret = import_iovec(READ, uiov, nr_segs,
+ ARRAY_SIZE(iovstack), &iov, &iter);
+ if (ret < 0)
+ return ret;
+ sd.total_len = iov_iter_count(&iter);
sd.len = 0;
- sd.total_len = count;
sd.flags = flags;
sd.u.data = &iter;
sd.pos = 0;
- pipe_lock(pipe);
- ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
- pipe_unlock(pipe);
-
-out:
- if (iov != iovstack)
- kfree(iov);
+ if (sd.total_len) {
+ pipe_lock(pipe);
+ ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
+ pipe_unlock(pipe);
+ }
+ kfree(iov);
return ret;
}
diff --git a/fs/stat.c b/fs/stat.c
index ae0c3cef9927..19636af5e75c 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -66,7 +66,7 @@ int vfs_getattr(struct path *path, struct kstat *stat)
{
int retval;
- retval = security_inode_getattr(path->mnt, path->dentry);
+ retval = security_inode_getattr(path);
if (retval)
return retval;
return vfs_getattr_nosec(path, stat);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 2554d8835b48..b400c04371f0 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -41,7 +41,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
if (grp->attrs) {
for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
- umode_t mode = 0;
+ umode_t mode = (*attr)->mode;
/*
* In update mode, we're changing the permissions or
@@ -55,9 +55,14 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
if (!mode)
continue;
}
+
+ WARN(mode & ~(SYSFS_PREALLOC | 0664),
+ "Attribute %s: Invalid permissions 0%o\n",
+ (*attr)->name, mode);
+
+ mode &= SYSFS_PREALLOC | 0664;
error = sysfs_add_file_mode_ns(parent, *attr, false,
- (*attr)->mode | mode,
- NULL);
+ mode, NULL);
if (unlikely(error))
break;
}
diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile
new file mode 100644
index 000000000000..82fa35b656c4
--- /dev/null
+++ b/fs/tracefs/Makefile
@@ -0,0 +1,4 @@
+tracefs-objs := inode.o
+
+obj-$(CONFIG_TRACING) += tracefs.o
+
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
new file mode 100644
index 000000000000..d92bdf3b079a
--- /dev/null
+++ b/fs/tracefs/inode.c
@@ -0,0 +1,650 @@
+/*
+ * inode.c - part of tracefs, a pseudo file system for activating tracing
+ *
+ * Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com>
+ *
+ * Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * tracefs is the file system that is used by the tracing infrastructure.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/kobject.h>
+#include <linux/namei.h>
+#include <linux/tracefs.h>
+#include <linux/fsnotify.h>
+#include <linux/seq_file.h>
+#include <linux/parser.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+
+#define TRACEFS_DEFAULT_MODE 0700
+
+static struct vfsmount *tracefs_mount;
+static int tracefs_mount_count;
+static bool tracefs_registered;
+
+/*
+ * Fallback read handler: ignores every argument and returns 0, so a read
+ * on such a file transfers no data.
+ */
+static ssize_t default_read_file(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return 0;
+}
+
+/*
+ * Fallback write handler: never looks at the buffer and returns @count,
+ * i.e. it reports the whole request as written.
+ */
+static ssize_t default_write_file(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return count;
+}
+
+/*
+ * Default file operations. tracefs_create_file() installs these when the
+ * caller passes a NULL @fops.
+ */
+static const struct file_operations tracefs_file_operations = {
+ .read = default_read_file,
+ .write = default_write_file,
+ .open = simple_open,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Callbacks run by tracefs_syscall_mkdir() and tracefs_syscall_rmdir().
+ * They are filled in by tracefs_create_instance_dir() and receive only the
+ * name of the directory being created or removed.
+ */
+static struct tracefs_dir_ops {
+ int (*mkdir)(const char *name);
+ int (*rmdir)(const char *name);
+} tracefs_ops;
+
+/*
+ * Return a kmalloc()ed, NUL-terminated copy of @dentry's name, or NULL if
+ * the allocation fails.  The caller owns the buffer and must kfree() it.
+ */
+static char *get_dname(struct dentry *dentry)
+{
+	int name_len = dentry->d_name.len;
+	const char *src = dentry->d_name.name;
+	char *copy = kmalloc(name_len + 1, GFP_KERNEL);
+
+	if (copy) {
+		memcpy(copy, src, name_len);
+		copy[name_len] = 0;
+	}
+	return copy;
+}
+
+/*
+ * ->mkdir() handler installed through tracefs_dir_inode_operations.
+ *
+ * Copies the name of @dentry and passes it to the registered
+ * tracefs_ops.mkdir() callback.  @inode's i_mutex is dropped around the
+ * callback and taken again before returning.  @mode is not used.
+ *
+ * Returns the callback's result, or -ENOMEM if the name cannot be copied.
+ */
+static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode)
+{
+ char *name;
+ int ret;
+
+ name = get_dname(dentry);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * The mkdir call can call the generic functions that create
+ * the files within the tracefs system. It is up to the individual
+ * mkdir routine to handle races.
+ */
+ mutex_unlock(&inode->i_mutex);
+ ret = tracefs_ops.mkdir(name);
+ mutex_lock(&inode->i_mutex);
+
+ /* the callback only borrowed the copy; release it here */
+ kfree(name);
+
+ return ret;
+}
+
+/*
+ * ->rmdir() handler installed through tracefs_dir_inode_operations.
+ *
+ * Copies the name of @dentry and passes it to the registered
+ * tracefs_ops.rmdir() callback.  Both @inode's i_mutex and the i_mutex of
+ * the directory being removed are dropped around the callback and taken
+ * again, in that order, before returning.
+ *
+ * Returns the callback's result, or -ENOMEM if the name cannot be copied.
+ */
+static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
+{
+ char *name;
+ int ret;
+
+ name = get_dname(dentry);
+ if (!name)
+ return -ENOMEM;
+
+ /*
+ * The rmdir call can call the generic functions that create
+ * the files within the tracefs system. It is up to the individual
+ * rmdir routine to handle races.
+ * This time we need to unlock not only the parent (inode) but
+ * also the directory that is being deleted.
+ */
+ mutex_unlock(&inode->i_mutex);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+ ret = tracefs_ops.rmdir(name);
+
+ /* re-take both locks, parent first, before handing control back */
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock(&dentry->d_inode->i_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+/*
+ * Inode operations for the instances directory: tracefs_create_instance_dir()
+ * passes this table to __create_dir(), routing mkdir/rmdir to the
+ * tracefs_syscall_*() handlers above.
+ */
+static const struct inode_operations tracefs_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .mkdir = tracefs_syscall_mkdir,
+ .rmdir = tracefs_syscall_rmdir,
+};
+
+/*
+ * Allocate a new inode on @sb, give it the next inode number and stamp
+ * atime/mtime/ctime with the current time.  Returns NULL if new_inode()
+ * fails.
+ */
+static struct inode *tracefs_get_inode(struct super_block *sb)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (!inode)
+		return NULL;
+
+	inode->i_ino = get_next_ino();
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	return inode;
+}
+
+/* Mount options as filled in by tracefs_parse_options(). */
+struct tracefs_mount_opts {
+ kuid_t uid; /* from "uid="; copied to the root inode's i_uid */
+ kgid_t gid; /* from "gid="; copied to the root inode's i_gid */
+ umode_t mode; /* from "mode=", masked with S_IALLUGO */
+};
+
+/* Token ids used by the tokens[] table and by tracefs_parse_options(). */
+enum {
+ Opt_uid,
+ Opt_gid,
+ Opt_mode,
+ Opt_err /* paired with the NULL pattern that ends tokens[] */
+};
+
+/* Patterns that match_token() tries against each mount option. */
+static const match_table_t tokens = {
+ {Opt_uid, "uid=%u"},
+ {Opt_gid, "gid=%u"},
+ {Opt_mode, "mode=%o"},
+ {Opt_err, NULL}
+};
+
+/* Per-superblock data; trace_fill_super() stores it in sb->s_fs_info. */
+struct tracefs_fs_info {
+ struct tracefs_mount_opts mount_opts;
+};
+
+/*
+ * Parse the comma-separated mount option string @data into @opts.
+ *
+ * The mode is reset to TRACEFS_DEFAULT_MODE before parsing; uid and gid are
+ * written only when the matching option is present.  Empty items and
+ * unrecognised options are skipped silently.
+ *
+ * Returns 0 on success, or -EINVAL when a recognised option carries a value
+ * that cannot be parsed or does not map to a valid id.
+ */
+static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
+{
+	substring_t args[MAX_OPT_ARGS];
+	char *item;
+
+	opts->mode = TRACEFS_DEFAULT_MODE;
+
+	for (item = strsep(&data, ","); item != NULL;
+	     item = strsep(&data, ",")) {
+		int value;
+		kuid_t uid;
+		kgid_t gid;
+
+		if (*item == '\0')
+			continue;
+
+		switch (match_token(item, tokens, args)) {
+		case Opt_uid:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			uid = make_kuid(current_user_ns(), value);
+			if (!uid_valid(uid))
+				return -EINVAL;
+			opts->uid = uid;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &value))
+				return -EINVAL;
+			gid = make_kgid(current_user_ns(), value);
+			if (!gid_valid(gid))
+				return -EINVAL;
+			opts->gid = gid;
+			break;
+		case Opt_mode:
+			if (match_octal(&args[0], &value))
+				return -EINVAL;
+			opts->mode = value & S_IALLUGO;
+			break;
+		/*
+		 * Anything else is deliberately not reported: tracefs has
+		 * traditionally ignored mount options it does not know.
+		 */
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the parsed mount options stored in @sb's tracefs_fs_info onto the
+ * root inode: permission bits, owner and group.  Always returns 0.
+ */
+static int tracefs_apply_options(struct super_block *sb)
+{
+	struct tracefs_fs_info *fsi = sb->s_fs_info;
+	struct tracefs_mount_opts *opts = &fsi->mount_opts;
+	struct inode *root = sb->s_root->d_inode;
+
+	/* replace only the permission bits, keep the file type */
+	root->i_mode = (root->i_mode & ~S_IALLUGO) | opts->mode;
+	root->i_uid = opts->uid;
+	root->i_gid = opts->gid;
+
+	return 0;
+}
+
+/*
+ * ->remount_fs() handler: re-parse @data into the superblock's stored
+ * options and, if parsing succeeded, apply them to the root inode.
+ * Returns 0 or the error from tracefs_parse_options().  @flags is unused.
+ */
+static int tracefs_remount(struct super_block *sb, int *flags, char *data)
+{
+	struct tracefs_fs_info *fsi = sb->s_fs_info;
+	int err;
+
+	sync_filesystem(sb);
+
+	err = tracefs_parse_options(data, &fsi->mount_opts);
+	if (!err)
+		tracefs_apply_options(sb);
+
+	return err;
+}
+
+/*
+ * ->show_options() handler: print each mount option that differs from its
+ * default (root uid, root gid, TRACEFS_DEFAULT_MODE) to @m.  Always
+ * returns 0.
+ */
+static int tracefs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct tracefs_fs_info *info = root->d_sb->s_fs_info;
+	struct tracefs_mount_opts *cur = &info->mount_opts;
+	bool uid_is_root = uid_eq(cur->uid, GLOBAL_ROOT_UID);
+	bool gid_is_root = gid_eq(cur->gid, GLOBAL_ROOT_GID);
+
+	if (!uid_is_root)
+		seq_printf(m, ",uid=%u",
+			   from_kuid_munged(&init_user_ns, cur->uid));
+	if (!gid_is_root)
+		seq_printf(m, ",gid=%u",
+			   from_kgid_munged(&init_user_ns, cur->gid));
+	if (cur->mode != TRACEFS_DEFAULT_MODE)
+		seq_printf(m, ",mode=%o", cur->mode);
+
+	return 0;
+}
+
+/* Superblock operations; installed on sb->s_op by trace_fill_super(). */
+static const struct super_operations tracefs_super_operations = {
+ .statfs = simple_statfs,
+ .remount_fs = tracefs_remount,
+ .show_options = tracefs_show_options,
+};
+
+/*
+ * Fill in a new tracefs superblock: allocate the per-sb info, parse the
+ * mount options in @data, build the root with simple_fill_super(), install
+ * the super operations and apply the options to the root inode.
+ *
+ * Returns 0 on success.  On any failure the info structure is freed,
+ * sb->s_fs_info is cleared and the error is returned.  @silent is unused.
+ */
+static int trace_fill_super(struct super_block *sb, void *data, int silent)
+{
+	static struct tree_descr trace_files[] = {{""}};
+	struct tracefs_fs_info *fsi;
+	int err;
+
+	save_mount_options(sb, data);
+
+	fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);
+	sb->s_fs_info = fsi;
+
+	if (fsi)
+		err = tracefs_parse_options(data, &fsi->mount_opts);
+	else
+		err = -ENOMEM;
+
+	if (!err)
+		err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files);
+
+	if (err) {
+		/* kfree(NULL) is fine for the allocation-failure case */
+		kfree(fsi);
+		sb->s_fs_info = NULL;
+		return err;
+	}
+
+	sb->s_op = &tracefs_super_operations;
+	tracefs_apply_options(sb);
+
+	return 0;
+}
+
+/*
+ * ->mount() handler: delegates to mount_single() with trace_fill_super()
+ * as the fill callback.  @dev_name is not used.
+ */
+static struct dentry *trace_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ return mount_single(fs_type, flags, data, trace_fill_super);
+}
+
+/*
+ * The "tracefs" filesystem type: registered by tracefs_init() and pinned
+ * by start_creating() through simple_pin_fs().
+ */
+static struct file_system_type trace_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "tracefs",
+ .mount = trace_mount,
+ .kill_sb = kill_litter_super,
+};
+MODULE_ALIAS_FS("tracefs");
+
+/*
+ * Pin the tracefs mount and look up @name under @parent (the filesystem
+ * root when @parent is NULL).
+ *
+ * On success the returned negative dentry comes back with the parent's
+ * i_mutex held and one reference on the pinned mount; the caller finishes
+ * with end_creating() or failed_creating().
+ *
+ * On failure an ERR_PTR() is returned (-EEXIST if the name already exists)
+ * with the mutex released and the mount reference dropped again, so the
+ * caller has nothing left to undo.
+ */
+static struct dentry *start_creating(const char *name, struct dentry *parent)
+{
+	struct dentry *dentry;
+	int error;
+
+	pr_debug("tracefs: creating file '%s'\n", name);
+
+	error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
+			      &tracefs_mount_count);
+	if (error)
+		return ERR_PTR(error);
+
+	/* If the parent is not specified, we create it in the root.
+	 * We need the root dentry to do this, which is in the super
+	 * block. A pointer to that is in the struct vfsmount that we
+	 * have around.
+	 */
+	if (!parent)
+		parent = tracefs_mount->mnt_root;
+
+	mutex_lock(&parent->d_inode->i_mutex);
+	dentry = lookup_one_len(name, parent, strlen(name));
+	if (!IS_ERR(dentry) && dentry->d_inode) {
+		dput(dentry);
+		dentry = ERR_PTR(-EEXIST);
+	}
+
+	if (IS_ERR(dentry)) {
+		mutex_unlock(&parent->d_inode->i_mutex);
+		/*
+		 * Callers simply return NULL on error, so the pin taken
+		 * above must be released here or it is leaked.
+		 */
+		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+	}
+
+	return dentry;
+}
+
+/*
+ * Undo a successful start_creating(): unlock the parent directory, drop
+ * the dentry and release the mount pin.  Always returns NULL so callers
+ * can return the result directly.
+ */
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+
+	mutex_unlock(&dir->i_mutex);
+	dput(dentry);
+	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+
+	return NULL;
+}
+
+/*
+ * Finish a successful creation: unlock the parent directory's i_mutex
+ * (taken in start_creating()) and hand @dentry back unchanged.
+ */
+static struct dentry *end_creating(struct dentry *dentry)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+
+	mutex_unlock(&dir->i_mutex);
+	return dentry;
+}
+
+/**
+ * tracefs_create_file - create a file in the tracefs filesystem
+ * @name: name of the file to create.
+ * @mode: permission bits for the file.  If no file type is set, S_IFREG is
+ *        added; any type other than a regular file triggers BUG_ON().
+ * @parent: dentry of the directory that will hold the file, or NULL to
+ *          create the file in the root of the tracefs filesystem.
+ * @data: opaque pointer stored in the new inode's i_private for the
+ *        caller to retrieve later (e.g. at open() time).
+ * @fops: file operations for the file; when NULL the default tracefs
+ *        file operations are used.
+ *
+ * This is the basic "create a file" helper for tracefs.  To create a
+ * directory use tracefs_create_dir() instead.
+ *
+ * Returns the dentry of the new file on success.  That dentry must later
+ * be passed to tracefs_remove(); nothing is cleaned up automatically when
+ * a module is unloaded.  Returns %NULL if the name already exists or any
+ * other step fails.
+ *
+ * If tracefs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_file(const char *name, umode_t mode,
+				   struct dentry *parent, void *data,
+				   const struct file_operations *fops)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	if ((mode & S_IFMT) == 0)
+		mode |= S_IFREG;
+	BUG_ON(!S_ISREG(mode));
+
+	dentry = start_creating(name, parent);
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = tracefs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = mode;
+	if (fops)
+		inode->i_fop = fops;
+	else
+		inode->i_fop = &tracefs_file_operations;
+	inode->i_private = data;
+
+	d_instantiate(dentry, inode);
+	fsnotify_create(dentry->d_parent->d_inode, dentry);
+
+	return end_creating(dentry);
+}
+
+/*
+ * Create directory @name under @parent (root when NULL) using @ops as its
+ * inode operations.  The directory is created with mode
+ * S_IRWXU | S_IRUGO | S_IXUGO.  Returns the new dentry, or NULL on failure.
+ */
+static struct dentry *__create_dir(const char *name, struct dentry *parent,
+				   const struct inode_operations *ops)
+{
+	struct dentry *dentry;
+	struct inode *dir_inode;
+
+	dentry = start_creating(name, parent);
+	if (IS_ERR(dentry))
+		return NULL;
+
+	dir_inode = tracefs_get_inode(dentry->d_sb);
+	if (unlikely(!dir_inode))
+		return failed_creating(dentry);
+
+	dir_inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	dir_inode->i_op = ops;
+	dir_inode->i_fop = &simple_dir_operations;
+
+	/* a directory starts with i_nlink == 2: the extra link is "." */
+	inc_nlink(dir_inode);
+	d_instantiate(dentry, dir_inode);
+
+	/* bump the parent directory's link count as well */
+	inc_nlink(dentry->d_parent->d_inode);
+	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+
+	return end_creating(dentry);
+}
+
+/**
+ * tracefs_create_dir - create a directory in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the directory to
+ * create.
+ * @parent: a pointer to the parent dentry for this directory. This should
+ * be a directory dentry if set. If this parameter is NULL, then the
+ * directory will be created in the root of the tracefs filesystem.
+ *
+ * This function creates a directory in tracefs with the given name, using
+ * simple_dir_inode_operations as its inode operations.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the tracefs_remove() function when the
+ * directory is to be removed. If an error occurs, %NULL will be returned.
+ *
+ * If tracing is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
+{
+ return __create_dir(name, parent, &simple_dir_inode_operations);
+}
+
+/**
+ * tracefs_create_instance_dir - create the tracing instances directory
+ * @name: The name of the instances directory to create
+ * @parent: The parent directory that the instances directory will exist in
+ * @mkdir: The function to call when a mkdir is performed.
+ * @rmdir: The function to call when a rmdir is performed.
+ *
+ * Only one instances directory is allowed; a second call warns and fails.
+ *
+ * The instances directory is special as it allows mkdir and rmdir to be
+ * done by userspace. When a mkdir or rmdir is performed, the inode locks
+ * are released and the methods passed in (@mkdir and @rmdir) are called
+ * without those locks and with the name of the directory being created or
+ * removed within the instances directory.
+ *
+ * Returns the dentry of the instances directory, or NULL on failure.
+ */
+struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
+					   int (*mkdir)(const char *name),
+					   int (*rmdir)(const char *name))
+{
+	struct dentry *instances;
+
+	/* Callbacks already registered means the directory already exists. */
+	if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
+		return NULL;
+
+	instances = __create_dir(name, parent, &tracefs_dir_inode_operations);
+	if (instances) {
+		tracefs_ops.mkdir = mkdir;
+		tracefs_ops.rmdir = rmdir;
+	}
+
+	return instances;
+}
+
+/* Non-zero when @dentry has an inode and is still hashed. */
+static inline int tracefs_positive(struct dentry *dentry)
+{
+	if (!dentry->d_inode)
+		return 0;
+
+	return !d_unhashed(dentry);
+}
+
+/*
+ * Remove @dentry from @parent.
+ *
+ * Nothing happens (and 0 is returned) when @dentry is not positive.
+ * Directories go through simple_rmdir(), everything else through
+ * simple_unlink(); the dentry is deleted when that step reports success.
+ * An extra reference is held across the removal.
+ *
+ * The callers in this file hold @parent's i_mutex around this call.
+ *
+ * Returns 0 on success or when there was nothing to remove, otherwise the
+ * error from simple_rmdir().
+ */
+static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
+{
+	int ret = 0;
+
+	/* tracefs_positive() already guarantees dentry->d_inode != NULL */
+	if (!tracefs_positive(dentry))
+		return 0;
+
+	dget(dentry);
+	if (S_ISDIR(dentry->d_inode->i_mode))
+		ret = simple_rmdir(parent->d_inode, dentry);
+	else
+		simple_unlink(parent->d_inode, dentry);
+
+	if (!ret)
+		d_delete(dentry);
+	dput(dentry);
+
+	return ret;
+}
+
+/**
+ * tracefs_remove - removes a file or directory from the tracefs filesystem
+ * @dentry: a pointer to the dentry of the file or directory to be
+ *          removed.
+ *
+ * This function removes a file or directory in tracefs that was previously
+ * created with a call to another tracefs function (like
+ * tracefs_create_file() or variants thereof.)
+ *
+ * A NULL or error-pointer @dentry, or one whose parent has no inode, is
+ * silently ignored.  When the removal succeeds, the mount reference held
+ * for the entry is released.
+ */
+void tracefs_remove(struct dentry *dentry)
+{
+	struct dentry *parent;
+	struct inode *dir;
+	int ret;
+
+	if (IS_ERR_OR_NULL(dentry))
+		return;
+
+	parent = dentry->d_parent;
+	if (!parent || !parent->d_inode)
+		return;
+
+	dir = parent->d_inode;
+
+	mutex_lock(&dir->i_mutex);
+	ret = __tracefs_remove(dentry, parent);
+	mutex_unlock(&dir->i_mutex);
+
+	if (ret == 0)
+		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+}
+
+/**
+ * tracefs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to the dentry of the directory to be removed.
+ *
+ * This function recursively removes a directory tree in tracefs that
+ * was previously created with a call to another tracefs function
+ * (like tracefs_create_file() or variants thereof.)
+ *
+ * Nothing is done if @dentry is NULL or an error pointer, or if its parent
+ * has no inode.
+ */
+void tracefs_remove_recursive(struct dentry *dentry)
+{
+ struct dentry *child, *parent;
+
+ if (IS_ERR_OR_NULL(dentry))
+ return;
+
+ parent = dentry->d_parent;
+ if (!parent || !parent->d_inode)
+ return;
+
+ /* Start the walk at @dentry: "parent" is the directory being emptied. */
+ parent = dentry;
+ down:
+ mutex_lock(&parent->d_inode->i_mutex);
+ loop:
+ /*
+ * The parent->d_subdirs is protected by the d_lock. Outside that
+ * lock, the child can be unlinked and set to be freed which can
+ * use the d_u.d_child as the rcu head and corrupt this list.
+ */
+ spin_lock(&parent->d_lock);
+ list_for_each_entry(child, &parent->d_subdirs, d_child) {
+ if (!tracefs_positive(child))
+ continue;
+
+ /* perhaps simple_empty(child) makes more sense */
+ if (!list_empty(&child->d_subdirs)) {
+ /* child has entries of its own: descend and empty it first */
+ spin_unlock(&parent->d_lock);
+ mutex_unlock(&parent->d_inode->i_mutex);
+ parent = child;
+ goto down;
+ }
+
+ spin_unlock(&parent->d_lock);
+
+ /* child has no entries below it: remove it and drop its mount pin */
+ if (!__tracefs_remove(child, parent))
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+
+ /*
+ * The parent->d_lock protects against child from unlinking
+ * from d_subdirs. When releasing the parent->d_lock we can
+ * no longer trust that the next pointer is valid.
+ * Restart the loop. We'll skip this one with the
+ * tracefs_positive() check.
+ */
+ goto loop;
+ }
+ spin_unlock(&parent->d_lock);
+
+ /*
+ * The scan found nothing more to remove in this directory: drop its
+ * i_mutex and step up one level, taking the i_mutex of the directory
+ * above it.
+ */
+ mutex_unlock(&parent->d_inode->i_mutex);
+ child = parent;
+ parent = parent->d_parent;
+ mutex_lock(&parent->d_inode->i_mutex);
+
+ if (child != dentry)
+ /* go up */
+ goto loop;
+
+ /* Back at the starting point: remove @dentry itself from its parent. */
+ if (!__tracefs_remove(child, parent))
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ mutex_unlock(&parent->d_inode->i_mutex);
+}
+
+/**
+ * tracefs_initialized - Tells whether tracefs has been registered
+ *
+ * Returns true once tracefs_init() has successfully called
+ * register_filesystem() for tracefs, false otherwise.
+ */
+bool tracefs_initialized(void)
+{
+ return tracefs_registered;
+}
+
+static struct kobject *trace_kobj;
+
+/*
+ * Boot-time setup: create the "tracing" kobject under kernel_kobj and
+ * register the tracefs filesystem type.
+ *
+ * Returns 0 on success, -EINVAL if the kobject cannot be created, or the
+ * error from register_filesystem().  On that last failure the kobject is
+ * released again so it is not leaked.
+ */
+static int __init tracefs_init(void)
+{
+	int retval;
+
+	trace_kobj = kobject_create_and_add("tracing", kernel_kobj);
+	if (!trace_kobj)
+		return -EINVAL;
+
+	retval = register_filesystem(&trace_fs_type);
+	if (retval) {
+		/* undo kobject_create_and_add() */
+		kobject_put(trace_kobj);
+		trace_kobj = NULL;
+		return retval;
+	}
+
+	tracefs_registered = true;
+	return 0;
+}
+core_initcall(tracefs_init);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e627c0acf626..c3d15fe83403 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -50,7 +50,6 @@
*/
#include "ubifs.h"
-#include <linux/aio.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/slab.h>
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 08f3555fbeac..7f885cc8b0b7 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,7 @@
#include <linux/errno.h>
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "udf_i.h"
#include "udf_sb.h"
@@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
int err, pos;
- size_t count = iocb->ki_nbytes;
+ size_t count = iov_iter_count(from);
struct udf_inode_info *iinfo = UDF_I(inode);
mutex_lock(&inode->i_mutex);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a445d599098d..9c1fbd23913d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -38,7 +38,7 @@
#include <linux/slab.h>
#include <linux/crc-itu-t.h>
#include <linux/mpage.h>
-#include <linux/aio.h>
+#include <linux/uio.h>
#include "udf_i.h"
#include "udf_sb.h"
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3a9b7a1b8704..4f8cdc59bc38 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,7 +31,6 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
-#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ce615d12fb44..f44212fae653 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,7 +38,6 @@
#include "xfs_icache.h"
#include "xfs_pnfs.h"
-#include <linux/aio.h>
#include <linux/dcache.h>
#include <linux/falloc.h>
#include <linux/pagevec.h>
@@ -397,7 +396,8 @@ STATIC int /* error (positive) */
xfs_zero_last_block(
struct xfs_inode *ip,
xfs_fsize_t offset,
- xfs_fsize_t isize)
+ xfs_fsize_t isize,
+ bool *did_zeroing)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
@@ -425,6 +425,7 @@ xfs_zero_last_block(
zero_len = mp->m_sb.sb_blocksize - zero_offset;
if (isize + zero_len > offset)
zero_len = offset - isize;
+ *did_zeroing = true;
return xfs_iozero(ip, isize, zero_len);
}
@@ -443,7 +444,8 @@ int /* error (positive) */
xfs_zero_eof(
struct xfs_inode *ip,
xfs_off_t offset, /* starting I/O offset */
- xfs_fsize_t isize) /* current inode size */
+ xfs_fsize_t isize, /* current inode size */
+ bool *did_zeroing)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t start_zero_fsb;
@@ -465,7 +467,7 @@ xfs_zero_eof(
* We only zero a part of that block so it is handled specially.
*/
if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
- error = xfs_zero_last_block(ip, offset, isize);
+ error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
if (error)
return error;
}
@@ -525,6 +527,7 @@ xfs_zero_eof(
if (error)
return error;
+ *did_zeroing = true;
start_zero_fsb = imap.br_startoff + imap.br_blockcount;
ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
}
@@ -567,13 +570,15 @@ restart:
* having to redo all checks before.
*/
if (*pos > i_size_read(inode)) {
+ bool zero = false;
+
if (*iolock == XFS_IOLOCK_SHARED) {
xfs_rw_iunlock(ip, *iolock);
*iolock = XFS_IOLOCK_EXCL;
xfs_rw_ilock(ip, *iolock);
goto restart;
}
- error = xfs_zero_eof(ip, *pos, i_size_read(inode));
+ error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero);
if (error)
return error;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index daafa1f6d260..6163767aa856 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2867,6 +2867,10 @@ xfs_rename(
* Handle RENAME_EXCHANGE flags
*/
if (flags & RENAME_EXCHANGE) {
+ if (target_ip == NULL) {
+ error = -EINVAL;
+ goto error_return;
+ }
error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
target_dp, target_name, target_ip,
&free_list, &first_block, spaceres);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 86cd6b39bed7..a1cd55f3f351 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -384,10 +384,11 @@ enum xfs_prealloc_flags {
XFS_PREALLOC_INVISIBLE = (1 << 4),
};
-int xfs_update_prealloc_flags(struct xfs_inode *,
- enum xfs_prealloc_flags);
-int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
-int xfs_iozero(struct xfs_inode *, loff_t, size_t);
+int xfs_update_prealloc_flags(struct xfs_inode *ip,
+ enum xfs_prealloc_flags flags);
+int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_fsize_t isize, bool *did_zeroing);
+int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count);
#define IHOLD(ip) \
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d919ad7b16bf..e53a90331422 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -751,6 +751,7 @@ xfs_setattr_size(
int error;
uint lock_flags = 0;
uint commit_flags = 0;
+ bool did_zeroing = false;
trace_xfs_setattr(ip);
@@ -794,20 +795,16 @@ xfs_setattr_size(
return error;
/*
- * Now we can make the changes. Before we join the inode to the
- * transaction, take care of the part of the truncation that must be
- * done without the inode lock. This needs to be done before joining
- * the inode to the transaction, because the inode cannot be unlocked
- * once it is a part of the transaction.
+ * File data changes must be complete before we start the transaction to
+ * modify the inode. This needs to be done before joining the inode to
+ * the transaction because the inode cannot be unlocked once it is a
+ * part of the transaction.
+ *
+ * Start with zeroing any data block beyond EOF that we may expose on
+ * file extension.
*/
if (newsize > oldsize) {
- /*
- * Do the first part of growing a file: zero any data in the
- * last block that is beyond the old EOF. We need to do this
- * before the inode is joined to the transaction to modify
- * i_size.
- */
- error = xfs_zero_eof(ip, newsize, oldsize);
+ error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
if (error)
return error;
}
@@ -817,23 +814,18 @@ xfs_setattr_size(
* any previous writes that are beyond the on disk EOF and the new
* EOF that have not been written out need to be written here. If we
* do not write the data out, we expose ourselves to the null files
- * problem.
- *
- * Only flush from the on disk size to the smaller of the in memory
- * file size or the new size as that's the range we really care about
- * here and prevents waiting for other data not within the range we
- * care about here.
+ * problem. Note that this includes any block zeroing we did above;
+ * otherwise those blocks may not be zeroed after a crash.
*/
- if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
+ if (newsize > ip->i_d.di_size &&
+ (oldsize != ip->i_d.di_size || did_zeroing)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
return error;
}
- /*
- * Wait for all direct I/O to complete.
- */
+ /* Now wait for all direct I/O to complete. */
inode_dio_wait(inode);
/*
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 4b33ef112400..365dd57ea760 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -300,8 +300,10 @@ xfs_fs_commit_blocks(
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
- if (error)
+ if (error) {
+ xfs_trans_cancel(tp, 0);
goto out_drop_iolock;
+ }
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 53cc2aaf8d2b..fbbb9e62e274 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -836,6 +836,11 @@ xfs_qm_reset_dqcounts(
*/
xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
"xfs_quotacheck");
+ /*
+ * Reset type in case we are reusing group quota file for
+ * project quotas or vice versa
+ */
+ ddq->d_flags = type;
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;