aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/cmservice.c78
-rw-r--r--fs/afs/fsclient.c221
-rw-r--r--fs/afs/internal.h14
-rw-r--r--fs/afs/rxrpc.c73
-rw-r--r--fs/afs/vlclient.c11
-rw-r--r--fs/aio.c16
-rw-r--r--fs/autofs4/expire.c55
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/block_dev.c5
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/ctree.h6
-rw-r--r--fs/btrfs/delayed-ref.c34
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c56
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/extent-tree.c223
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file.c36
-rw-r--r--fs/btrfs/inode-map.c3
-rw-r--r--fs/btrfs/inode.c83
-rw-r--r--fs/btrfs/ioctl.c14
-rw-r--r--fs/btrfs/qgroup.c62
-rw-r--r--fs/btrfs/qgroup.h36
-rw-r--r--fs/btrfs/relocation.c128
-rw-r--r--fs/btrfs/root-tree.c27
-rw-r--r--fs/btrfs/send.c183
-rw-r--r--fs/btrfs/super.c16
-rw-r--r--fs/btrfs/transaction.c7
-rw-r--r--fs/btrfs/tree-log.c107
-rw-r--r--fs/btrfs/tree-log.h5
-rw-r--r--fs/btrfs/volumes.c27
-rw-r--r--fs/buffer.c3
-rw-r--r--fs/cachefiles/interface.c8
-rw-r--r--fs/cachefiles/internal.h3
-rw-r--r--fs/cachefiles/namei.c8
-rw-r--r--fs/ceph/caps.c5
-rw-r--r--fs/ceph/dir.c2
-rw-r--r--fs/ceph/mds_client.c1
-rw-r--r--fs/cifs/cifsfs.c29
-rw-r--r--fs/cifs/cifsproto.h5
-rw-r--r--fs/cifs/cifssmb.c27
-rw-r--r--fs/cifs/connect.c31
-rw-r--r--fs/cifs/file.c14
-rw-r--r--fs/cifs/link.c2
-rw-r--r--fs/compat.c16
-rw-r--r--fs/configfs/file.c1
-rw-r--r--fs/crypto/policy.c41
-rw-r--r--fs/devpts/inode.c3
-rw-r--r--fs/dlm/debug_fs.c62
-rw-r--r--fs/ext4/inode.c2
-rw-r--r--fs/ext4/ioctl.c2
-rw-r--r--fs/ext4/super.c18
-rw-r--r--fs/ext4/xattr.c53
-rw-r--r--fs/ext4/xattr.h1
-rw-r--r--fs/f2fs/data.c2
-rw-r--r--fs/f2fs/f2fs.h12
-rw-r--r--fs/f2fs/file.c22
-rw-r--r--fs/f2fs/node.c47
-rw-r--r--fs/f2fs/super.c6
-rw-r--r--fs/file.c34
-rw-r--r--fs/fs-writeback.c6
-rw-r--r--fs/fuse/file.c7
-rw-r--r--fs/hpfs/file.c6
-rw-r--r--fs/inode.c41
-rw-r--r--fs/internal.h2
-rw-r--r--fs/ioctl.c6
-rw-r--r--fs/iomap.c26
-rw-r--r--fs/kernfs/file.c28
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/blocklayout/blocklayout.h3
-rw-r--r--fs/nfs/blocklayout/extent_tree.c10
-rw-r--r--fs/nfs/callback.c1
-rw-r--r--fs/nfs/callback_proc.c8
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c45
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c23
-rw-r--r--fs/nfs/internal.h5
-rw-r--r--fs/nfs/nfs42proc.c36
-rw-r--r--fs/nfs/nfs4_fs.h4
-rw-r--r--fs/nfs/nfs4client.c5
-rw-r--r--fs/nfs/nfs4proc.c119
-rw-r--r--fs/nfs/nfs4renewd.c20
-rw-r--r--fs/nfs/nfs4session.c53
-rw-r--r--fs/nfs/nfs4session.h7
-rw-r--r--fs/nfs/nfs4state.c9
-rw-r--r--fs/nfs/pnfs.c44
-rw-r--r--fs/nfs/super.c19
-rw-r--r--fs/nfsd/nfs4state.c65
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/notify/fanotify/fanotify.c13
-rw-r--r--fs/notify/fanotify/fanotify_user.c36
-rw-r--r--fs/notify/group.c19
-rw-r--r--fs/notify/notification.c23
-rw-r--r--fs/ntfs/file.c2
-rw-r--r--fs/ocfs2/alloc.c56
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h5
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c12
-rw-r--r--fs/ocfs2/file.c34
-rw-r--r--fs/ocfs2/suballoc.c14
-rw-r--r--fs/overlayfs/copy_up.c2
-rw-r--r--fs/overlayfs/dir.c58
-rw-r--r--fs/overlayfs/inode.c108
-rw-r--r--fs/overlayfs/overlayfs.h17
-rw-r--r--fs/overlayfs/readdir.c63
-rw-r--r--fs/overlayfs/super.c93
-rw-r--r--fs/pipe.c4
-rw-r--r--fs/posix_acl.c18
-rw-r--r--fs/proc/base.c52
-rw-r--r--fs/proc/fd.c8
-rw-r--r--fs/proc/fd.h2
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/kcore.c31
-rw-r--r--fs/proc/meminfo.c2
-rw-r--r--fs/proc/task_mmu.c2
-rw-r--r--fs/ramfs/file-mmu.c9
-rw-r--r--fs/seq_file.c4
-rw-r--r--fs/sysfs/file.c8
-rw-r--r--fs/ubifs/tnc_commit.c2
-rw-r--r--fs/ubifs/xattr.c5
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c16
-rw-r--r--fs/xfs/libxfs/xfs_btree.c14
-rw-r--r--fs/xfs/libxfs/xfs_defer.c17
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_format.h13
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_sb.c3
-rw-r--r--fs/xfs/xfs_buf.c3
-rw-r--r--fs/xfs/xfs_file.c13
-rw-r--r--fs/xfs/xfs_fsops.c1
-rw-r--r--fs/xfs/xfs_iomap.c69
-rw-r--r--fs/xfs/xfs_iomap.h1
-rw-r--r--fs/xfs/xfs_iops.c9
-rw-r--r--fs/xfs/xfs_super.c9
-rw-r--r--fs/xfs/xfs_trace.h3
136 files changed, 2345 insertions, 1159 deletions
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 4b0eff6da674..85737e96ab8b 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -189,11 +189,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
case 1:
_debug("extract FID count");
ret = afs_extract_data(call, skb, last, &call->tmp, 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->count = ntohl(call->tmp);
_debug("FID count: %u", call->count);
@@ -210,11 +207,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
_debug("extract FID array");
ret = afs_extract_data(call, skb, last, call->buffer,
call->count * 3 * 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
_debug("unmarshall FID array");
call->request = kcalloc(call->count,
@@ -239,11 +233,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
case 3:
_debug("extract CB count");
ret = afs_extract_data(call, skb, last, &call->tmp, 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
tmp = ntohl(call->tmp);
_debug("CB count: %u", tmp);
@@ -258,11 +249,8 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
_debug("extract CB array");
ret = afs_extract_data(call, skb, last, call->request,
call->count * 3 * 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
_debug("unmarshall CB array");
cb = call->request;
@@ -278,9 +266,9 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
call->unmarshall++;
case 5:
- _debug("trailer");
- if (skb->len != 0)
- return -EBADMSG;
+ ret = afs_data_complete(call, skb, last);
+ if (ret < 0)
+ return ret;
/* Record that the message was unmarshalled successfully so
* that the call destructor can know do the callback breaking
@@ -294,8 +282,6 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
break;
}
- if (!last)
- return 0;
call->state = AFS_CALL_REPLYING;
@@ -335,13 +321,13 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
{
struct afs_server *server;
struct in_addr addr;
+ int ret;
_enter(",{%u},%d", skb->len, last);
- if (skb->len > 0)
- return -EBADMSG;
- if (!last)
- return 0;
+ ret = afs_data_complete(call, skb, last);
+ if (ret < 0)
+ return ret;
/* no unmarshalling required */
call->state = AFS_CALL_REPLYING;
@@ -371,8 +357,10 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
_enter(",{%u},%d", skb->len, last);
+ /* There are some arguments that we ignore */
+ afs_data_consumed(call, skb);
if (!last)
- return 0;
+ return -EAGAIN;
/* no unmarshalling required */
call->state = AFS_CALL_REPLYING;
@@ -408,12 +396,13 @@ static void SRXAFSCB_Probe(struct work_struct *work)
static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
bool last)
{
+ int ret;
+
_enter(",{%u},%d", skb->len, last);
- if (skb->len > 0)
- return -EBADMSG;
- if (!last)
- return 0;
+ ret = afs_data_complete(call, skb, last);
+ if (ret < 0)
+ return ret;
/* no unmarshalling required */
call->state = AFS_CALL_REPLYING;
@@ -460,10 +449,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- if (skb->len > 0)
- return -EBADMSG;
- if (!last)
- return 0;
+ ret = afs_data_complete(call, skb, last);
+ if (ret < 0)
+ return ret;
switch (call->unmarshall) {
case 0:
@@ -509,8 +497,9 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb,
break;
}
- if (!last)
- return 0;
+ ret = afs_data_complete(call, skb, last);
+ if (ret < 0)
+ return ret;
call->state = AFS_CALL_REPLYING;
@@ -588,12 +577,13 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call,
struct sk_buff *skb, bool last)
{
+ int ret;
+
_enter(",{%u},%d", skb->len, last);
- if (skb->len > 0)
- return -EBADMSG;
- if (!last)
- return 0;
+ ret = afs_data_complete(call, skb, last);
+ if (ret < 0)
+ return ret;
/* no unmarshalling required */
call->state = AFS_CALL_REPLYING;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index c2e930ec2888..9312b92e54be 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -240,15 +240,13 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call,
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
+ int ret;
_enter(",,%u", last);
- afs_transfer_reply(call, skb);
- if (!last)
- return 0;
-
- if (call->reply_size != call->reply_max)
- return -EBADMSG;
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
@@ -335,11 +333,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
case 1:
_debug("extract data length (MSW)");
ret = afs_extract_data(call, skb, last, &call->tmp, 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->count = ntohl(call->tmp);
_debug("DATA length MSW: %u", call->count);
@@ -353,11 +348,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
case 2:
_debug("extract data length");
ret = afs_extract_data(call, skb, last, &call->tmp, 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->count = ntohl(call->tmp);
_debug("DATA length: %u", call->count);
@@ -375,11 +367,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
ret = afs_extract_data(call, skb, last, buffer,
call->count);
kunmap_atomic(buffer);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
}
call->offset = 0;
@@ -389,11 +378,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
case 4:
ret = afs_extract_data(call, skb, last, call->buffer,
(21 + 3 + 6) * 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
bp = call->buffer;
xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode, NULL);
@@ -405,15 +391,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call,
call->unmarshall++;
case 5:
- _debug("trailer");
- if (skb->len != 0)
- return -EBADMSG;
+ ret = afs_data_complete(call, skb, last);
+ if (ret < 0)
+ return ret;
break;
}
- if (!last)
- return 0;
-
if (call->count < PAGE_SIZE) {
_debug("clear");
page = call->reply3;
@@ -537,9 +520,8 @@ static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
{
_enter(",{%u},%d", skb->len, last);
- if (skb->len > 0)
- return -EBADMSG; /* shouldn't be any reply data */
- return 0;
+ /* shouldn't be any reply data */
+ return afs_data_complete(call, skb, last);
}
/*
@@ -622,15 +604,13 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call,
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
+ int ret;
_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- afs_transfer_reply(call, skb);
- if (!last)
- return 0;
-
- if (call->reply_size != call->reply_max)
- return -EBADMSG;
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
@@ -721,15 +701,13 @@ static int afs_deliver_fs_remove(struct afs_call *call,
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
+ int ret;
_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- afs_transfer_reply(call, skb);
- if (!last)
- return 0;
-
- if (call->reply_size != call->reply_max)
- return -EBADMSG;
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
@@ -804,15 +782,13 @@ static int afs_deliver_fs_link(struct afs_call *call,
{
struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
const __be32 *bp;
+ int ret;
_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- afs_transfer_reply(call, skb);
- if (!last)
- return 0;
-
- if (call->reply_size != call->reply_max)
- return -EBADMSG;
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
@@ -892,15 +868,13 @@ static int afs_deliver_fs_symlink(struct afs_call *call,
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
+ int ret;
_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- afs_transfer_reply(call, skb);
- if (!last)
- return 0;
-
- if (call->reply_size != call->reply_max)
- return -EBADMSG;
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
@@ -999,15 +973,13 @@ static int afs_deliver_fs_rename(struct afs_call *call,
{
struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
const __be32 *bp;
+ int ret;
_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- afs_transfer_reply(call, skb);
- if (!last)
- return 0;
-
- if (call->reply_size != call->reply_max)
- return -EBADMSG;
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
@@ -1105,20 +1077,13 @@ static int afs_deliver_fs_store_data(struct afs_call *call,
{
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
+ int ret;
_enter(",,%u", last);
- afs_transfer_reply(call, skb);
- if (!last) {
- _leave(" = 0 [more]");
- return 0;
- }
-
- if (call->reply_size != call->reply_max) {
- _leave(" = -EBADMSG [%u != %u]",
- call->reply_size, call->reply_max);
- return -EBADMSG;
- }
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
@@ -1292,20 +1257,13 @@ static int afs_deliver_fs_store_status(struct afs_call *call,
afs_dataversion_t *store_version;
struct afs_vnode *vnode = call->reply;
const __be32 *bp;
+ int ret;
_enter(",,%u", last);
- afs_transfer_reply(call, skb);
- if (!last) {
- _leave(" = 0 [more]");
- return 0;
- }
-
- if (call->reply_size != call->reply_max) {
- _leave(" = -EBADMSG [%u != %u]",
- call->reply_size, call->reply_max);
- return -EBADMSG;
- }
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
store_version = NULL;
@@ -1504,11 +1462,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
_debug("extract status");
ret = afs_extract_data(call, skb, last, call->buffer,
12 * 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
bp = call->buffer;
xdr_decode_AFSFetchVolumeStatus(&bp, call->reply2);
@@ -1518,11 +1473,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
/* extract the volume name length */
case 2:
ret = afs_extract_data(call, skb, last, &call->tmp, 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->count = ntohl(call->tmp);
_debug("volname length: %u", call->count);
@@ -1537,11 +1489,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
if (call->count > 0) {
ret = afs_extract_data(call, skb, last, call->reply3,
call->count);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
}
p = call->reply3;
@@ -1561,11 +1510,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
case 4:
ret = afs_extract_data(call, skb, last, call->buffer,
call->count);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->offset = 0;
call->unmarshall++;
@@ -1574,11 +1520,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
/* extract the offline message length */
case 5:
ret = afs_extract_data(call, skb, last, &call->tmp, 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->count = ntohl(call->tmp);
_debug("offline msg length: %u", call->count);
@@ -1593,11 +1536,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
if (call->count > 0) {
ret = afs_extract_data(call, skb, last, call->reply3,
call->count);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
}
p = call->reply3;
@@ -1617,11 +1557,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
case 7:
ret = afs_extract_data(call, skb, last, call->buffer,
call->count);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->offset = 0;
call->unmarshall++;
@@ -1630,11 +1567,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
/* extract the message of the day length */
case 8:
ret = afs_extract_data(call, skb, last, &call->tmp, 4);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->count = ntohl(call->tmp);
_debug("motd length: %u", call->count);
@@ -1649,11 +1583,8 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
if (call->count > 0) {
ret = afs_extract_data(call, skb, last, call->reply3,
call->count);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
}
p = call->reply3;
@@ -1673,26 +1604,20 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call,
case 10:
ret = afs_extract_data(call, skb, last, call->buffer,
call->count);
- switch (ret) {
- case 0: break;
- case -EAGAIN: return 0;
- default: return ret;
- }
+ if (ret < 0)
+ return ret;
call->offset = 0;
call->unmarshall++;
no_motd_padding:
case 11:
- _debug("trailer %d", skb->len);
- if (skb->len != 0)
- return -EBADMSG;
+ ret = afs_data_complete(call, skb, last);
+ if (ret < 0)
+ return ret;
break;
}
- if (!last)
- return 0;
-
_leave(" = 0 [done]");
return 0;
}
@@ -1764,15 +1689,13 @@ static int afs_deliver_fs_xxxx_lock(struct afs_call *call,
struct sk_buff *skb, bool last)
{
const __be32 *bp;
+ int ret;
_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
- afs_transfer_reply(call, skb);
- if (!last)
- return 0;
-
- if (call->reply_size != call->reply_max)
- return -EBADMSG;
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 71d5982312f3..df976b2a7f40 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -609,17 +609,29 @@ extern void afs_proc_cell_remove(struct afs_cell *);
*/
extern int afs_open_socket(void);
extern void afs_close_socket(void);
+extern void afs_data_consumed(struct afs_call *, struct sk_buff *);
extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
const struct afs_wait_mode *);
extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
size_t, size_t);
extern void afs_flat_call_destructor(struct afs_call *);
-extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
+extern int afs_transfer_reply(struct afs_call *, struct sk_buff *, bool);
extern void afs_send_empty_reply(struct afs_call *);
extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
size_t);
+static inline int afs_data_complete(struct afs_call *call, struct sk_buff *skb,
+ bool last)
+{
+ if (skb->len > 0)
+ return -EBADMSG;
+ afs_data_consumed(call, skb);
+ if (!last)
+ return -EAGAIN;
+ return 0;
+}
+
/*
* security.c
*/
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 4832de84d52c..14d04c848465 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -150,10 +150,9 @@ void afs_close_socket(void)
}
/*
- * note that the data in a socket buffer is now delivered and that the buffer
- * should be freed
+ * Note that the data in a socket buffer is now consumed.
*/
-static void afs_data_delivered(struct sk_buff *skb)
+void afs_data_consumed(struct afs_call *call, struct sk_buff *skb)
{
if (!skb) {
_debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
@@ -161,9 +160,7 @@ static void afs_data_delivered(struct sk_buff *skb)
} else {
_debug("DLVR %p{%u} [%d]",
skb, skb->mark, atomic_read(&afs_outstanding_skbs));
- if (atomic_dec_return(&afs_outstanding_skbs) == -1)
- BUG();
- rxrpc_kernel_data_delivered(skb);
+ rxrpc_kernel_data_consumed(call->rxcall, skb);
}
}
@@ -489,9 +486,15 @@ static void afs_deliver_to_call(struct afs_call *call)
last = rxrpc_kernel_is_data_last(skb);
ret = call->type->deliver(call, skb, last);
switch (ret) {
+ case -EAGAIN:
+ if (last) {
+ _debug("short data");
+ goto unmarshal_error;
+ }
+ break;
case 0:
- if (last &&
- call->state == AFS_CALL_AWAIT_REPLY)
+ ASSERT(last);
+ if (call->state == AFS_CALL_AWAIT_REPLY)
call->state = AFS_CALL_COMPLETE;
break;
case -ENOTCONN:
@@ -501,6 +504,7 @@ static void afs_deliver_to_call(struct afs_call *call)
abort_code = RX_INVALID_OPERATION;
goto do_abort;
default:
+ unmarshal_error:
abort_code = RXGEN_CC_UNMARSHAL;
if (call->state != AFS_CALL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
@@ -511,9 +515,7 @@ static void afs_deliver_to_call(struct afs_call *call)
call->state = AFS_CALL_ERROR;
break;
}
- afs_data_delivered(skb);
- skb = NULL;
- continue;
+ break;
case RXRPC_SKB_MARK_FINAL_ACK:
_debug("Rcv ACK");
call->state = AFS_CALL_COMPLETE;
@@ -685,15 +687,35 @@ static void afs_process_async_call(struct afs_call *call)
}
/*
- * empty a socket buffer into a flat reply buffer
+ * Empty a socket buffer into a flat reply buffer.
*/
-void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
+int afs_transfer_reply(struct afs_call *call, struct sk_buff *skb, bool last)
{
size_t len = skb->len;
- if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
- BUG();
- call->reply_size += len;
+ if (len > call->reply_max - call->reply_size) {
+ _leave(" = -EBADMSG [%zu > %u]",
+ len, call->reply_max - call->reply_size);
+ return -EBADMSG;
+ }
+
+ if (len > 0) {
+ if (skb_copy_bits(skb, 0, call->buffer + call->reply_size,
+ len) < 0)
+ BUG();
+ call->reply_size += len;
+ }
+
+ afs_data_consumed(call, skb);
+ if (!last)
+ return -EAGAIN;
+
+ if (call->reply_size != call->reply_max) {
+ _leave(" = -EBADMSG [%u != %u]",
+ call->reply_size, call->reply_max);
+ return -EBADMSG;
+ }
+ return 0;
}
/*
@@ -745,7 +767,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
}
/*
- * grab the operation ID from an incoming cache manager call
+ * Grab the operation ID from an incoming cache manager call. The socket
+ * buffer is discarded on error or if we don't yet have sufficient data.
*/
static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
bool last)
@@ -766,12 +789,9 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
call->offset += len;
if (call->offset < 4) {
- if (last) {
- _leave(" = -EBADMSG [op ID short]");
- return -EBADMSG;
- }
- _leave(" = 0 [incomplete]");
- return 0;
+ afs_data_consumed(call, skb);
+ _leave(" = -EAGAIN");
+ return -EAGAIN;
}
call->state = AFS_CALL_AWAIT_REQUEST;
@@ -855,7 +875,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
}
/*
- * extract a piece of data from the received data socket buffers
+ * Extract a piece of data from the received data socket buffers.
*/
int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
bool last, void *buf, size_t count)
@@ -873,10 +893,7 @@ int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
call->offset += len;
if (call->offset < count) {
- if (last) {
- _leave(" = -EBADMSG [%d < %zu]", call->offset, count);
- return -EBADMSG;
- }
+ afs_data_consumed(call, skb);
_leave(" = -EAGAIN");
return -EAGAIN;
}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 340afd0cd182..f94d1abdc3eb 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -64,16 +64,13 @@ static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
struct afs_cache_vlocation *entry;
__be32 *bp;
u32 tmp;
- int loop;
+ int loop, ret;
_enter(",,%u", last);
- afs_transfer_reply(call, skb);
- if (!last)
- return 0;
-
- if (call->reply_size != call->reply_max)
- return -EBADMSG;
+ ret = afs_transfer_reply(call, skb, last);
+ if (ret < 0)
+ return ret;
/* unmarshall the reply once we've received all of it */
entry = call->reply;
diff --git a/fs/aio.c b/fs/aio.c
index fb8e45b88cd4..1157e13a36d6 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,7 +239,12 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
static const struct dentry_operations ops = {
.d_dname = simple_dname,
};
- return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
+ struct dentry *root = mount_pseudo(fs_type, "aio:", NULL, &ops,
+ AIO_RING_MAGIC);
+
+ if (!IS_ERR(root))
+ root->d_sb->s_iflags |= SB_I_NOEXEC;
+ return root;
}
/* aio_setup
@@ -269,14 +274,17 @@ __initcall(aio_setup);
static void put_aio_ring_file(struct kioctx *ctx)
{
struct file *aio_ring_file = ctx->aio_ring_file;
+ struct address_space *i_mapping;
+
if (aio_ring_file) {
truncate_setsize(aio_ring_file->f_inode, 0);
/* Prevent further access to the kioctx from migratepages */
- spin_lock(&aio_ring_file->f_inode->i_mapping->private_lock);
- aio_ring_file->f_inode->i_mapping->private_data = NULL;
+ i_mapping = aio_ring_file->f_inode->i_mapping;
+ spin_lock(&i_mapping->private_lock);
+ i_mapping->private_data = NULL;
ctx->aio_ring_file = NULL;
- spin_unlock(&aio_ring_file->f_inode->i_mapping->private_lock);
+ spin_unlock(&i_mapping->private_lock);
fput(aio_ring_file);
}
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index b493909e7492..d8e6d421c27f 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -417,6 +417,7 @@ static struct dentry *should_expire(struct dentry *dentry,
}
return NULL;
}
+
/*
* Find an eligible tree to time-out
* A tree is eligible if :-
@@ -432,6 +433,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
struct dentry *root = sb->s_root;
struct dentry *dentry;
struct dentry *expired;
+ struct dentry *found;
struct autofs_info *ino;
if (!root)
@@ -442,31 +444,46 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
dentry = NULL;
while ((dentry = get_next_positive_subdir(dentry, root))) {
+ int flags = how;
+
spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(dentry);
- if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
- expired = NULL;
- else
- expired = should_expire(dentry, mnt, timeout, how);
- if (!expired) {
+ if (ino->flags & AUTOFS_INF_WANT_EXPIRE) {
spin_unlock(&sbi->fs_lock);
continue;
}
+ spin_unlock(&sbi->fs_lock);
+
+ expired = should_expire(dentry, mnt, timeout, flags);
+ if (!expired)
+ continue;
+
+ spin_lock(&sbi->fs_lock);
ino = autofs4_dentry_ino(expired);
ino->flags |= AUTOFS_INF_WANT_EXPIRE;
spin_unlock(&sbi->fs_lock);
synchronize_rcu();
- spin_lock(&sbi->fs_lock);
- if (should_expire(expired, mnt, timeout, how)) {
- if (expired != dentry)
- dput(dentry);
- goto found;
- }
+ /* Make sure a reference is not taken on found if
+ * things have changed.
+ */
+ flags &= ~AUTOFS_EXP_LEAVES;
+ found = should_expire(expired, mnt, timeout, how);
+ if (!found || found != expired)
+ /* Something has changed, continue */
+ goto next;
+
+ if (expired != dentry)
+ dput(dentry);
+
+ spin_lock(&sbi->fs_lock);
+ goto found;
+next:
+ spin_lock(&sbi->fs_lock);
ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
+ spin_unlock(&sbi->fs_lock);
if (expired != dentry)
dput(expired);
- spin_unlock(&sbi->fs_lock);
}
return NULL;
@@ -483,6 +500,7 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
struct autofs_info *ino = autofs4_dentry_ino(dentry);
int status;
+ int state;
/* Block on any pending expire */
if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE))
@@ -490,8 +508,19 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
if (rcu_walk)
return -ECHILD;
+retry:
spin_lock(&sbi->fs_lock);
- if (ino->flags & AUTOFS_INF_EXPIRING) {
+ state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING);
+ if (state == AUTOFS_INF_WANT_EXPIRE) {
+ spin_unlock(&sbi->fs_lock);
+ /*
+ * Possibly being selected for expire, wait until
+ * it's selected or not.
+ */
+ schedule_timeout_uninterruptible(HZ/10);
+ goto retry;
+ }
+ if (state & AUTOFS_INF_EXPIRING) {
spin_unlock(&sbi->fs_lock);
pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7f6aff3f72eb..e5495f37c6ed 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -853,6 +853,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
current->flags |= PF_RANDOMIZE;
setup_new_exec(bprm);
+ install_exec_creds(bprm);
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
@@ -1044,7 +1045,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- install_exec_creds(bprm);
retval = create_elf_tables(bprm, &loc->elf_ex,
load_addr, interp_load_addr);
if (retval < 0)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c3cdde87cc8c..08ae99343d92 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -249,7 +249,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
* thaw_bdev drops it.
*/
sb = get_super(bdev);
- drop_super(sb);
+ if (sb)
+ drop_super(sb);
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return sb;
}
@@ -646,7 +647,7 @@ static struct dentry *bd_mount(struct file_system_type *fs_type,
{
struct dentry *dent;
dent = mount_pseudo(fs_type, "bdev:", &bdev_sops, NULL, BDEVFS_MAGIC);
- if (dent)
+ if (!IS_ERR(dent))
dent->d_sb->s_iflags |= SB_I_CGROUPWB;
return dent;
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2b88439c2ee8..455a6b2fd539 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -589,6 +589,7 @@ static void __merge_refs(struct list_head *head, int mode)
list_del(&ref2->list);
kmem_cache_free(btrfs_prelim_ref_cache, ref2);
+ cond_resched();
}
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2fe8f89091a3..33fe03551105 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -427,6 +427,7 @@ struct btrfs_space_info {
struct list_head ro_bgs;
struct list_head priority_tickets;
struct list_head tickets;
+ u64 tickets_id;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
@@ -1028,6 +1029,7 @@ struct btrfs_fs_info {
struct btrfs_workqueue *qgroup_rescan_workers;
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
+ bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */
/* filesystem state */
unsigned long fs_state;
@@ -1079,6 +1081,8 @@ struct btrfs_fs_info {
struct list_head pinned_chunks;
int creating_free_space_tree;
+ /* Used to record internally whether fs has been frozen */
+ int fs_frozen;
};
struct btrfs_subvolume_writers {
@@ -2578,7 +2582,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index b6d210e7a993..ac02e041464b 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -541,7 +541,6 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_qgroup_extent_record *qexisting;
int count_mod = 1;
int must_insert_reserved = 0;
@@ -606,10 +605,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
- qexisting = btrfs_qgroup_insert_dirty_extent(fs_info,
- delayed_refs,
- qrecord);
- if (qexisting)
+ if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info,
+ delayed_refs, qrecord))
kfree(qrecord);
}
@@ -862,33 +859,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
return 0;
}
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- u64 ref_root, u64 bytenr, u64 num_bytes)
-{
- struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_head *ref_head;
- int ret = 0;
-
- if (!fs_info->quota_enabled || !is_fstree(ref_root))
- return 0;
-
- delayed_refs = &trans->transaction->delayed_refs;
-
- spin_lock(&delayed_refs->lock);
- ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
- if (!ref_head) {
- ret = -ENOENT;
- goto out;
- }
- WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
- ref_head->qgroup_ref_root = ref_root;
- ref_head->qgroup_reserved = num_bytes;
-out:
- spin_unlock(&delayed_refs->lock);
- return ret;
-}
-
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 5fca9534a271..43f3629760e9 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -250,9 +250,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 parent, u64 ref_root,
u64 owner, u64 offset, u64 reserved, int action,
struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- u64 ref_root, u64 bytenr, u64 num_bytes);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 59febfb8d04a..54bc8c7c6bcd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -559,8 +559,29 @@ static noinline int check_leaf(struct btrfs_root *root,
u32 nritems = btrfs_header_nritems(leaf);
int slot;
- if (nritems == 0)
+ if (nritems == 0) {
+ struct btrfs_root *check_root;
+
+ key.objectid = btrfs_header_owner(leaf);
+ key.type = BTRFS_ROOT_ITEM_KEY;
+ key.offset = (u64)-1;
+
+ check_root = btrfs_get_fs_root(root->fs_info, &key, false);
+ /*
+ * The only reason we also check NULL here is that during
+ * open_ctree() some roots has not yet been set up.
+ */
+ if (!IS_ERR_OR_NULL(check_root)) {
+ /* if leaf is the root, then it's fine */
+ if (leaf->start !=
+ btrfs_root_bytenr(&check_root->root_item)) {
+ CORRUPT("non-root leaf's nritems is 0",
+ leaf, root, 0);
+ return -EIO;
+ }
+ }
return 0;
+ }
/* Check the 0 item */
if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
@@ -612,6 +633,19 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0;
}
+static int check_node(struct btrfs_root *root, struct extent_buffer *node)
+{
+ unsigned long nr = btrfs_header_nritems(node);
+
+ if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) {
+ btrfs_crit(root->fs_info,
+ "corrupt node: block %llu root %llu nritems %lu",
+ node->start, root->objectid, nr);
+ return -EIO;
+ }
+ return 0;
+}
+
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
u64 phy_offset, struct page *page,
u64 start, u64 end, int mirror)
@@ -682,6 +716,9 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
ret = -EIO;
}
+ if (found_level > 0 && check_node(root, eb))
+ ret = -EIO;
+
if (!ret)
set_extent_buffer_uptodate(eb);
err:
@@ -1618,8 +1655,8 @@ fail:
return ret;
}
-static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
- u64 root_id)
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id)
{
struct btrfs_root *root;
@@ -2298,6 +2335,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
fs_info->qgroup_ulist = NULL;
+ fs_info->qgroup_rescan_running = false;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2624,6 +2662,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->qgroup_op_seq, 0);
atomic_set(&fs_info->reada_works_cnt, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
+ fs_info->fs_frozen = 0;
fs_info->sb = sb;
fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
fs_info->metadata_ratio = 0;
@@ -3739,8 +3778,15 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
btrfs_free_log(NULL, root);
+ if (root->reloc_root) {
+ free_extent_buffer(root->reloc_root->node);
+ free_extent_buffer(root->reloc_root->commit_root);
+ btrfs_put_fs_root(root->reloc_root);
+ root->reloc_root = NULL;
+ }
+ }
if (root->free_ino_pinned)
__btrfs_remove_free_space_cache(root->free_ino_pinned);
@@ -3851,7 +3897,7 @@ void close_ctree(struct btrfs_root *root)
smp_mb();
/* wait for the qgroup rescan worker to stop */
- btrfs_qgroup_wait_for_completion(fs_info);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b3207a0e09f7..f19a982f5a4f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,6 +68,8 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+ u64 root_id);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 61b494e8e604..665da8f66ff1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -60,21 +60,6 @@ enum {
CHUNK_ALLOC_FORCE = 2,
};
-/*
- * Control how reservations are dealt with.
- *
- * RESERVE_FREE - freeing a reservation.
- * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
- * ENOSPC accounting
- * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
- * bytes_may_use as the ENOSPC accounting is done elsewhere
- */
-enum {
- RESERVE_FREE = 0,
- RESERVE_ALLOC = 1,
- RESERVE_ALLOC_NO_ACCOUNT = 2,
-};
-
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
u64 num_bytes, int alloc);
@@ -104,9 +89,10 @@ static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
-static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve,
- int delalloc);
+static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 ram_bytes, u64 num_bytes, int delalloc);
+static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
@@ -3501,7 +3487,6 @@ again:
dcs = BTRFS_DC_SETUP;
else if (ret == -ENOSPC)
set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
- btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
@@ -4286,13 +4271,10 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
if (ret < 0)
return ret;
- /*
- * Use new btrfs_qgroup_reserve_data to reserve precious data space
- *
- * TODO: Find a good method to avoid reserve data space for NOCOW
- * range, but don't impact performance on quota disable case.
- */
+ /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
ret = btrfs_qgroup_reserve_data(inode, start, len);
+ if (ret)
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
return ret;
}
@@ -4472,6 +4454,15 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
}
}
+/*
+ * If force is CHUNK_ALLOC_FORCE:
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+ * If force is NOT CHUNK_ALLOC_FORCE:
+ * - return 0 if it doesn't need to allocate a new chunk,
+ * - return 1 if it successfully allocates a chunk,
+ * - return errors including -ENOSPC otherwise.
+ */
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags, int force)
{
@@ -4882,7 +4873,7 @@ static int flush_space(struct btrfs_root *root,
btrfs_get_alloc_profile(root, 0),
CHUNK_ALLOC_NO_FORCE);
btrfs_end_transaction(trans, root);
- if (ret == -ENOSPC)
+ if (ret > 0 || ret == -ENOSPC)
ret = 0;
break;
case COMMIT_TRANS:
@@ -4907,11 +4898,6 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
u64 expected;
u64 to_reclaim = 0;
- to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (can_overcommit(root, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL))
- return 0;
-
list_for_each_entry(ticket, &space_info->tickets, list)
to_reclaim += ticket->bytes;
list_for_each_entry(ticket, &space_info->priority_tickets, list)
@@ -4919,6 +4905,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
if (to_reclaim)
return to_reclaim;
+ to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
+ if (can_overcommit(root, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL))
+ return 0;
+
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
@@ -4972,12 +4963,12 @@ static void wake_all_tickets(struct list_head *head)
*/
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
- struct reserve_ticket *last_ticket = NULL;
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
int flush_state;
int commit_cycles = 0;
+ u64 last_tickets_id;
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
@@ -4990,8 +4981,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
spin_unlock(&space_info->lock);
return;
}
- last_ticket = list_first_entry(&space_info->tickets,
- struct reserve_ticket, list);
+ last_tickets_id = space_info->tickets_id;
spin_unlock(&space_info->lock);
flush_state = FLUSH_DELAYED_ITEMS_NR;
@@ -5011,10 +5001,10 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
space_info);
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
- if (last_ticket == ticket) {
+ if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
- last_ticket = ticket;
+ last_tickets_id = space_info->tickets_id;
flush_state = FLUSH_DELAYED_ITEMS_NR;
if (commit_cycles)
commit_cycles--;
@@ -5390,6 +5380,7 @@ again:
list_del_init(&ticket->list);
num_bytes -= ticket->bytes;
ticket->bytes = 0;
+ space_info->tickets_id++;
wake_up(&ticket->wait);
} else {
ticket->bytes -= num_bytes;
@@ -5432,6 +5423,7 @@ again:
num_bytes -= ticket->bytes;
space_info->bytes_may_use += ticket->bytes;
ticket->bytes = 0;
+ space_info->tickets_id++;
wake_up(&ticket->wait);
} else {
trace_btrfs_space_reservation(fs_info, "space_info",
@@ -6497,19 +6489,15 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
}
/**
- * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * btrfs_add_reserved_bytes - update the block_group and space info counters
* @cache: The cache we are manipulating
+ * @ram_bytes: The number of bytes of file content, and will be same to
+ * @num_bytes except for the compress path.
* @num_bytes: The number of bytes in question
- * @reserve: One of the reservation enums
* @delalloc: The blocks are allocated for the delalloc write
*
- * This is called by the allocator when it reserves space, or by somebody who is
- * freeing space that was never actually used on disk. For example if you
- * reserve some space for a new leaf in transaction A and before transaction A
- * commits you free that leaf, you call this with reserve set to 0 in order to
- * clear the reservation.
- *
- * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * This is called by the allocator when it reserves space. Metadata
+ * reservations should be called with RESERVE_ALLOC so we do the proper
* ENOSPC accounting. For data we handle the reservation through clearing the
* delalloc bits in the io_tree. We have to do this since we could end up
* allocating less disk space for the amount of data we have reserved in the
@@ -6519,44 +6507,63 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
* make the reservation and return -EAGAIN, otherwise this function always
* succeeds.
*/
-static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve, int delalloc)
+static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 ram_bytes, u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
- if (reserve != RESERVE_FREE) {
- if (cache->ro) {
- ret = -EAGAIN;
- } else {
- cache->reserved += num_bytes;
- space_info->bytes_reserved += num_bytes;
- if (reserve == RESERVE_ALLOC) {
- trace_btrfs_space_reservation(cache->fs_info,
- "space_info", space_info->flags,
- num_bytes, 0);
- space_info->bytes_may_use -= num_bytes;
- }
-
- if (delalloc)
- cache->delalloc_bytes += num_bytes;
- }
+ if (cache->ro) {
+ ret = -EAGAIN;
} else {
- if (cache->ro)
- space_info->bytes_readonly += num_bytes;
- cache->reserved -= num_bytes;
- space_info->bytes_reserved -= num_bytes;
+ cache->reserved += num_bytes;
+ space_info->bytes_reserved += num_bytes;
+ trace_btrfs_space_reservation(cache->fs_info,
+ "space_info", space_info->flags,
+ ram_bytes, 0);
+ space_info->bytes_may_use -= ram_bytes;
if (delalloc)
- cache->delalloc_bytes -= num_bytes;
+ cache->delalloc_bytes += num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
return ret;
}
+/**
+ * btrfs_free_reserved_bytes - update the block_group and space info counters
+ * @cache: The cache we are manipulating
+ * @num_bytes: The number of bytes in question
+ * @delalloc: The blocks are allocated for the delalloc write
+ *
+ * This is called by somebody who is freeing space that was never actually used
+ * on disk. For example if you reserve some space for a new leaf in transaction
+ * A and before transaction A commits you free that leaf, you call this with
+ * reserve set to 0 in order to clear the reservation.
+ */
+
+static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int delalloc)
+{
+ struct btrfs_space_info *space_info = cache->space_info;
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ spin_lock(&cache->lock);
+ if (cache->ro)
+ space_info->bytes_readonly += num_bytes;
+ cache->reserved -= num_bytes;
+ space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
+ spin_unlock(&cache->lock);
+ spin_unlock(&space_info->lock);
+ return ret;
+}
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -7191,7 +7198,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+ btrfs_free_reserved_bytes(cache, buf->len, 0);
btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
@@ -7416,9 +7423,9 @@ btrfs_release_block_group(struct btrfs_block_group_cache *cache,
* the free space extent currently.
*/
static noinline int find_free_extent(struct btrfs_root *orig_root,
- u64 num_bytes, u64 empty_size,
- u64 hint_byte, struct btrfs_key *ins,
- u64 flags, int delalloc)
+ u64 ram_bytes, u64 num_bytes, u64 empty_size,
+ u64 hint_byte, struct btrfs_key *ins,
+ u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -7430,8 +7437,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
- int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
- RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -7763,8 +7768,8 @@ checks:
search_start - offset);
BUG_ON(offset > search_start);
- ret = btrfs_update_reserved_bytes(block_group, num_bytes,
- alloc_type, delalloc);
+ ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
+ num_bytes, delalloc);
if (ret == -EAGAIN) {
btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
@@ -7936,7 +7941,7 @@ again:
up_read(&info->groups_sem);
}
-int btrfs_reserve_extent(struct btrfs_root *root,
+int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
@@ -7948,8 +7953,8 @@ int btrfs_reserve_extent(struct btrfs_root *root,
flags = btrfs_get_alloc_profile(root, is_data);
again:
WARN_ON(num_bytes < root->sectorsize);
- ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
- flags, delalloc);
+ ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
+ hint_byte, ins, flags, delalloc);
if (!ret && !is_data) {
btrfs_dec_block_group_reservations(root->fs_info,
ins->objectid);
@@ -7958,6 +7963,7 @@ again:
num_bytes = min(num_bytes >> 1, ins->offset);
num_bytes = round_down(num_bytes, root->sectorsize);
num_bytes = max(num_bytes, min_alloc_size);
+ ram_bytes = num_bytes;
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
@@ -7995,7 +8001,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
if (btrfs_test_opt(root->fs_info, DISCARD))
ret = btrfs_discard_extent(root, start, len, NULL);
btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+ btrfs_free_reserved_bytes(cache, len, delalloc);
trace_btrfs_reserved_extent_free(root, start, len);
}
@@ -8208,6 +8214,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
+ struct btrfs_space_info *space_info;
/*
* Mixed block groups will exclude before processing the log so we only
@@ -8223,9 +8230,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
if (!block_group)
return -EINVAL;
- ret = btrfs_update_reserved_bytes(block_group, ins->offset,
- RESERVE_ALLOC_NO_ACCOUNT, 0);
- BUG_ON(ret); /* logic error */
+ space_info = block_group->space_info;
+ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ space_info->bytes_reserved += ins->offset;
+ block_group->reserved += ins->offset;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
btrfs_put_block_group(block_group);
@@ -8368,7 +8380,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
- ret = btrfs_reserve_extent(root, blocksize, blocksize,
+ ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
empty_size, hint, &ins, 0, 0);
if (ret)
goto out_unuse;
@@ -8521,35 +8533,6 @@ reada:
wc->reada_slot = slot;
}
-/*
- * These may not be seen by the usual inc/dec ref code so we have to
- * add them here.
- */
-static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes)
-{
- struct btrfs_qgroup_extent_record *qrecord;
- struct btrfs_delayed_ref_root *delayed_refs;
-
- qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
- if (!qrecord)
- return -ENOMEM;
-
- qrecord->bytenr = bytenr;
- qrecord->num_bytes = num_bytes;
- qrecord->old_roots = NULL;
-
- delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
- if (btrfs_qgroup_insert_dirty_extent(trans->fs_info,
- delayed_refs, qrecord))
- kfree(qrecord);
- spin_unlock(&delayed_refs->lock);
-
- return 0;
-}
-
static int account_leaf_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *eb)
@@ -8583,7 +8566,8 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
- ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+ ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+ bytenr, num_bytes, GFP_NOFS);
if (ret)
return ret;
}
@@ -8732,8 +8716,9 @@ walk_down:
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
- ret = record_one_subtree_extent(trans, root, child_bytenr,
- root->nodesize);
+ ret = btrfs_qgroup_insert_dirty_extent(trans,
+ root->fs_info, child_bytenr,
+ root->nodesize, GFP_NOFS);
if (ret)
goto out;
}
@@ -9906,6 +9891,7 @@ static int find_first_block_group(struct btrfs_root *root,
} else {
ret = 0;
}
+ free_extent_map(em);
goto out;
}
path->slots[0]++;
@@ -9942,6 +9928,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
block_group->iref = 0;
block_group->inode = NULL;
spin_unlock(&block_group->lock);
+ ASSERT(block_group->io_ctl.inode == NULL);
iput(inode);
last = block_group->key.objectid + block_group->key.offset;
btrfs_put_block_group(block_group);
@@ -9999,6 +9986,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
free_excluded_extents(info->extent_root, block_group);
btrfs_remove_free_space_cache(block_group);
+ ASSERT(list_empty(&block_group->dirty_list));
+ ASSERT(list_empty(&block_group->io_list));
+ ASSERT(list_empty(&block_group->bg_list));
+ ASSERT(atomic_read(&block_group->count) == 1);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bc2729a7612d..28cd88fccc7e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,6 +20,7 @@
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
#define EXTENT_QGROUP_RESERVED (1U << 16)
+#define EXTENT_CLEAR_DATA_RESV (1U << 17)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9404121fd5f7..fea31a4a6e36 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2033,6 +2033,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+ /*
+ * An ordered extent might have started before and completed
+ * already with io errors, in which case the inode was not
+ * updated and we end up here. So check the inode's mapping
+ * flags for any errors that might have happened while doing
+ * writeback of file data.
+ */
+ ret = btrfs_inode_check_errors(inode);
inode_unlock(inode);
goto out;
}
@@ -2062,7 +2070,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
}
trans->sync = true;
- btrfs_init_log_ctx(&ctx);
+ btrfs_init_log_ctx(&ctx, inode);
ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
if (ret < 0) {
@@ -2667,6 +2675,7 @@ static long btrfs_fallocate(struct file *file, int mode,
alloc_start = round_down(offset, blocksize);
alloc_end = round_up(offset + len, blocksize);
+ cur_offset = alloc_start;
/* Make sure we aren't being give some crap mode */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2759,7 +2768,6 @@ static long btrfs_fallocate(struct file *file, int mode,
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
- cur_offset = alloc_start;
while (1) {
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
@@ -2786,6 +2794,14 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte - cur_offset);
if (ret < 0)
break;
+ } else {
+ /*
+ * Do not need to reserve unwritten extent for this
+ * range, free reserved data space first, otherwise
+ * it'll result in false ENOSPC error.
+ */
+ btrfs_free_reserved_data_space(inode, cur_offset,
+ last_byte - cur_offset);
}
free_extent_map(em);
cur_offset = last_byte;
@@ -2803,6 +2819,9 @@ static long btrfs_fallocate(struct file *file, int mode,
range->start,
range->len, 1 << inode->i_blkbits,
offset + len, &alloc_hint);
+ else
+ btrfs_free_reserved_data_space(inode, range->start,
+ range->len);
list_del(&range->list);
kfree(range);
}
@@ -2837,18 +2856,11 @@ out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_KERNEL);
out:
- /*
- * As we waited the extent range, the data_rsv_map must be empty
- * in the range, as written data range will be released from it.
- * And for prealloacted extent, it will also be released when
- * its metadata is written.
- * So this is completely used as cleanup.
- */
- btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
inode_unlock(inode);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, alloc_start,
- alloc_end - alloc_start);
+ if (ret != 0)
+ btrfs_free_reserved_data_space(inode, alloc_start,
+ alloc_end - cur_offset);
return ret;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index aa6fabaee72e..359ee861b5a4 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -495,10 +495,9 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_space(inode, 0, prealloc);
+ btrfs_delalloc_release_metadata(inode, prealloc);
goto out_put;
}
- btrfs_free_reserved_data_space(inode, 0, prealloc);
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2f5975954ccf..e6811c42e41e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -566,6 +566,8 @@ cont:
PAGE_SET_WRITEBACK |
page_error_op |
PAGE_END_WRITEBACK);
+ btrfs_free_reserved_data_space_noquota(inode, start,
+ end - start + 1);
goto free_pages_out;
}
}
@@ -742,7 +744,7 @@ retry:
lock_extent(io_tree, async_extent->start,
async_extent->start + async_extent->ram_size - 1);
- ret = btrfs_reserve_extent(root,
+ ret = btrfs_reserve_extent(root, async_extent->ram_size,
async_extent->compressed_size,
async_extent->compressed_size,
0, alloc_hint, &ins, 1, 1);
@@ -969,7 +971,8 @@ static noinline int cow_file_range(struct inode *inode,
EXTENT_DEFRAG, PAGE_UNLOCK |
PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
PAGE_END_WRITEBACK);
-
+ btrfs_free_reserved_data_space_noquota(inode, start,
+ end - start + 1);
*nr_written = *nr_written +
(end - start + PAGE_SIZE) / PAGE_SIZE;
*page_started = 1;
@@ -989,7 +992,7 @@ static noinline int cow_file_range(struct inode *inode,
unsigned long op;
cur_alloc_size = disk_num_bytes;
- ret = btrfs_reserve_extent(root, cur_alloc_size,
+ ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
@@ -1489,8 +1492,10 @@ out_check:
extent_clear_unlock_delalloc(inode, cur_offset,
cur_offset + num_bytes - 1,
locked_page, EXTENT_LOCKED |
- EXTENT_DELALLOC, PAGE_UNLOCK |
- PAGE_SET_PRIVATE2);
+ EXTENT_DELALLOC |
+ EXTENT_CLEAR_DATA_RESV,
+ PAGE_UNLOCK | PAGE_SET_PRIVATE2);
+
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
cur_offset = extent_end;
@@ -1807,7 +1812,9 @@ static void btrfs_clear_bit_hook(struct inode *inode,
return;
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
- && do_list && !(state->state & EXTENT_NORESERVE))
+ && do_list && !(state->state & EXTENT_NORESERVE)
+ && (*bits & (EXTENT_DO_ACCOUNTING |
+ EXTENT_CLEAR_DATA_RESV)))
btrfs_free_reserved_data_space_noquota(inode,
state->start, len);
@@ -3435,10 +3442,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
found_key.offset = 0;
inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
ret = PTR_ERR_OR_ZERO(inode);
- if (ret && ret != -ESTALE)
+ if (ret && ret != -ENOENT)
goto out;
- if (ret == -ESTALE && root == root->fs_info->tree_root) {
+ if (ret == -ENOENT && root == root->fs_info->tree_root) {
struct btrfs_root *dead_root;
struct btrfs_fs_info *fs_info = root->fs_info;
int is_dead_root = 0;
@@ -3474,7 +3481,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
* Inode is already gone but the orphan item is still there,
* kill the orphan item.
*/
- if (ret == -ESTALE) {
+ if (ret == -ENOENT) {
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -3633,7 +3640,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
/*
* read an inode from the btree into the in-memory inode
*/
-static void btrfs_read_locked_inode(struct inode *inode)
+static int btrfs_read_locked_inode(struct inode *inode)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
@@ -3652,14 +3659,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
filled = true;
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ ret = -ENOMEM;
goto make_bad;
+ }
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
- if (ret)
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
goto make_bad;
+ }
leaf = path->nodes[0];
@@ -3812,11 +3824,12 @@ cache_acl:
}
btrfs_update_iflags(inode);
- return;
+ return 0;
make_bad:
btrfs_free_path(path);
make_bad_inode(inode);
+ return ret;
}
/*
@@ -4204,6 +4217,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
int err = 0;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_trans_handle *trans;
+ u64 last_unlink_trans;
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
@@ -4226,11 +4240,27 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
if (err)
goto out;
+ last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
+
/* now the directory is empty */
err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry),
dentry->d_name.name, dentry->d_name.len);
- if (!err)
+ if (!err) {
btrfs_i_size_write(inode, 0);
+ /*
+ * Propagate the last_unlink_trans value of the deleted dir to
+ * its parent directory. This is to prevent an unrecoverable
+ * log tree in the case we do something like this:
+ * 1) create dir foo
+ * 2) create snapshot under dir foo
+ * 3) delete the snapshot
+ * 4) rmdir foo
+ * 5) mkdir foo
+ * 6) fsync foo or some file inside foo
+ */
+ if (last_unlink_trans >= trans->transid)
+ BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
+ }
out:
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
@@ -5606,7 +5636,9 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
- btrfs_read_locked_inode(inode);
+ int ret;
+
+ ret = btrfs_read_locked_inode(inode);
if (!is_bad_inode(inode)) {
inode_tree_add(inode);
unlock_new_inode(inode);
@@ -5615,7 +5647,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
} else {
unlock_new_inode(inode);
iput(inode);
- inode = ERR_PTR(-ESTALE);
+ ASSERT(ret < 0);
+ inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
}
}
@@ -7225,7 +7258,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
int ret;
alloc_hint = get_extent_allocation_hint(inode, start, len);
- ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+ ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
@@ -7725,6 +7758,13 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
ret = PTR_ERR(em2);
goto unlock_err;
}
+ /*
+ * For inode marked NODATACOW or extent marked PREALLOC,
+ * use the existing or preallocated extent, so does not
+ * need to adjust btrfs_space_info's bytes_may_use.
+ */
+ btrfs_free_reserved_data_space_noquota(inode,
+ start, len);
goto unlock;
}
}
@@ -7759,7 +7799,6 @@ unlock:
i_size_write(inode, start + len);
adjust_dio_outstanding_extents(inode, dio_data, len);
- btrfs_free_reserved_data_space(inode, start, len);
WARN_ON(dio_data->reserve < len);
dio_data->reserve -= len;
dio_data->unsubmitted_oe_range_end = start + len;
@@ -10280,6 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 last_alloc = (u64)-1;
int ret = 0;
bool own_trans = true;
+ u64 end = start + num_bytes - 1;
if (trans)
own_trans = false;
@@ -10301,8 +10341,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
* sized chunks.
*/
cur_bytes = min(cur_bytes, last_alloc);
- ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
- *alloc_hint, &ins, 1, 0);
+ ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
+ min_size, 0, *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -10388,6 +10428,9 @@ next:
if (own_trans)
btrfs_end_transaction(trans, root);
}
+ if (cur_offset < end)
+ btrfs_free_reserved_data_space(inode, cur_offset,
+ end - cur_offset + 1);
return ret;
}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 14ed1e9e6bc8..7fd939bfbd99 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1634,6 +1634,9 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
int namelen;
int ret = 0;
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
ret = mnt_want_write_file(file);
if (ret)
goto out;
@@ -1691,6 +1694,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
struct btrfs_ioctl_vol_args *vol_args;
int ret;
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
@@ -1714,6 +1720,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
bool readonly = false;
struct btrfs_qgroup_inherit *inherit = NULL;
+ if (!S_ISDIR(file_inode(file)->i_mode))
+ return -ENOTDIR;
+
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
@@ -2357,6 +2366,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
int ret;
int err = 0;
+ if (!S_ISDIR(dir->i_mode))
+ return -ENOTDIR;
+
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
@@ -5084,7 +5096,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- return btrfs_qgroup_wait_for_completion(root->fs_info);
+ return btrfs_qgroup_wait_for_completion(root->fs_info, true);
}
static long _btrfs_ioctl_set_received_subvol(struct file *file,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 93ee1c18ef9d..8db2e29fdcf4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -995,7 +995,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
goto out;
fs_info->quota_enabled = 0;
fs_info->pending_quota_state = 0;
- btrfs_qgroup_wait_for_completion(fs_info);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
@@ -1453,10 +1453,9 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
return ret;
}
-struct btrfs_qgroup_extent_record *
-btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record)
+int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
{
struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
struct rb_node *parent_node = NULL;
@@ -1475,12 +1474,42 @@ btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
else if (bytenr > entry->bytenr)
p = &(*p)->rb_right;
else
- return entry;
+ return 1;
}
rb_link_node(&record->node, parent_node, p);
rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
- return NULL;
+ return 0;
+}
+
+int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
+ gfp_t gfp_flag)
+{
+ struct btrfs_qgroup_extent_record *record;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ if (!fs_info->quota_enabled || bytenr == 0 || num_bytes == 0)
+ return 0;
+ if (WARN_ON(trans == NULL))
+ return -EINVAL;
+ record = kmalloc(sizeof(*record), gfp_flag);
+ if (!record)
+ return -ENOMEM;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ record->bytenr = bytenr;
+ record->num_bytes = num_bytes;
+ record->old_roots = NULL;
+
+ spin_lock(&delayed_refs->lock);
+ ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs,
+ record);
+ spin_unlock(&delayed_refs->lock);
+ if (ret > 0)
+ kfree(record);
+ return 0;
}
#define UPDATE_NEW 0
@@ -2303,6 +2332,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
int err = -ENOMEM;
int ret = 0;
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
path = btrfs_alloc_path();
if (!path)
goto out;
@@ -2369,6 +2402,9 @@ out:
}
done:
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = false;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
complete_all(&fs_info->qgroup_rescan_completion);
}
@@ -2487,20 +2523,26 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info)
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+ bool interruptible)
{
int running;
int ret = 0;
mutex_lock(&fs_info->qgroup_rescan_lock);
spin_lock(&fs_info->qgroup_lock);
- running = fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ running = fs_info->qgroup_rescan_running;
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- if (running)
+ if (!running)
+ return 0;
+
+ if (interruptible)
ret = wait_for_completion_interruptible(
&fs_info->qgroup_rescan_completion);
+ else
+ wait_for_completion(&fs_info->qgroup_rescan_completion);
return ret;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 710887c06aaf..1bc64c864b62 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -46,7 +46,8 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
+ bool interruptible);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
@@ -63,10 +64,35 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-struct btrfs_qgroup_extent_record *
-btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record);
+/*
+ * Insert one dirty extent record into @delayed_refs, informing qgroup to
+ * account that extent at commit trans time.
+ *
+ * No lock version, caller must acquire delayed ref lock and allocate memory.
+ *
+ * Return 0 for success insert
+ * Return >0 for existing record, caller can free @record safely.
+ * Error is not possible
+ */
+int btrfs_qgroup_insert_dirty_extent_nolock(
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
+
+/*
+ * Insert one dirty extent record into @delayed_refs, informing qgroup to
+ * account that extent at commit trans time.
+ *
+ * Better encapsulated version.
+ *
+ * Return 0 if the operation is done.
+ * Return <0 for error, like memory allocation failure or invalid parameter
+ * (NULL trans)
+ */
+int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
+ gfp_t gfp_flag);
+
int
btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b26a5aea41b4..c0c13dc6fe12 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -31,6 +31,7 @@
#include "async-thread.h"
#include "free-space-cache.h"
#include "inode-map.h"
+#include "qgroup.h"
/*
* backref_node, mapping_node and tree_block start with this
@@ -3037,15 +3038,19 @@ int prealloc_file_extent_cluster(struct inode *inode,
u64 num_bytes;
int nr = 0;
int ret = 0;
+ u64 prealloc_start = cluster->start - offset;
+ u64 prealloc_end = cluster->end - offset;
+ u64 cur_offset;
BUG_ON(cluster->start != cluster->boundary[0]);
inode_lock(inode);
- ret = btrfs_check_data_free_space(inode, cluster->start,
- cluster->end + 1 - cluster->start);
+ ret = btrfs_check_data_free_space(inode, prealloc_start,
+ prealloc_end + 1 - prealloc_start);
if (ret)
goto out;
+ cur_offset = prealloc_start;
while (nr < cluster->nr) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
@@ -3055,16 +3060,21 @@ int prealloc_file_extent_cluster(struct inode *inode,
lock_extent(&BTRFS_I(inode)->io_tree, start, end);
num_bytes = end + 1 - start;
+ if (cur_offset < start)
+ btrfs_free_reserved_data_space(inode, cur_offset,
+ start - cur_offset);
ret = btrfs_prealloc_file_range(inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
+ cur_offset = end + 1;
unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
if (ret)
break;
nr++;
}
- btrfs_free_reserved_data_space(inode, cluster->start,
- cluster->end + 1 - cluster->start);
+ if (cur_offset < prealloc_end)
+ btrfs_free_reserved_data_space(inode, cur_offset,
+ prealloc_end + 1 - cur_offset);
out:
inode_unlock(inode);
return ret;
@@ -3916,6 +3926,90 @@ int prepare_to_relocate(struct reloc_control *rc)
return 0;
}
+/*
+ * Qgroup fixer for data chunk relocation.
+ * The data relocation is done in the following steps
+ * 1) Copy data extents into data reloc tree
+ * 2) Create tree reloc tree(special snapshot) for related subvolumes
+ * 3) Modify file extents in tree reloc tree
+ * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks
+ *
+ * The problem is, data and tree reloc tree are not accounted to qgroup,
+ * and 4) will only info qgroup to track tree blocks change, not file extents
+ * in the tree blocks.
+ *
+ * The good news is, related data extents are all in data reloc tree, so we
+ * only need to info qgroup to track all file extents in data reloc tree
+ * before commit trans.
+ */
+static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc)
+{
+ struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+ struct inode *inode = rc->data_inode;
+ struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ /*
+ * Only for stage where we update data pointers the qgroup fix is
+ * valid.
+ * For MOVING_DATA stage, we will miss the timing of swapping tree
+ * blocks, and won't fix it.
+ */
+ if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found))
+ return 0;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1);
+ while (1) {
+ struct btrfs_file_extent_item *fi;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid > btrfs_ino(inode))
+ break;
+ if (key.type != BTRFS_EXTENT_DATA_KEY)
+ goto next;
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(path->nodes[0], fi) !=
+ BTRFS_FILE_EXTENT_REG)
+ goto next;
+ ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info,
+ btrfs_file_extent_disk_bytenr(path->nodes[0], fi),
+ btrfs_file_extent_disk_num_bytes(path->nodes[0], fi),
+ GFP_NOFS);
+ if (ret < 0)
+ break;
+next:
+ ret = btrfs_next_item(data_reloc_root, path);
+ if (ret < 0)
+ break;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1);
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
@@ -4102,10 +4196,18 @@ restart:
/* get rid of pinned extents */
trans = btrfs_join_transaction(rc->extent_root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- else
- btrfs_commit_transaction(trans, rc->extent_root);
+ goto out_free;
+ }
+ ret = qgroup_fix_relocated_data_extents(trans, rc);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ if (!err)
+ err = ret;
+ goto out_free;
+ }
+ btrfs_commit_transaction(trans, rc->extent_root);
out_free:
btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
btrfs_free_path(path);
@@ -4468,10 +4570,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
unset_reloc_control(rc);
trans = btrfs_join_transaction(rc->extent_root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- else
- err = btrfs_commit_transaction(trans, rc->extent_root);
+ goto out_free;
+ }
+ err = qgroup_fix_relocated_data_extents(trans, rc);
+ if (err < 0) {
+ btrfs_abort_transaction(trans, err);
+ goto out_free;
+ }
+ err = btrfs_commit_transaction(trans, rc->extent_root);
out_free:
kfree(rc);
out:
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 7fd7e1830cfe..091296062456 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -272,6 +272,23 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid = key.offset;
key.offset++;
+ /*
+ * The root might have been inserted already, as before we look
+ * for orphan roots, log replay might have happened, which
+ * triggers a transaction commit and qgroup accounting, which
+ * in turn reads and inserts fs roots while doing backref
+ * walking.
+ */
+ root = btrfs_lookup_fs_root(tree_root->fs_info,
+ root_key.objectid);
+ if (root) {
+ WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ &root->state));
+ if (btrfs_root_refs(&root->root_item) == 0)
+ btrfs_add_dead_root(root);
+ continue;
+ }
+
root = btrfs_read_fs_root(tree_root, &root_key);
err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
@@ -310,16 +327,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
- /*
- * The root might have been inserted already, as before we look
- * for orphan roots, log replay might have happened, which
- * triggers a transaction commit and qgroup accounting, which
- * in turn reads and inserts fs roots while doing backref
- * walking.
- */
- if (err == -EEXIST)
- err = 0;
if (err) {
+ BUG_ON(err == -EEXIST);
btrfs_free_fs_root(root);
break;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index b71dd298385c..1379e59277e2 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -231,7 +231,6 @@ struct pending_dir_move {
u64 parent_ino;
u64 ino;
u64 gen;
- bool is_orphan;
struct list_head update_refs;
};
@@ -274,6 +273,39 @@ struct name_cache_entry {
char name[];
};
+static void inconsistent_snapshot_error(struct send_ctx *sctx,
+ enum btrfs_compare_tree_result result,
+ const char *what)
+{
+ const char *result_string;
+
+ switch (result) {
+ case BTRFS_COMPARE_TREE_NEW:
+ result_string = "new";
+ break;
+ case BTRFS_COMPARE_TREE_DELETED:
+ result_string = "deleted";
+ break;
+ case BTRFS_COMPARE_TREE_CHANGED:
+ result_string = "updated";
+ break;
+ case BTRFS_COMPARE_TREE_SAME:
+ ASSERT(0);
+ result_string = "unchanged";
+ break;
+ default:
+ ASSERT(0);
+ result_string = "unexpected";
+ }
+
+ btrfs_err(sctx->send_root->fs_info,
+ "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
+ result_string, what, sctx->cmp_key->objectid,
+ sctx->send_root->root_key.objectid,
+ (sctx->parent_root ?
+ sctx->parent_root->root_key.objectid : 0));
+}
+
static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
static struct waiting_dir_move *
@@ -1861,7 +1893,8 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
* was already unlinked/moved, so we can safely assume that we will not
* overwrite anything at this point in time.
*/
- if (other_inode > sctx->send_progress) {
+ if (other_inode > sctx->send_progress ||
+ is_waiting_for_move(sctx, other_inode)) {
ret = get_inode_info(sctx->parent_root, other_inode, NULL,
who_gen, NULL, NULL, NULL, NULL);
if (ret < 0)
@@ -2502,6 +2535,8 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret > 0)
+ ret = -ENOENT;
if (ret < 0)
goto out;
@@ -2947,6 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
}
if (loc.objectid > send_progress) {
+ struct orphan_dir_info *odi;
+
+ odi = get_orphan_dir_info(sctx, dir);
+ free_orphan_dir_info(sctx, odi);
ret = 0;
goto out;
}
@@ -3047,7 +3086,6 @@ static int add_pending_dir_move(struct send_ctx *sctx,
pm->parent_ino = parent_ino;
pm->ino = ino;
pm->gen = ino_gen;
- pm->is_orphan = is_orphan;
INIT_LIST_HEAD(&pm->list);
INIT_LIST_HEAD(&pm->update_refs);
RB_CLEAR_NODE(&pm->node);
@@ -3113,6 +3151,48 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
return NULL;
}
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+ u64 ino, u64 gen, u64 *ancestor_ino)
+{
+ int ret = 0;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ u64 start_ino = ino;
+
+ *ancestor_ino = 0;
+ while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino))
+ break;
+ if (is_waiting_for_move(sctx, ino)) {
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret < 0)
+ break;
+ if (parent_inode == start_ino) {
+ ret = 1;
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ break;
+ }
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+ return ret;
+}
+
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
struct fs_path *from_path = NULL;
@@ -3123,6 +3203,8 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
u64 parent_ino, parent_gen;
struct waiting_dir_move *dm = NULL;
u64 rmdir_ino = 0;
+ u64 ancestor;
+ bool is_orphan;
int ret;
name = fs_path_alloc();
@@ -3135,9 +3217,10 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
dm = get_waiting_dir_move(sctx, pm->ino);
ASSERT(dm);
rmdir_ino = dm->rmdir_ino;
+ is_orphan = dm->orphanized;
free_waiting_dir_move(sctx, dm);
- if (pm->is_orphan) {
+ if (is_orphan) {
ret = gen_unique_name(sctx, pm->ino,
pm->gen, from_path);
} else {
@@ -3155,6 +3238,24 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
goto out;
sctx->send_progress = sctx->cur_ino + 1;
+ ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+ if (ret < 0)
+ goto out;
+ if (ret) {
+ LIST_HEAD(deleted_refs);
+ ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+ ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+ &pm->update_refs, &deleted_refs,
+ is_orphan);
+ if (ret < 0)
+ goto out;
+ if (rmdir_ino) {
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ dm->rmdir_ino = rmdir_ino;
+ }
+ goto out;
+ }
fs_path_reset(name);
to_path = name;
name = NULL;
@@ -3174,7 +3275,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
/* already deleted */
goto finish;
}
- ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+ ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino);
if (ret < 0)
goto out;
if (!ret)
@@ -3204,8 +3305,18 @@ finish:
* and old parent(s).
*/
list_for_each_entry(cur, &pm->update_refs, list) {
- if (cur->dir == rmdir_ino)
+ /*
+ * The parent inode might have been deleted in the send snapshot
+ */
+ ret = get_inode_info(sctx->send_root, cur->dir, NULL,
+ NULL, NULL, NULL, NULL, NULL);
+ if (ret == -ENOENT) {
+ ret = 0;
continue;
+ }
+ if (ret < 0)
+ goto out;
+
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
@@ -3325,6 +3436,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
u64 left_gen;
u64 right_gen;
int ret = 0;
+ struct waiting_dir_move *wdm;
if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
return 0;
@@ -3383,7 +3495,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- if (is_waiting_for_move(sctx, di_key.objectid)) {
+ wdm = get_waiting_dir_move(sctx, di_key.objectid);
+ if (wdm && !wdm->orphanized) {
ret = add_pending_dir_move(sctx,
sctx->cur_ino,
sctx->cur_inode_gen,
@@ -3470,7 +3583,8 @@ static int wait_for_parent_move(struct send_ctx *sctx,
ret = is_ancestor(sctx->parent_root,
sctx->cur_ino, sctx->cur_inode_gen,
ino, path_before);
- break;
+ if (ret)
+ break;
}
fs_path_reset(path_before);
@@ -3643,11 +3757,26 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
goto out;
if (ret) {
struct name_cache_entry *nce;
+ struct waiting_dir_move *wdm;
ret = orphanize_inode(sctx, ow_inode, ow_gen,
cur->full_path);
if (ret < 0)
goto out;
+
+ /*
+ * If ow_inode has its rename operation delayed
+ * make sure that its orphanized name is used in
+ * the source path when performing its rename
+ * operation.
+ */
+ if (is_waiting_for_move(sctx, ow_inode)) {
+ wdm = get_waiting_dir_move(sctx,
+ ow_inode);
+ ASSERT(wdm);
+ wdm->orphanized = true;
+ }
+
/*
* Make sure we clear our orphanized inode's
* name from the name cache. This is because the
@@ -3663,6 +3792,19 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
name_cache_delete(sctx, nce);
kfree(nce);
}
+
+ /*
+ * ow_inode might currently be an ancestor of
+ * cur_ino, therefore compute valid_path (the
+ * current path of cur_ino) again because it
+ * might contain the pre-orphanization name of
+ * ow_inode, which is no longer valid.
+ */
+ fs_path_reset(valid_path);
+ ret = get_cur_path(sctx, sctx->cur_ino,
+ sctx->cur_inode_gen, valid_path);
+ if (ret < 0)
+ goto out;
} else {
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
@@ -4126,10 +4268,12 @@ static int process_all_refs(struct send_ctx *sctx,
}
btrfs_release_path(path);
+ /*
+ * We don't actually care about pending_move as we are simply
+ * re-creating this inode and will be rename'ing it into place once we
+ * rename the parent directory.
+ */
ret = process_recorded_refs(sctx, &pending_move);
- /* Only applicable to an incremental send. */
- ASSERT(pending_move == 0);
-
out:
btrfs_free_path(path);
return ret;
@@ -4185,7 +4329,7 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
int ret;
struct send_ctx *sctx = ctx;
struct fs_path *p;
- posix_acl_xattr_header dummy_acl;
+ struct posix_acl_xattr_header dummy_acl;
p = fs_path_alloc();
if (!p)
@@ -5602,7 +5746,10 @@ static int changed_ref(struct send_ctx *sctx,
{
int ret = 0;
- BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+ if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ inconsistent_snapshot_error(sctx, result, "reference");
+ return -EIO;
+ }
if (!sctx->cur_inode_new_gen &&
sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
@@ -5627,7 +5774,10 @@ static int changed_xattr(struct send_ctx *sctx,
{
int ret = 0;
- BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+ if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ inconsistent_snapshot_error(sctx, result, "xattr");
+ return -EIO;
+ }
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result == BTRFS_COMPARE_TREE_NEW)
@@ -5651,7 +5801,10 @@ static int changed_extent(struct send_ctx *sctx,
{
int ret = 0;
- BUG_ON(sctx->cur_ino != sctx->cmp_key->objectid);
+ if (sctx->cur_ino != sctx->cmp_key->objectid) {
+ inconsistent_snapshot_error(sctx, result, "extent");
+ return -EIO;
+ }
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result != BTRFS_COMPARE_TREE_DELETED)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 864ce334f696..4071fe2bd098 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2241,6 +2241,13 @@ static int btrfs_freeze(struct super_block *sb)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+ root->fs_info->fs_frozen = 1;
+ /*
+ * We don't need a barrier here, we'll wait for any transaction that
+ * could be in progress on other threads (and do delayed iputs that
+ * we want to avoid on a frozen filesystem), or do the commit
+ * ourselves.
+ */
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
/* no transaction, don't bother */
@@ -2251,6 +2258,14 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans, root);
}
+static int btrfs_unfreeze(struct super_block *sb)
+{
+ struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+ root->fs_info->fs_frozen = 0;
+ return 0;
+}
+
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2299,6 +2314,7 @@ static const struct super_operations btrfs_super_ops = {
.statfs = btrfs_statfs,
.remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
+ .unfreeze_fs = btrfs_unfreeze,
};
static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9cca0a721961..95d41919d034 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2278,8 +2278,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ /*
+ * If fs has been frozen, we can not handle delayed iputs, otherwise
+ * it'll result in deadlock about SB_FREEZE_FS.
+ */
if (current != root->fs_info->transaction_kthread &&
- current != root->fs_info->cleaner_kthread)
+ current != root->fs_info->cleaner_kthread &&
+ !root->fs_info->fs_frozen)
btrfs_run_delayed_iputs(root);
return ret;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d31a0c4f56be..ef9c55bc7907 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -27,6 +27,7 @@
#include "backref.h"
#include "hash.h"
#include "compression.h"
+#include "qgroup.h"
/* magic values for the inode_only field in btrfs_log_inode:
*
@@ -680,6 +681,21 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
offset = key->offset - btrfs_file_extent_offset(eb, item);
+ /*
+ * Manually record dirty extent, as here we did a shallow
+ * file extent item copy and skip normal backref update,
+ * but modifying extent tree all by ourselves.
+ * So need to manually record dirty extent for qgroup,
+ * as the owner of the file extent changed from log tree
+ * (doesn't affect qgroup) to fs/file tree(affects qgroup)
+ */
+ ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
+ btrfs_file_extent_disk_bytenr(eb, item),
+ btrfs_file_extent_disk_num_bytes(eb, item),
+ GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
if (ins.objectid > 0) {
u64 csum_start;
u64 csum_end;
@@ -2807,7 +2823,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
- btrfs_init_log_ctx(&root_log_ctx);
+ btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
atomic_inc(&log_root_tree->log_batch);
@@ -2851,6 +2867,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
blk_finish_plug(&plug);
+ list_del_init(&root_log_ctx.list);
mutex_unlock(&log_root_tree->log_mutex);
ret = root_log_ctx.log_ret;
goto out;
@@ -4469,7 +4486,8 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
static int btrfs_check_ref_name_override(struct extent_buffer *eb,
const int slot,
const struct btrfs_key *key,
- struct inode *inode)
+ struct inode *inode,
+ u64 *other_ino)
{
int ret;
struct btrfs_path *search_path;
@@ -4528,7 +4546,16 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
search_path, parent,
name, this_name_len, 0);
if (di && !IS_ERR(di)) {
- ret = 1;
+ struct btrfs_key di_key;
+
+ btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+ di, &di_key);
+ if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+ ret = 1;
+ *other_ino = di_key.objectid;
+ } else {
+ ret = -EAGAIN;
+ }
goto out;
} else if (IS_ERR(di)) {
ret = PTR_ERR(di);
@@ -4722,16 +4749,72 @@ again:
if ((min_key.type == BTRFS_INODE_REF_KEY ||
min_key.type == BTRFS_INODE_EXTREF_KEY) &&
BTRFS_I(inode)->generation == trans->transid) {
+ u64 other_ino = 0;
+
ret = btrfs_check_ref_name_override(path->nodes[0],
path->slots[0],
- &min_key, inode);
+ &min_key, inode,
+ &other_ino);
if (ret < 0) {
err = ret;
goto out_unlock;
- } else if (ret > 0) {
- err = 1;
- btrfs_set_log_full_commit(root->fs_info, trans);
- goto out_unlock;
+ } else if (ret > 0 && ctx &&
+ other_ino != btrfs_ino(ctx->inode)) {
+ struct btrfs_key inode_key;
+ struct inode *other_inode;
+
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+ }
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
+ ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
+ ins_nr = 0;
+ btrfs_release_path(path);
+ inode_key.objectid = other_ino;
+ inode_key.type = BTRFS_INODE_ITEM_KEY;
+ inode_key.offset = 0;
+ other_inode = btrfs_iget(root->fs_info->sb,
+ &inode_key, root,
+ NULL);
+ /*
+ * If the other inode that had a conflicting dir
+ * entry was deleted in the current transaction,
+ * we don't need to do more work nor fallback to
+ * a transaction commit.
+ */
+ if (IS_ERR(other_inode) &&
+ PTR_ERR(other_inode) == -ENOENT) {
+ goto next_key;
+ } else if (IS_ERR(other_inode)) {
+ err = PTR_ERR(other_inode);
+ goto out_unlock;
+ }
+ /*
+ * We are safe logging the other inode without
+ * acquiring its i_mutex as long as we log with
+ * the LOG_INODE_EXISTS mode. We're safe against
+ * concurrent renames of the other inode as well
+ * because during a rename we pin the log and
+ * update the log with the new name before we
+ * unpin it.
+ */
+ err = btrfs_log_inode(trans, root, other_inode,
+ LOG_INODE_EXISTS,
+ 0, LLONG_MAX, ctx);
+ iput(other_inode);
+ if (err)
+ goto out_unlock;
+ else
+ goto next_key;
}
}
@@ -4799,7 +4882,7 @@ next_slot:
ins_nr = 0;
}
btrfs_release_path(path);
-
+next_key:
if (min_key.offset < (u64)-1) {
min_key.offset++;
} else if (min_key.type < max_key.type) {
@@ -4993,8 +5076,12 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
break;
- if (IS_ROOT(parent))
+ if (IS_ROOT(parent)) {
+ inode = d_inode(parent);
+ if (btrfs_must_commit_transaction(trans, inode))
+ ret = 1;
break;
+ }
parent = dget_parent(parent);
dput(old_parent);
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index a9f1b75d080d..ab858e31ccbc 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -30,15 +30,18 @@ struct btrfs_log_ctx {
int log_transid;
int io_err;
bool log_new_dentries;
+ struct inode *inode;
struct list_head list;
};
-static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
+ struct inode *inode)
{
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->io_err = 0;
ctx->log_new_dentries = false;
+ ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 51f125508771..035efce603a9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -834,10 +834,6 @@ static void __free_device(struct work_struct *work)
struct btrfs_device *device;
device = container_of(work, struct btrfs_device, rcu_work);
-
- if (device->bdev)
- blkdev_put(device->bdev, device->mode);
-
rcu_string_free(device->name);
kfree(device);
}
@@ -852,6 +848,17 @@ static void free_device(struct rcu_head *head)
schedule_work(&device->rcu_work);
}
+static void btrfs_close_bdev(struct btrfs_device *device)
+{
+ if (device->bdev && device->writeable) {
+ sync_blockdev(device->bdev);
+ invalidate_bdev(device->bdev);
+ }
+
+ if (device->bdev)
+ blkdev_put(device->bdev, device->mode);
+}
+
static void btrfs_close_one_device(struct btrfs_device *device)
{
struct btrfs_fs_devices *fs_devices = device->fs_devices;
@@ -870,10 +877,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->missing)
fs_devices->missing_devices--;
- if (device->bdev && device->writeable) {
- sync_blockdev(device->bdev);
- invalidate_bdev(device->bdev);
- }
+ btrfs_close_bdev(device);
new_device = btrfs_alloc_device(NULL, &device->devid,
device->uuid);
@@ -1932,6 +1936,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
}
+ btrfs_close_bdev(device);
+
call_rcu(&device->rcu, free_device);
num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
@@ -2025,6 +2031,9 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
/* zero out the old super if it is writable */
btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
}
+
+ btrfs_close_bdev(srcdev);
+
call_rcu(&srcdev->rcu, free_device);
/*
@@ -2080,6 +2089,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
* the device_list_mutex lock.
*/
btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
+
+ btrfs_close_bdev(tgtdev);
call_rcu(&tgtdev->rcu, free_device);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 9c8eb9b6db6a..7dad8713fac8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1078,7 +1078,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}
-struct buffer_head *
+static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
unsigned size, gfp_t gfp)
{
@@ -1109,7 +1109,6 @@ __getblk_slow(struct block_device *bdev, sector_t block,
free_more_memory();
}
}
-EXPORT_SYMBOL(__getblk_slow);
/*
* The relationship between dirty buffers and dirty pages:
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index ce5f345d70f5..e7f16a77a22a 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -253,6 +253,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
struct cachefiles_object *object;
struct cachefiles_cache *cache;
const struct cred *saved_cred;
+ struct inode *inode;
+ blkcnt_t i_blocks = 0;
ASSERT(_object);
@@ -279,6 +281,10 @@ static void cachefiles_drop_object(struct fscache_object *_object)
_object != cache->cache.fsdef
) {
_debug("- retire object OBJ%x", object->fscache.debug_id);
+ inode = d_backing_inode(object->dentry);
+ if (inode)
+ i_blocks = inode->i_blocks;
+
cachefiles_begin_secure(cache, &saved_cred);
cachefiles_delete_object(cache, object);
cachefiles_end_secure(cache, saved_cred);
@@ -292,7 +298,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
/* note that the object is now inactive */
if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
- cachefiles_mark_object_inactive(cache, object);
+ cachefiles_mark_object_inactive(cache, object, i_blocks);
dput(object->dentry);
object->dentry = NULL;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 2fcde1a34b7c..cd1effee8a49 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -160,7 +160,8 @@ extern char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type);
* namei.c
*/
extern void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
- struct cachefiles_object *object);
+ struct cachefiles_object *object,
+ blkcnt_t i_blocks);
extern int cachefiles_delete_object(struct cachefiles_cache *cache,
struct cachefiles_object *object);
extern int cachefiles_walk_to_object(struct cachefiles_object *parent,
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 3f7c2cd41f8f..c6ee4b5fb7e6 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -261,10 +261,9 @@ requeue:
* Mark an object as being inactive.
*/
void cachefiles_mark_object_inactive(struct cachefiles_cache *cache,
- struct cachefiles_object *object)
+ struct cachefiles_object *object,
+ blkcnt_t i_blocks)
{
- blkcnt_t i_blocks = d_backing_inode(object->dentry)->i_blocks;
-
write_lock(&cache->active_lock);
rb_erase(&object->active_node, &cache->active_nodes);
clear_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags);
@@ -707,7 +706,8 @@ mark_active_timed_out:
check_error:
_debug("check error %d", ret);
- cachefiles_mark_object_inactive(cache, object);
+ cachefiles_mark_object_inactive(
+ cache, object, d_backing_inode(object->dentry)->i_blocks);
release_dentry:
dput(object->dentry);
object->dentry = NULL;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 99115cae1652..16e6ded0b7f2 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1347,9 +1347,12 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
{
struct inode *inode = &ci->vfs_inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = *psession;
+ struct ceph_mds_session *session = NULL;
int mds;
+
dout("ceph_flush_snaps %p\n", inode);
+ if (psession)
+ session = *psession;
retry:
spin_lock(&ci->i_ceph_lock);
if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c64a0b794d49..df4b3e6fa563 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -597,7 +597,7 @@ static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos)
if (is_hash_order(new_pos)) {
/* no need to reset last_name for a forward seek when
* dentries are sotred in hash order */
- } else if (fi->frag |= fpos_frag(new_pos)) {
+ } else if (fi->frag != fpos_frag(new_pos)) {
return true;
}
rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fa59a85226b2..f72d4ae303b2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2759,6 +2759,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
} else {
path = NULL;
pathlen = 0;
+ pathbase = 0;
}
spin_lock(&ci->i_ceph_lock);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 6bbec5e784cd..14ae4b8e1a3c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -609,6 +609,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
char *s, *p;
char sep;
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+ return dget(sb->s_root);
+
full_path = cifs_build_path_to_root(vol, cifs_sb,
cifs_sb_master_tcon(cifs_sb));
if (full_path == NULL)
@@ -686,26 +689,22 @@ cifs_do_mount(struct file_system_type *fs_type,
cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL);
if (cifs_sb->mountdata == NULL) {
root = ERR_PTR(-ENOMEM);
- goto out_cifs_sb;
+ goto out_free;
}
- if (volume_info->prepath) {
- cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL);
- if (cifs_sb->prepath == NULL) {
- root = ERR_PTR(-ENOMEM);
- goto out_cifs_sb;
- }
+ rc = cifs_setup_cifs_sb(volume_info, cifs_sb);
+ if (rc) {
+ root = ERR_PTR(rc);
+ goto out_free;
}
- cifs_setup_cifs_sb(volume_info, cifs_sb);
-
rc = cifs_mount(cifs_sb, volume_info);
if (rc) {
if (!(flags & MS_SILENT))
cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
rc);
root = ERR_PTR(rc);
- goto out_mountdata;
+ goto out_free;
}
mnt_data.vol = volume_info;
@@ -735,11 +734,7 @@ cifs_do_mount(struct file_system_type *fs_type,
sb->s_flags |= MS_ACTIVE;
}
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
- root = dget(sb->s_root);
- else
- root = cifs_get_root(volume_info, sb);
-
+ root = cifs_get_root(volume_info, sb);
if (IS_ERR(root))
goto out_super;
@@ -752,9 +747,9 @@ out:
cifs_cleanup_volume_info(volume_info);
return root;
-out_mountdata:
+out_free:
+ kfree(cifs_sb->prepath);
kfree(cifs_sb->mountdata);
-out_cifs_sb:
kfree(cifs_sb);
out_nls:
unload_nls(volume_info->local_nls);
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1243bd326591..4ead72a001f9 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -184,7 +184,7 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read);
extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
struct page *page, unsigned int to_read);
-extern void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+extern int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
struct cifs_sb_info *cifs_sb);
extern int cifs_match_super(struct super_block *, void *);
extern void cifs_cleanup_volume_info(struct smb_vol *pvolume_info);
@@ -392,8 +392,7 @@ extern int CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, char **buf,
int *return_buf_type);
extern int CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
- unsigned int *nbytes, const char *buf,
- const char __user *ubuf, const int long_op);
+ unsigned int *nbytes, const char *buf);
extern int CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, struct kvec *iov, const int nvec);
extern int CIFSGetSrvInodeNumber(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d47197ea4ab6..f82d2823622f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1228,7 +1228,6 @@ OldOpenRetry:
inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
- /* long_op set to 1 to allow for oplock break timeouts */
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *)pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
@@ -1768,8 +1767,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
int
CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
- unsigned int *nbytes, const char *buf,
- const char __user *ubuf, const int long_op)
+ unsigned int *nbytes, const char *buf)
{
int rc = -EACCES;
WRITE_REQ *pSMB = NULL;
@@ -1838,12 +1836,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
if (buf)
memcpy(pSMB->Data, buf, bytes_sent);
- else if (ubuf) {
- if (copy_from_user(pSMB->Data, ubuf, bytes_sent)) {
- cifs_buf_release(pSMB);
- return -EFAULT;
- }
- } else if (count != 0) {
+ else if (count != 0) {
/* No buffer */
cifs_buf_release(pSMB);
return -EINVAL;
@@ -1867,7 +1860,7 @@ CIFSSMBWrite(const unsigned int xid, struct cifs_io_parms *io_parms,
}
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *) pSMBr, &bytes_returned, long_op);
+ (struct smb_hdr *) pSMBr, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
if (rc) {
cifs_dbg(FYI, "Send error in write = %d\n", rc);
@@ -3334,7 +3327,7 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
#ifdef CONFIG_CIFS_POSIX
/*Convert an Access Control Entry from wire format to local POSIX xattr format*/
-static void cifs_convert_ace(posix_acl_xattr_entry *ace,
+static void cifs_convert_ace(struct posix_acl_xattr_entry *ace,
struct cifs_posix_ace *cifs_ace)
{
/* u8 cifs fields do not need le conversion */
@@ -3358,7 +3351,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
__u16 count;
struct cifs_posix_ace *pACE;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)src;
- posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)trgt;
+ struct posix_acl_xattr_header *local_acl = (void *)trgt;
if (le16_to_cpu(cifs_acl->version) != CIFS_ACL_VERSION)
return -EOPNOTSUPP;
@@ -3396,9 +3389,11 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
} else if (size > buflen) {
return -ERANGE;
} else /* buffer big enough */ {
+ struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1);
+
local_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
for (i = 0; i < count ; i++) {
- cifs_convert_ace(&local_acl->a_entries[i], pACE);
+ cifs_convert_ace(&ace[i], pACE);
pACE++;
}
}
@@ -3406,7 +3401,7 @@ static int cifs_copy_posix_acl(char *trgt, char *src, const int buflen,
}
static __u16 convert_ace_to_cifs_ace(struct cifs_posix_ace *cifs_ace,
- const posix_acl_xattr_entry *local_ace)
+ const struct posix_acl_xattr_entry *local_ace)
{
__u16 rc = 0; /* 0 = ACL converted ok */
@@ -3431,7 +3426,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
{
__u16 rc = 0;
struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data;
- posix_acl_xattr_header *local_acl = (posix_acl_xattr_header *)pACL;
+ struct posix_acl_xattr_header *local_acl = (void *)pACL;
int count;
int i;
@@ -3459,7 +3454,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
}
for (i = 0; i < count; i++) {
rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i],
- &local_acl->a_entries[i]);
+ (struct posix_acl_xattr_entry *)(local_acl + 1));
if (rc != 0) {
/* ACE not converted */
break;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7ae03283bd61..2e4f4bad8b1e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2781,6 +2781,24 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
return 1;
}
+static int
+match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
+{
+ struct cifs_sb_info *old = CIFS_SB(sb);
+ struct cifs_sb_info *new = mnt_data->cifs_sb;
+
+ if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) {
+ if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH))
+ return 0;
+ /* The prepath should be null terminated strings */
+ if (strcmp(new->prepath, old->prepath))
+ return 0;
+
+ return 1;
+ }
+ return 0;
+}
+
int
cifs_match_super(struct super_block *sb, void *data)
{
@@ -2808,7 +2826,8 @@ cifs_match_super(struct super_block *sb, void *data)
if (!match_server(tcp_srv, volume_info) ||
!match_session(ses, volume_info) ||
- !match_tcon(tcon, volume_info->UNC)) {
+ !match_tcon(tcon, volume_info->UNC) ||
+ !match_prepath(sb, mnt_data)) {
rc = 0;
goto out;
}
@@ -3222,7 +3241,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
}
}
-void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
+int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
struct cifs_sb_info *cifs_sb)
{
INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks);
@@ -3316,6 +3335,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
cifs_dbg(VFS, "mount option dynperm ignored if cifsacl mount option supported\n");
+
+ if (pvolume_info->prepath) {
+ cifs_sb->prepath = kstrdup(pvolume_info->prepath, GFP_KERNEL);
+ if (cifs_sb->prepath == NULL)
+ return -ENOMEM;
+ }
+
+ return 0;
}
static void
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 579e41b350a2..42b99af74e0a 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2478,7 +2478,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
size_t cur_len;
unsigned long nr_pages, num_pages, i;
struct cifs_writedata *wdata;
- struct iov_iter saved_from;
+ struct iov_iter saved_from = *from;
loff_t saved_offset = offset;
pid_t pid;
struct TCP_Server_Info *server;
@@ -2489,7 +2489,6 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
pid = current->tgid;
server = tlink_tcon(open_file->tlink)->ses->server;
- memcpy(&saved_from, from, sizeof(struct iov_iter));
do {
unsigned int wsize, credits;
@@ -2551,8 +2550,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
kref_put(&wdata->refcount,
cifs_uncached_writedata_release);
if (rc == -EAGAIN) {
- memcpy(from, &saved_from,
- sizeof(struct iov_iter));
+ *from = saved_from;
iov_iter_advance(from, offset - saved_offset);
continue;
}
@@ -2576,7 +2574,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
struct cifs_sb_info *cifs_sb;
struct cifs_writedata *wdata, *tmp;
struct list_head wdata_list;
- struct iov_iter saved_from;
+ struct iov_iter saved_from = *from;
int rc;
/*
@@ -2597,8 +2595,6 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
if (!tcon->ses->server->ops->async_writev)
return -ENOSYS;
- memcpy(&saved_from, from, sizeof(struct iov_iter));
-
rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from,
open_file, cifs_sb, &wdata_list);
@@ -2631,13 +2627,11 @@ restart_loop:
/* resend call if it's a retryable error */
if (rc == -EAGAIN) {
struct list_head tmp_list;
- struct iov_iter tmp_from;
+ struct iov_iter tmp_from = saved_from;
INIT_LIST_HEAD(&tmp_list);
list_del_init(&wdata->list);
- memcpy(&tmp_from, &saved_from,
- sizeof(struct iov_iter));
iov_iter_advance(&tmp_from,
wdata->offset - iocb->ki_pos);
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 062c2375549a..d031af8d3d4d 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -399,7 +399,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
io_parms.offset = 0;
io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
- rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0);
+ rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf);
CIFSSMBClose(xid, tcon, fid.netfid);
return rc;
}
diff --git a/fs/compat.c b/fs/compat.c
index be6e48b0a46c..bd064a2c3550 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -54,20 +54,6 @@
#include <asm/ioctls.h>
#include "internal.h"
-int compat_log = 1;
-
-int compat_printk(const char *fmt, ...)
-{
- va_list ap;
- int ret;
- if (!compat_log)
- return 0;
- va_start(ap, fmt);
- ret = vprintk(fmt, ap);
- va_end(ap);
- return ret;
-}
-
/*
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
@@ -562,7 +548,7 @@ ssize_t compat_rw_copy_check_uvector(int type,
goto out;
ret = -EINVAL;
- if (nr_segs > UIO_MAXIOV || nr_segs < 0)
+ if (nr_segs > UIO_MAXIOV)
goto out;
if (nr_segs > fast_segs) {
ret = -ENOMEM;
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index c30cf49b69d2..2c6312db8516 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -333,6 +333,7 @@ configfs_write_bin_file(struct file *file, const char __user *buf,
if (bin_attr->cb_max_size &&
*ppos + count > bin_attr->cb_max_size) {
len = -EFBIG;
+ goto out;
}
tbuf = vmalloc(*ppos + count);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 0f9961eede1e..ed115acb5dee 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -11,6 +11,7 @@
#include <linux/random.h>
#include <linux/string.h>
#include <linux/fscrypto.h>
+#include <linux/mount.h>
static int inode_has_encryption_context(struct inode *inode)
{
@@ -92,26 +93,42 @@ static int create_encryption_context_from_policy(struct inode *inode,
return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
}
-int fscrypt_process_policy(struct inode *inode,
+int fscrypt_process_policy(struct file *filp,
const struct fscrypt_policy *policy)
{
+ struct inode *inode = file_inode(filp);
+ int ret;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
if (policy->version != 0)
return -EINVAL;
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
if (!inode_has_encryption_context(inode)) {
- if (!inode->i_sb->s_cop->empty_dir)
- return -EOPNOTSUPP;
- if (!inode->i_sb->s_cop->empty_dir(inode))
- return -ENOTEMPTY;
- return create_encryption_context_from_policy(inode, policy);
+ if (!S_ISDIR(inode->i_mode))
+ ret = -EINVAL;
+ else if (!inode->i_sb->s_cop->empty_dir)
+ ret = -EOPNOTSUPP;
+ else if (!inode->i_sb->s_cop->empty_dir(inode))
+ ret = -ENOTEMPTY;
+ else
+ ret = create_encryption_context_from_policy(inode,
+ policy);
+ } else if (!is_encryption_context_consistent_with_policy(inode,
+ policy)) {
+ printk(KERN_WARNING
+ "%s: Policy inconsistent with encryption context\n",
+ __func__);
+ ret = -EINVAL;
}
- if (is_encryption_context_consistent_with_policy(inode, policy))
- return 0;
-
- printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
- __func__);
- return -EINVAL;
+ mnt_drop_write_file(filp);
+ return ret;
}
EXPORT_SYMBOL(fscrypt_process_policy);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index d116453b0276..79a5941c2474 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -585,7 +585,8 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv)
*/
void *devpts_get_priv(struct dentry *dentry)
{
- WARN_ON_ONCE(dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC);
+ if (dentry->d_sb->s_magic != DEVPTS_SUPER_MAGIC)
+ return NULL;
return dentry->d_fsdata;
}
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index eea64912c9c0..466f7d60edc2 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -607,20 +607,54 @@ static const struct file_operations format2_fops;
static const struct file_operations format3_fops;
static const struct file_operations format4_fops;
-static int table_open(struct inode *inode, struct file *file)
+static int table_open1(struct inode *inode, struct file *file)
{
struct seq_file *seq;
- int ret = -1;
+ int ret;
- if (file->f_op == &format1_fops)
- ret = seq_open(file, &format1_seq_ops);
- else if (file->f_op == &format2_fops)
- ret = seq_open(file, &format2_seq_ops);
- else if (file->f_op == &format3_fops)
- ret = seq_open(file, &format3_seq_ops);
- else if (file->f_op == &format4_fops)
- ret = seq_open(file, &format4_seq_ops);
+ ret = seq_open(file, &format1_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private; /* the dlm_ls */
+ return 0;
+}
+
+static int table_open2(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &format2_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private; /* the dlm_ls */
+ return 0;
+}
+
+static int table_open3(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &format3_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private; /* the dlm_ls */
+ return 0;
+}
+
+static int table_open4(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+ ret = seq_open(file, &format4_seq_ops);
if (ret)
return ret;
@@ -631,7 +665,7 @@ static int table_open(struct inode *inode, struct file *file)
static const struct file_operations format1_fops = {
.owner = THIS_MODULE,
- .open = table_open,
+ .open = table_open1,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release
@@ -639,7 +673,7 @@ static const struct file_operations format1_fops = {
static const struct file_operations format2_fops = {
.owner = THIS_MODULE,
- .open = table_open,
+ .open = table_open2,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release
@@ -647,7 +681,7 @@ static const struct file_operations format2_fops = {
static const struct file_operations format3_fops = {
.owner = THIS_MODULE,
- .open = table_open,
+ .open = table_open3,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release
@@ -655,7 +689,7 @@ static const struct file_operations format3_fops = {
static const struct file_operations format4_fops = {
.owner = THIS_MODULE,
- .open = table_open,
+ .open = table_open4,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3131747199e1..c6ea25a190f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5466,8 +5466,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
sbi->s_want_extra_isize,
iloc, handle);
if (ret) {
- ext4_set_inode_state(inode,
- EXT4_STATE_NO_EXPAND);
if (mnt_count !=
le16_to_cpu(sbi->s_es->s_mnt_count)) {
ext4_warning(inode->i_sb,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 10686fd67fb4..1bb7df5e4536 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -776,7 +776,7 @@ resizefs_out:
(struct fscrypt_policy __user *)arg,
sizeof(policy)))
return -EFAULT;
- return fscrypt_process_policy(inode, &policy);
+ return fscrypt_process_policy(filp, &policy);
#else
return -EOPNOTSUPP;
#endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1c593aa0218e..3ec8708989ca 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2211,6 +2211,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
/* Called at mount-time, super-block is locked */
static int ext4_check_descriptors(struct super_block *sb,
+ ext4_fsblk_t sb_block,
ext4_group_t *first_not_zeroed)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2241,6 +2242,11 @@ static int ext4_check_descriptors(struct super_block *sb,
grp = i;
block_bitmap = ext4_block_bitmap(sb, gdp);
+ if (block_bitmap == sb_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Block bitmap for group %u overlaps "
+ "superblock", i);
+ }
if (block_bitmap < first_block || block_bitmap > last_block) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Block bitmap for group %u not in group "
@@ -2248,6 +2254,11 @@ static int ext4_check_descriptors(struct super_block *sb,
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
+ if (inode_bitmap == sb_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Inode bitmap for group %u overlaps "
+ "superblock", i);
+ }
if (inode_bitmap < first_block || inode_bitmap > last_block) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Inode bitmap for group %u not in group "
@@ -2255,6 +2266,11 @@ static int ext4_check_descriptors(struct super_block *sb,
return 0;
}
inode_table = ext4_inode_table(sb, gdp);
+ if (inode_table == sb_block) {
+ ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+ "Inode table for group %u overlaps "
+ "superblock", i);
+ }
if (inode_table < first_block ||
inode_table + sbi->s_itb_per_group - 1 > last_block) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
@@ -3757,7 +3773,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount2;
}
}
- if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
+ if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
ret = -EFSCORRUPTED;
goto failed_mount2;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 39e9cfb1b371..2eb935ca5d9e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1353,15 +1353,19 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
size_t min_offs, free;
int total_ino;
void *base, *start, *end;
- int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
+ int error = 0, tried_min_extra_isize = 0;
int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
+ int isize_diff; /* How much do we need to grow i_extra_isize */
down_write(&EXT4_I(inode)->xattr_sem);
+ /*
+ * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty
+ */
+ ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
retry:
- if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) {
- up_write(&EXT4_I(inode)->xattr_sem);
- return 0;
- }
+ isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize;
+ if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
+ goto out;
header = IHDR(inode, raw_inode);
entry = IFIRST(header);
@@ -1382,7 +1386,7 @@ retry:
goto cleanup;
free = ext4_xattr_free_space(last, &min_offs, base, &total_ino);
- if (free >= new_extra_isize) {
+ if (free >= isize_diff) {
entry = IFIRST(header);
ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize
- new_extra_isize, (void *)raw_inode +
@@ -1390,8 +1394,7 @@ retry:
(void *)header, total_ino,
inode->i_sb->s_blocksize);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
- error = 0;
- goto cleanup;
+ goto out;
}
/*
@@ -1414,7 +1417,7 @@ retry:
end = bh->b_data + bh->b_size;
min_offs = end - base;
free = ext4_xattr_free_space(first, &min_offs, base, NULL);
- if (free < new_extra_isize) {
+ if (free < isize_diff) {
if (!tried_min_extra_isize && s_min_extra_isize) {
tried_min_extra_isize++;
new_extra_isize = s_min_extra_isize;
@@ -1428,7 +1431,7 @@ retry:
free = inode->i_sb->s_blocksize;
}
- while (new_extra_isize > 0) {
+ while (isize_diff > 0) {
size_t offs, size, entry_size;
struct ext4_xattr_entry *small_entry = NULL;
struct ext4_xattr_info i = {
@@ -1459,7 +1462,7 @@ retry:
EXT4_XATTR_SIZE(le32_to_cpu(last->e_value_size)) +
EXT4_XATTR_LEN(last->e_name_len);
if (total_size <= free && total_size < min_total_size) {
- if (total_size < new_extra_isize) {
+ if (total_size < isize_diff) {
small_entry = last;
} else {
entry = last;
@@ -1514,22 +1517,22 @@ retry:
error = ext4_xattr_ibody_set(handle, inode, &i, is);
if (error)
goto cleanup;
+ total_ino -= entry_size;
entry = IFIRST(header);
- if (entry_size + EXT4_XATTR_SIZE(size) >= new_extra_isize)
- shift_bytes = new_extra_isize;
+ if (entry_size + EXT4_XATTR_SIZE(size) >= isize_diff)
+ shift_bytes = isize_diff;
else
- shift_bytes = entry_size + size;
+ shift_bytes = entry_size + EXT4_XATTR_SIZE(size);
/* Adjust the offsets and shift the remaining entries ahead */
- ext4_xattr_shift_entries(entry, EXT4_I(inode)->i_extra_isize -
- shift_bytes, (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE + extra_isize + shift_bytes,
- (void *)header, total_ino - entry_size,
- inode->i_sb->s_blocksize);
+ ext4_xattr_shift_entries(entry, -shift_bytes,
+ (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
+ EXT4_I(inode)->i_extra_isize + shift_bytes,
+ (void *)header, total_ino, inode->i_sb->s_blocksize);
- extra_isize += shift_bytes;
- new_extra_isize -= shift_bytes;
- EXT4_I(inode)->i_extra_isize = extra_isize;
+ isize_diff -= shift_bytes;
+ EXT4_I(inode)->i_extra_isize += shift_bytes;
+ header = IHDR(inode, raw_inode);
i.name = b_entry_name;
i.value = buffer;
@@ -1551,6 +1554,8 @@ retry:
kfree(bs);
}
brelse(bh);
+out:
+ ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
up_write(&EXT4_I(inode)->xattr_sem);
return 0;
@@ -1562,6 +1567,10 @@ cleanup:
kfree(is);
kfree(bs);
brelse(bh);
+ /*
+ * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode
+ * size expansion failed.
+ */
up_write(&EXT4_I(inode)->xattr_sem);
return error;
}
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 69dd3e6566e0..a92e783fa057 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -24,6 +24,7 @@
#define EXT4_XATTR_INDEX_SYSTEM 7
#define EXT4_XATTR_INDEX_RICHACL 8
#define EXT4_XATTR_INDEX_ENCRYPTION 9
+#define EXT4_XATTR_INDEX_HURD 10 /* Reserved for Hurd */
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d64d2a515cb2..ccb401eebc11 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1699,11 +1699,11 @@ static int f2fs_write_end(struct file *file,
trace_f2fs_write_end(inode, pos, len, copied);
set_page_dirty(page);
- f2fs_put_page(page, 1);
if (pos + copied > i_size_read(inode))
f2fs_i_size_write(inode, pos + copied);
+ f2fs_put_page(page, 1);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
return copied;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 675fa79d86f6..14f5fe2b841e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -538,7 +538,7 @@ struct f2fs_nm_info {
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
struct radix_tree_root nat_set_root;/* root of the nat set cache */
- struct percpu_rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
+ struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */
struct list_head nat_entries; /* cached nat entry list (clean) */
unsigned int nat_cnt; /* the # of cached nat entries */
unsigned int dirty_nat_cnt; /* total num of nat entries in set */
@@ -787,7 +787,7 @@ struct f2fs_sb_info {
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
- struct percpu_rw_semaphore cp_rwsem; /* blocking FS operations */
+ struct rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
wait_queue_head_t cp_wait;
unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
@@ -1074,22 +1074,22 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- percpu_down_read(&sbi->cp_rwsem);
+ down_read(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- percpu_up_read(&sbi->cp_rwsem);
+ up_read(&sbi->cp_rwsem);
}
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- percpu_down_write(&sbi->cp_rwsem);
+ down_write(&sbi->cp_rwsem);
}
static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- percpu_up_write(&sbi->cp_rwsem);
+ up_write(&sbi->cp_rwsem);
}
static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 0e493f63ea41..28f4f4cbb8d8 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1757,21 +1757,14 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
{
struct fscrypt_policy policy;
struct inode *inode = file_inode(filp);
- int ret;
if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
sizeof(policy)))
return -EFAULT;
- ret = mnt_want_write_file(filp);
- if (ret)
- return ret;
-
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
- ret = fscrypt_process_policy(inode, &policy);
- mnt_drop_write_file(filp);
- return ret;
+ return fscrypt_process_policy(filp, &policy);
}
static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
@@ -2086,15 +2079,19 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
if (unlikely(f2fs_readonly(src->i_sb)))
return -EROFS;
- if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode))
- return -EISDIR;
+ if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode))
+ return -EINVAL;
if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst))
return -EOPNOTSUPP;
inode_lock(src);
- if (src != dst)
- inode_lock(dst);
+ if (src != dst) {
+ if (!inode_trylock(dst)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
ret = -EINVAL;
if (pos_in + len > src->i_size || pos_in + len < pos_in)
@@ -2152,6 +2149,7 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
out_unlock:
if (src != dst)
inode_unlock(dst);
+out:
inode_unlock(src);
return ret;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index b2fa4b615925..f75d197d5beb 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -206,14 +206,14 @@ int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool need = false;
- percpu_down_read(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
if (!get_nat_flag(e, IS_CHECKPOINTED) &&
!get_nat_flag(e, HAS_FSYNCED_INODE))
need = true;
}
- percpu_up_read(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return need;
}
@@ -223,11 +223,11 @@ bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
struct nat_entry *e;
bool is_cp = true;
- percpu_down_read(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e && !get_nat_flag(e, IS_CHECKPOINTED))
is_cp = false;
- percpu_up_read(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return is_cp;
}
@@ -237,13 +237,13 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
struct nat_entry *e;
bool need_update = true;
- percpu_down_read(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ino);
if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
(get_nat_flag(e, IS_CHECKPOINTED) ||
get_nat_flag(e, HAS_FSYNCED_INODE)))
need_update = false;
- percpu_up_read(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return need_update;
}
@@ -284,7 +284,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
- percpu_down_write(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = grab_nat_entry(nm_i, ni->nid);
@@ -334,7 +334,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
set_nat_flag(e, HAS_FSYNCED_INODE, true);
set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
}
- percpu_up_write(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}
int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
@@ -342,7 +342,8 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
struct f2fs_nm_info *nm_i = NM_I(sbi);
int nr = nr_shrink;
- percpu_down_write(&nm_i->nat_tree_lock);
+ if (!down_write_trylock(&nm_i->nat_tree_lock))
+ return 0;
while (nr_shrink && !list_empty(&nm_i->nat_entries)) {
struct nat_entry *ne;
@@ -351,7 +352,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
__del_from_nat_cache(nm_i, ne);
nr_shrink--;
}
- percpu_up_write(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
return nr - nr_shrink;
}
@@ -373,13 +374,13 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
ni->nid = nid;
/* Check nat cache */
- percpu_down_read(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
e = __lookup_nat_cache(nm_i, nid);
if (e) {
ni->ino = nat_get_ino(e);
ni->blk_addr = nat_get_blkaddr(e);
ni->version = nat_get_version(e);
- percpu_up_read(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
return;
}
@@ -403,11 +404,11 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
node_info_from_raw_nat(ni, &ne);
f2fs_put_page(page, 1);
cache:
- percpu_up_read(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
/* cache nat entry */
- percpu_down_write(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
cache_nat_entry(sbi, nid, &ne);
- percpu_up_write(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
}
/*
@@ -1788,7 +1789,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
META_NAT, true);
- percpu_down_read(&nm_i->nat_tree_lock);
+ down_read(&nm_i->nat_tree_lock);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1820,7 +1821,7 @@ void build_free_nids(struct f2fs_sb_info *sbi)
remove_free_nid(nm_i, nid);
}
up_read(&curseg->journal_rwsem);
- percpu_up_read(&nm_i->nat_tree_lock);
+ up_read(&nm_i->nat_tree_lock);
ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
nm_i->ra_nid_pages, META_NAT, false);
@@ -2209,7 +2210,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
if (!nm_i->dirty_nat_cnt)
return;
- percpu_down_write(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
/*
* if there are no enough space in journal to store dirty nat
@@ -2232,7 +2233,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
list_for_each_entry_safe(set, tmp, &sets, set_list)
__flush_nat_entry_set(sbi, set);
- percpu_up_write(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
}
@@ -2268,8 +2269,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->free_nid_list_lock);
- if (percpu_init_rwsem(&nm_i->nat_tree_lock))
- return -ENOMEM;
+ init_rwsem(&nm_i->nat_tree_lock);
nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid);
nm_i->bitmap_size = __bitmap_size(sbi, NAT_BITMAP);
@@ -2326,7 +2326,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
spin_unlock(&nm_i->free_nid_list_lock);
/* destroy nat cache */
- percpu_down_write(&nm_i->nat_tree_lock);
+ down_write(&nm_i->nat_tree_lock);
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
@@ -2351,9 +2351,8 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
kmem_cache_free(nat_entry_set_slab, setvec[idx]);
}
}
- percpu_up_write(&nm_i->nat_tree_lock);
+ up_write(&nm_i->nat_tree_lock);
- percpu_free_rwsem(&nm_i->nat_tree_lock);
kfree(nm_i->nat_bitmap);
sbi->nm_info = NULL;
kfree(nm_i);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 1b86d3f638ef..7f863a645ab1 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -706,8 +706,6 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi)
percpu_counter_destroy(&sbi->nr_pages[i]);
percpu_counter_destroy(&sbi->alloc_valid_block_count);
percpu_counter_destroy(&sbi->total_valid_inode_count);
-
- percpu_free_rwsem(&sbi->cp_rwsem);
}
static void f2fs_put_super(struct super_block *sb)
@@ -1483,9 +1481,6 @@ static int init_percpu_info(struct f2fs_sb_info *sbi)
{
int i, err;
- if (percpu_init_rwsem(&sbi->cp_rwsem))
- return -ENOMEM;
-
for (i = 0; i < NR_COUNT_TYPE; i++) {
err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL);
if (err)
@@ -1686,6 +1681,7 @@ try_onemore:
sbi->write_io[i].bio = NULL;
}
+ init_rwsem(&sbi->cp_rwsem);
init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
diff --git a/fs/file.c b/fs/file.c
index 6b1acdfe59da..69d6990e3021 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -23,12 +23,12 @@
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
-int sysctl_nr_open __read_mostly = 1024*1024;
-int sysctl_nr_open_min = BITS_PER_LONG;
+unsigned int sysctl_nr_open __read_mostly = 1024*1024;
+unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
-int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) &
- -BITS_PER_LONG;
+unsigned int sysctl_nr_open_max =
+ __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;
static void *alloc_fdmem(size_t size)
{
@@ -163,7 +163,7 @@ out:
* Return <0 error code on error; 1 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit.
*/
-static int expand_fdtable(struct files_struct *files, int nr)
+static int expand_fdtable(struct files_struct *files, unsigned int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
@@ -208,7 +208,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
* expanded and execution may have blocked.
* The files->file_lock should be held on entry, and will be held on exit.
*/
-static int expand_files(struct files_struct *files, int nr)
+static int expand_files(struct files_struct *files, unsigned int nr)
__releases(files->file_lock)
__acquires(files->file_lock)
{
@@ -243,12 +243,12 @@ repeat:
return expanded;
}
-static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
+static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd, fdt->close_on_exec);
}
-static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
+static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
if (test_bit(fd, fdt->close_on_exec))
__clear_bit(fd, fdt->close_on_exec);
@@ -268,10 +268,10 @@ static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}
-static int count_open_files(struct fdtable *fdt)
+static unsigned int count_open_files(struct fdtable *fdt)
{
- int size = fdt->max_fds;
- int i;
+ unsigned int size = fdt->max_fds;
+ unsigned int i;
/* Find the last open fd */
for (i = size / BITS_PER_LONG; i > 0; ) {
@@ -291,7 +291,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
- int open_files, i;
+ unsigned int open_files, i;
struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
@@ -391,7 +391,7 @@ static struct fdtable *close_files(struct files_struct * files)
* files structure.
*/
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
- int i, j = 0;
+ unsigned int i, j = 0;
for (;;) {
unsigned long set;
@@ -477,11 +477,11 @@ struct files_struct init_files = {
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};
-static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start)
+static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
- unsigned long maxfd = fdt->max_fds;
- unsigned long maxbit = maxfd / BITS_PER_LONG;
- unsigned long bitbit = start / BITS_PER_LONG;
+ unsigned int maxfd = fdt->max_fds;
+ unsigned int maxbit = maxfd / BITS_PER_LONG;
+ unsigned int bitbit = start / BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit > maxfd)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4d09d4441e3e..05713a5da083 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1949,6 +1949,12 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
{
struct backing_dev_info *bdi;
+ /*
+ * If we are expecting writeback progress we must submit plugged IO.
+ */
+ if (blk_needs_flush_plug(current))
+ blk_schedule_flush_plug(current);
+
if (!nr_pages)
nr_pages = get_nr_dirty_pages();
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f394aff59c36..3988b43c2f5a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -530,13 +530,13 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
req->out.args[0].size = count;
}
-static void fuse_release_user_pages(struct fuse_req *req, int write)
+static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty)
{
unsigned i;
for (i = 0; i < req->num_pages; i++) {
struct page *page = req->pages[i];
- if (write)
+ if (should_dirty)
set_page_dirty_lock(page);
put_page(page);
}
@@ -1320,6 +1320,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
loff_t *ppos, int flags)
{
int write = flags & FUSE_DIO_WRITE;
+ bool should_dirty = !write && iter_is_iovec(iter);
int cuse = flags & FUSE_DIO_CUSE;
struct file *file = io->file;
struct inode *inode = file->f_mapping->host;
@@ -1363,7 +1364,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
nres = fuse_send_read(req, io, pos, nbytes, owner);
if (!io->async)
- fuse_release_user_pages(req, !write);
+ fuse_release_user_pages(req, should_dirty);
if (req->out.h.error) {
err = req->out.h.error;
break;
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index d3bcdd975700..b3be1b5a62e2 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -189,6 +189,11 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
return generic_block_bmap(mapping, block, hpfs_get_block);
}
+static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len)
+{
+ return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block);
+}
+
const struct address_space_operations hpfs_aops = {
.readpage = hpfs_readpage,
.writepage = hpfs_writepage,
@@ -214,4 +219,5 @@ const struct file_operations hpfs_file_ops =
const struct inode_operations hpfs_file_iops =
{
.setattr = hpfs_setattr,
+ .fiemap = hpfs_fiemap,
};
diff --git a/fs/inode.c b/fs/inode.c
index 7e3ef3af3db9..691c00747e72 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1021,13 +1021,17 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
{
struct hlist_head *head = inode_hashtable + hash(sb, hashval);
struct inode *inode;
-
+again:
spin_lock(&inode_hash_lock);
inode = find_inode(sb, head, test, data);
spin_unlock(&inode_hash_lock);
if (inode) {
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
return inode;
}
@@ -1064,6 +1068,10 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
destroy_inode(inode);
inode = old;
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
}
return inode;
@@ -1091,12 +1099,16 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
-
+again:
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
spin_unlock(&inode_hash_lock);
if (inode) {
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
return inode;
}
@@ -1131,6 +1143,10 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
destroy_inode(inode);
inode = old;
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
}
return inode;
}
@@ -1266,10 +1282,16 @@ EXPORT_SYMBOL(ilookup5_nowait);
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
{
- struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
-
- if (inode)
+ struct inode *inode;
+again:
+ inode = ilookup5_nowait(sb, hashval, test, data);
+ if (inode) {
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
+ }
return inode;
}
EXPORT_SYMBOL(ilookup5);
@@ -1286,13 +1308,18 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
struct hlist_head *head = inode_hashtable + hash(sb, ino);
struct inode *inode;
-
+again:
spin_lock(&inode_hash_lock);
inode = find_inode_fast(sb, head, ino);
spin_unlock(&inode_hash_lock);
- if (inode)
+ if (inode) {
wait_on_inode(inode);
+ if (unlikely(inode_unhashed(inode))) {
+ iput(inode);
+ goto again;
+ }
+ }
return inode;
}
EXPORT_SYMBOL(ilookup);
diff --git a/fs/internal.h b/fs/internal.h
index ba0737649d4a..395887882315 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -156,7 +156,7 @@ extern void mnt_pin_kill(struct mount *m);
/*
* fs/nsfs.c
*/
-extern struct dentry_operations ns_dentry_operations;
+extern const struct dentry_operations ns_dentry_operations;
/*
* fs/ioctl.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0f56deb24ce6..c415668c86d4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -568,7 +568,7 @@ static int ioctl_fsthaw(struct file *filp)
return thaw_super(sb);
}
-static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
+static int ioctl_file_dedupe_range(struct file *file, void __user *arg)
{
struct file_dedupe_range __user *argp = arg;
struct file_dedupe_range *same = NULL;
@@ -582,6 +582,10 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
}
size = offsetof(struct file_dedupe_range __user, info[count]);
+ if (size > PAGE_SIZE) {
+ ret = -ENOMEM;
+ goto out;
+ }
same = memdup_user(argp, size);
if (IS_ERR(same)) {
diff --git a/fs/iomap.c b/fs/iomap.c
index 48141b8eff5f..706270f21b35 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -84,8 +84,11 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
* Now the data has been copied, commit the range we've copied. This
* should not fail unless the filesystem has had a fatal error.
*/
- ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
- flags, &iomap);
+ if (ops->iomap_end) {
+ ret = ops->iomap_end(inode, pos, length,
+ written > 0 ? written : 0,
+ flags, &iomap);
+ }
return written ? written : ret;
}
@@ -194,12 +197,9 @@ again:
if (mapping_writably_mapped(inode->i_mapping))
flush_dcache_page(page);
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
- mark_page_accessed(page);
status = iomap_write_end(inode, pos, bytes, copied, page);
if (unlikely(status < 0))
@@ -428,9 +428,12 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
break;
}
+ if (iomap->flags & IOMAP_F_MERGED)
+ flags |= FIEMAP_EXTENT_MERGED;
+
return fiemap_fill_next_extent(fi, iomap->offset,
iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
- iomap->length, flags | FIEMAP_EXTENT_MERGED);
+ iomap->length, flags);
}
@@ -470,13 +473,18 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
if (ret)
return ret;
- ret = filemap_write_and_wait(inode->i_mapping);
- if (ret)
- return ret;
+ if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret)
+ return ret;
+ }
while (len > 0) {
ret = iomap_apply(inode, start, len, 0, ops, &ctx,
iomap_fiemap_actor);
+ /* inode with no (attribute) mapping will give ENOENT */
+ if (ret == -ENOENT)
+ break;
if (ret < 0)
return ret;
if (ret == 0)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e1574008adc9..2bcb86e6e6ca 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -840,21 +840,35 @@ repeat:
mutex_lock(&kernfs_mutex);
list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
+ struct kernfs_node *parent;
struct inode *inode;
- struct dentry *dentry;
+ /*
+ * We want fsnotify_modify() on @kn but as the
+ * modifications aren't originating from userland don't
+ * have the matching @file available. Look up the inodes
+ * and generate the events manually.
+ */
inode = ilookup(info->sb, kn->ino);
if (!inode)
continue;
- dentry = d_find_any_alias(inode);
- if (dentry) {
- fsnotify_parent(NULL, dentry, FS_MODIFY);
- fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
- NULL, 0);
- dput(dentry);
+ parent = kernfs_get_parent(kn);
+ if (parent) {
+ struct inode *p_inode;
+
+ p_inode = ilookup(info->sb, parent->ino);
+ if (p_inode) {
+ fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
+ inode, FSNOTIFY_EVENT_INODE, kn->name, 0);
+ iput(p_inode);
+ }
+
+ kernfs_put(parent);
}
+ fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
+ kn->name, 0);
iput(inode);
}
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index f55a4e756047..217847679f0e 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -346,7 +346,7 @@ static void bl_write_cleanup(struct work_struct *work)
PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
- (end - start) >> SECTOR_SHIFT);
+ (end - start) >> SECTOR_SHIFT, end);
}
pnfs_ld_write_done(hdr);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 18e6fd0b9506..efc007f00742 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -141,6 +141,7 @@ struct pnfs_block_layout {
struct rb_root bl_ext_ro;
spinlock_t bl_ext_lock; /* Protects list manipulation */
bool bl_scsi_layout;
+ u64 bl_lwb;
};
static inline struct pnfs_block_layout *
@@ -182,7 +183,7 @@ int ext_tree_insert(struct pnfs_block_layout *bl,
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
sector_t end);
int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
- sector_t len);
+ sector_t len, u64 lwb);
bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
struct pnfs_block_extent *ret, bool rw);
int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 992bcb19c11e..c85fbfd2d0d9 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -402,7 +402,7 @@ ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
- sector_t len)
+ sector_t len, u64 lwb)
{
struct rb_root *root = &bl->bl_ext_rw;
sector_t end = start + len;
@@ -471,6 +471,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
}
}
out:
+ if (bl->bl_lwb < lwb)
+ bl->bl_lwb = lwb;
spin_unlock(&bl->bl_ext_lock);
__ext_put_deviceids(&tmp);
@@ -518,7 +520,7 @@ static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
}
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
- size_t buffer_size, size_t *count)
+ size_t buffer_size, size_t *count, __u64 *lastbyte)
{
struct pnfs_block_extent *be;
int ret = 0;
@@ -542,6 +544,8 @@ static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
p = encode_block_extent(be, p);
be->be_tag = EXTENT_COMMITTING;
}
+ *lastbyte = bl->bl_lwb - 1;
+ bl->bl_lwb = 0;
spin_unlock(&bl->bl_ext_lock);
return ret;
@@ -564,7 +568,7 @@ ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
arg->layoutupdate_pages = &arg->layoutupdate_page;
retry:
- ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
+ ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index a7f2e6e33305..52a28311e2a4 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -275,6 +275,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
err_socks:
svc_rpcb_cleanup(serv, net);
err_bind:
+ nn->cb_users[minorversion]--;
dprintk("NFS: Couldn't create callback socket: err = %d; "
"net = %p\n", ret, net);
return ret;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index c92a75e066a6..f953ef6b2f2e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -454,11 +454,8 @@ static bool referring_call_exists(struct nfs_client *clp,
((u32 *)&rclist->rcl_sessionid.data)[3],
ref->rc_sequenceid, ref->rc_slotid);
- spin_lock(&tbl->slot_tbl_lock);
- status = (test_bit(ref->rc_slotid, tbl->used_slots) &&
- tbl->slots[ref->rc_slotid].seq_nr ==
- ref->rc_sequenceid);
- spin_unlock(&tbl->slot_tbl_lock);
+ status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid,
+ ref->rc_sequenceid, HZ >> 1) < 0;
if (status)
goto out;
}
@@ -487,7 +484,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
goto out;
tbl = &clp->cl_session->bc_slot_table;
- slot = tbl->slots + args->csa_slotid;
/* Set up res before grabbing the spinlock */
memcpy(&res->csr_sessionid, &args->csa_sessionid,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 003ebce4bbc4..1e106780a237 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -426,7 +426,7 @@ EXPORT_SYMBOL_GPL(nfs_mark_client_ready);
* Initialise the timeout values for a connection
*/
void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
- unsigned int timeo, unsigned int retrans)
+ int timeo, int retrans)
{
to->to_initval = timeo * HZ / 10;
to->to_retries = retrans;
@@ -434,9 +434,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
switch (proto) {
case XPRT_TRANSPORT_TCP:
case XPRT_TRANSPORT_RDMA:
- if (to->to_retries == 0)
+ if (retrans == NFS_UNSPEC_RETRANS)
to->to_retries = NFS_DEF_TCP_RETRANS;
- if (to->to_initval == 0)
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_retries == 0)
to->to_initval = NFS_DEF_TCP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_TCP_TIMEOUT)
to->to_initval = NFS_MAX_TCP_TIMEOUT;
@@ -449,9 +449,9 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
to->to_exponential = 0;
break;
case XPRT_TRANSPORT_UDP:
- if (to->to_retries == 0)
+ if (retrans == NFS_UNSPEC_RETRANS)
to->to_retries = NFS_DEF_UDP_RETRANS;
- if (!to->to_initval)
+ if (timeo == NFS_UNSPEC_TIMEO || to->to_initval == 0)
to->to_initval = NFS_DEF_UDP_TIMEO * HZ / 10;
if (to->to_initval > NFS_MAX_UDP_TIMEOUT)
to->to_initval = NFS_MAX_UDP_TIMEOUT;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7d620970f2e1..ca699ddc11c1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -657,7 +657,10 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (result <= 0)
goto out;
- written = generic_write_sync(iocb, result);
+ result = generic_write_sync(iocb, result);
+ if (result < 0)
+ goto out;
+ written = result;
iocb->ki_pos += written;
/* Return error values */
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index e6206eaf2bdf..51b51369704c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -37,6 +37,7 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
if (ffl) {
INIT_LIST_HEAD(&ffl->error_list);
INIT_LIST_HEAD(&ffl->mirrors);
+ ffl->last_report_time = ktime_get();
return &ffl->generic_hdr;
} else
return NULL;
@@ -640,19 +641,18 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
{
static const ktime_t notime = {0};
s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL;
+ struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout);
nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now);
if (ktime_equal(mirror->start_time, notime))
mirror->start_time = now;
- if (ktime_equal(mirror->last_report_time, notime))
- mirror->last_report_time = now;
if (mirror->report_interval != 0)
report_interval = (s64)mirror->report_interval * 1000LL;
else if (layoutstats_timer != 0)
report_interval = (s64)layoutstats_timer * 1000LL;
- if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
+ if (ktime_to_ms(ktime_sub(now, ffl->last_report_time)) >=
report_interval) {
- mirror->last_report_time = now;
+ ffl->last_report_time = now;
return true;
}
@@ -806,11 +806,14 @@ ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
{
struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_pnfs_ds *ds;
+ bool fail_return = false;
int idx;
/* mirrors are sorted by efficiency */
for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
- ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+ if (idx+1 == fls->mirror_array_cnt)
+ fail_return = true;
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, fail_return);
if (ds) {
*best_idx = idx;
return ds;
@@ -859,6 +862,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs4_pnfs_ds *ds;
int ds_idx;
+retry:
/* Use full layout for now */
if (!pgio->pg_lseg)
ff_layout_pg_get_read(pgio, req, false);
@@ -871,10 +875,13 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
if (!ds) {
- if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
- goto out_pnfs;
- else
+ if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ /* Sleep for 1 second before retrying */
+ ssleep(1);
+ goto retry;
}
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -890,12 +897,6 @@ out_mds:
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
nfs_pageio_reset_read_mds(pgio);
- return;
-
-out_pnfs:
- pnfs_set_lo_fail(pgio->pg_lseg);
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
}
static void
@@ -909,6 +910,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
int i;
int status;
+retry:
if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
@@ -940,10 +942,13 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
for (i = 0; i < pgio->pg_mirror_count; i++) {
ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
if (!ds) {
- if (ff_layout_no_fallback_to_mds(pgio->pg_lseg))
- goto out_pnfs;
- else
+ if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg))
goto out_mds;
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ /* Sleep for 1 second before retrying */
+ ssleep(1);
+ goto retry;
}
pgm = &pgio->pg_mirrors[i];
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
@@ -956,12 +961,6 @@ out_mds:
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
nfs_pageio_reset_write_mds(pgio);
- return;
-
-out_pnfs:
- pnfs_set_lo_fail(pgio->pg_lseg);
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
}
static unsigned int
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 1bcdb15d0c41..3ee0c9fcea76 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -84,7 +84,6 @@ struct nfs4_ff_layout_mirror {
struct nfs4_ff_layoutstat read_stat;
struct nfs4_ff_layoutstat write_stat;
ktime_t start_time;
- ktime_t last_report_time;
u32 report_interval;
};
@@ -101,6 +100,7 @@ struct nfs4_flexfile_layout {
struct pnfs_ds_commit_info commit_info;
struct list_head mirrors;
struct list_head error_list; /* nfs4_ff_layout_ds_err */
+ ktime_t last_report_time; /* Layoutstat report times */
};
static inline struct nfs4_flexfile_layout *
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index 0aa36be71fce..f7a3f6b05369 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -17,8 +17,8 @@
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
-static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
-static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS;
+static unsigned int dataserver_retrans;
void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
{
@@ -379,7 +379,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
devid = &mirror->mirror_ds->id_node;
if (ff_layout_test_devid_unavailable(devid))
- goto out;
+ goto out_fail;
ds = mirror->mirror_ds->ds;
/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
@@ -405,15 +405,16 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
mirror->mirror_ds->ds_versions[0].rsize = max_payload;
if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
mirror->mirror_ds->ds_versions[0].wsize = max_payload;
- } else {
- ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, lseg->pls_range.offset,
- lseg->pls_range.length, NFS4ERR_NXIO,
- OP_ILLEGAL, GFP_NOIO);
- if (fail_return || !ff_layout_has_available_ds(lseg))
- pnfs_error_mark_layout_for_return(ino, lseg);
- ds = NULL;
+ goto out;
}
+ ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, lseg->pls_range.offset,
+ lseg->pls_range.length, NFS4ERR_NXIO,
+ OP_ILLEGAL, GFP_NOIO);
+out_fail:
+ if (fail_return || !ff_layout_has_available_ds(lseg))
+ pnfs_error_mark_layout_for_return(ino, lseg);
+ ds = NULL;
out:
return ds;
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 7ce5e023c3c3..74935a19e4bf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -58,6 +58,9 @@ struct nfs_clone_mount {
*/
#define NFS_UNSPEC_PORT (-1)
+#define NFS_UNSPEC_RETRANS (UINT_MAX)
+#define NFS_UNSPEC_TIMEO (UINT_MAX)
+
/*
* Maximum number of pages that readdir can use for creating
* a vmapped array of pages.
@@ -156,7 +159,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *);
void nfs_server_remove_lists(struct nfs_server *);
-void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);
+void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans);
int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
rpc_authflavor_t);
struct nfs_server *nfs_alloc_server(void);
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 33da841a21bb..64b43b4ad9dd 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -318,10 +318,22 @@ static void
nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata)
{
struct nfs42_layoutstat_data *data = calldata;
- struct nfs_server *server = NFS_SERVER(data->args.inode);
+ struct inode *inode = data->inode;
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct pnfs_layout_hdr *lo;
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (!pnfs_layout_is_valid(lo)) {
+ spin_unlock(&inode->i_lock);
+ rpc_exit(task, 0);
+ return;
+ }
+ nfs4_stateid_copy(&data->args.stateid, &lo->plh_stateid);
+ spin_unlock(&inode->i_lock);
nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args,
&data->res.seq_res, task);
+
}
static void
@@ -338,12 +350,14 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
case 0:
break;
case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_BAD_STATEID:
spin_lock(&inode->i_lock);
lo = NFS_I(inode)->layout;
- if (lo && nfs4_stateid_match(&data->args.stateid,
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match(&data->args.stateid,
&lo->plh_stateid)) {
LIST_HEAD(head);
@@ -357,11 +371,23 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
} else
spin_unlock(&inode->i_lock);
break;
+ case -NFS4ERR_OLD_STATEID:
+ spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
+ if (pnfs_layout_is_valid(lo) &&
+ nfs4_stateid_match_other(&data->args.stateid,
+ &lo->plh_stateid)) {
+ /* Do we need to delay before resending? */
+ if (!nfs4_stateid_is_newer(&lo->plh_stateid,
+ &data->args.stateid))
+ rpc_delay(task, HZ);
+ rpc_restart_call_prepare(task);
+ }
+ spin_unlock(&inode->i_lock);
+ break;
case -ENOTSUPP:
case -EOPNOTSUPP:
NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
- default:
- break;
}
dprintk("%s server returns %d\n", __func__, task->tk_status);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 324bfdc21250..9bf64eacba5b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -396,6 +396,10 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *);
extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
extern void nfs4_kill_renewd(struct nfs_client *);
extern void nfs4_renew_state(struct work_struct *);
+extern void nfs4_set_lease_period(struct nfs_client *clp,
+ unsigned long lease,
+ unsigned long lastrenewed);
+
/* nfs4state.c */
struct rpc_cred *nfs4_get_clid_cred(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 8d7d08d4f95f..cd3b7cfdde16 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -817,6 +817,11 @@ static int nfs4_set_client(struct nfs_server *server,
goto error;
}
+ if (server->nfs_client == clp) {
+ error = -ELOOP;
+ goto error;
+ }
+
/*
* Query for the lease time on clientid setup or renewal
*
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a036e93bdf96..a9dec32ba9ba 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -634,15 +634,11 @@ out_sleep:
}
EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
-static int nfs40_sequence_done(struct rpc_task *task,
- struct nfs4_sequence_res *res)
+static void nfs40_sequence_free_slot(struct nfs4_sequence_res *res)
{
struct nfs4_slot *slot = res->sr_slot;
struct nfs4_slot_table *tbl;
- if (slot == NULL)
- goto out;
-
tbl = slot->table;
spin_lock(&tbl->slot_tbl_lock);
if (!nfs41_wake_and_assign_slot(tbl, slot))
@@ -650,7 +646,13 @@ static int nfs40_sequence_done(struct rpc_task *task,
spin_unlock(&tbl->slot_tbl_lock);
res->sr_slot = NULL;
-out:
+}
+
+static int nfs40_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot != NULL)
+ nfs40_sequence_free_slot(res);
return 1;
}
@@ -666,6 +668,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
tbl = slot->table;
session = tbl->session;
+ /* Bump the slot sequence number */
+ if (slot->seq_done)
+ slot->seq_nr++;
+ slot->seq_done = 0;
+
spin_lock(&tbl->slot_tbl_lock);
/* Be nice to the server: try to ensure that the last transmitted
* value for highest_user_slotid <= target_highest_slotid
@@ -686,9 +693,12 @@ out_unlock:
res->sr_slot = NULL;
if (send_new_highest_used_slotid)
nfs41_notify_server(session->clp);
+ if (waitqueue_active(&tbl->slot_waitq))
+ wake_up_all(&tbl->slot_waitq);
}
-int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+static int nfs41_sequence_process(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
{
struct nfs4_session *session;
struct nfs4_slot *slot = res->sr_slot;
@@ -714,7 +724,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
switch (res->sr_status) {
case 0:
/* Update the slot's sequence and clientid lease timer */
- ++slot->seq_nr;
+ slot->seq_done = 1;
clp = session->clp;
do_renew_lease(clp, res->sr_timestamp);
/* Check sequence flags */
@@ -769,16 +779,16 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
goto retry_nowait;
default:
/* Just update the slot sequence no. */
- ++slot->seq_nr;
+ slot->seq_done = 1;
}
out:
/* The session may be reset by one of the error handlers. */
dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
- nfs41_sequence_free_slot(res);
out_noaction:
return ret;
retry_nowait:
if (rpc_restart_call_prepare(task)) {
+ nfs41_sequence_free_slot(res);
task->tk_status = 0;
ret = 0;
}
@@ -789,8 +799,37 @@ out_retry:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
return 0;
}
+
+int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+ if (!nfs41_sequence_process(task, res))
+ return 0;
+ if (res->sr_slot != NULL)
+ nfs41_sequence_free_slot(res);
+ return 1;
+
+}
EXPORT_SYMBOL_GPL(nfs41_sequence_done);
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot == NULL)
+ return 1;
+ if (res->sr_slot->table->session != NULL)
+ return nfs41_sequence_process(task, res);
+ return nfs40_sequence_done(task, res);
+}
+
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot != NULL) {
+ if (res->sr_slot->table->session != NULL)
+ nfs41_sequence_free_slot(res);
+ else
+ nfs40_sequence_free_slot(res);
+ }
+}
+
int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
{
if (res->sr_slot == NULL)
@@ -920,6 +959,17 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
args, res, task);
}
+static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res)
+{
+ return nfs40_sequence_done(task, res);
+}
+
+static void nfs4_sequence_free_slot(struct nfs4_sequence_res *res)
+{
+ if (res->sr_slot != NULL)
+ nfs40_sequence_free_slot(res);
+}
+
int nfs4_sequence_done(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
@@ -1197,6 +1247,7 @@ static void nfs4_opendata_free(struct kref *kref)
struct super_block *sb = p->dentry->d_sb;
nfs_free_seqid(p->o_arg.seqid);
+ nfs4_sequence_free_slot(&p->o_res.seq_res);
if (p->state != NULL)
nfs4_put_open_state(p->state);
nfs4_put_state_owner(p->owner);
@@ -1656,9 +1707,14 @@ err:
static struct nfs4_state *
nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
{
+ struct nfs4_state *ret;
+
if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS)
- return _nfs4_opendata_reclaim_to_nfs4_state(data);
- return _nfs4_opendata_to_nfs4_state(data);
+ ret =_nfs4_opendata_reclaim_to_nfs4_state(data);
+ else
+ ret = _nfs4_opendata_to_nfs4_state(data);
+ nfs4_sequence_free_slot(&data->o_res.seq_res);
+ return ret;
}
static struct nfs_open_context *nfs4_state_find_open_context(struct nfs4_state *state)
@@ -2056,7 +2112,7 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
- if (!nfs4_sequence_done(task, &data->o_res.seq_res))
+ if (!nfs4_sequence_process(task, &data->o_res.seq_res))
return;
if (task->tk_status == 0) {
@@ -4237,12 +4293,9 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
if (err == 0) {
- struct nfs_client *clp = server->nfs_client;
-
- spin_lock(&clp->cl_lock);
- clp->cl_lease_time = fsinfo->lease_time * HZ;
- clp->cl_last_renewal = now;
- spin_unlock(&clp->cl_lock);
+ nfs4_set_lease_period(server->nfs_client,
+ fsinfo->lease_time * HZ,
+ now);
break;
}
err = nfs4_handle_exception(server, err, &exception);
@@ -7517,12 +7570,20 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
status = rpc_call_sync(session->clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
trace_nfs4_create_session(clp, status);
+ switch (status) {
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_DELAY:
+ case -ETIMEDOUT:
+ case -EACCES:
+ case -EAGAIN:
+ goto out;
+ };
+
+ clp->cl_seqid++;
if (!status) {
/* Verify the session's negotiated channel_attrs values */
status = nfs4_verify_channel_attrs(&args, &res);
/* Increment the clientid slot sequence id */
- if (clp->cl_seqid == res.seqid)
- clp->cl_seqid++;
if (status)
goto out;
nfs4_update_session(session, &res);
@@ -7867,7 +7928,7 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
struct nfs4_layoutget *lgp = calldata;
dprintk("--> %s\n", __func__);
- nfs41_sequence_done(task, &lgp->res.seq_res);
+ nfs41_sequence_process(task, &lgp->res.seq_res);
dprintk("<-- %s\n", __func__);
}
@@ -8083,6 +8144,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout, gfp_t gfp_flags)
/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
if (status == 0 && lgp->res.layoutp->len)
lseg = pnfs_layout_process(lgp);
+ nfs4_sequence_free_slot(&lgp->res.seq_res);
rpc_put_task(task);
dprintk("<-- %s status=%d\n", __func__, status);
if (status)
@@ -8109,7 +8171,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
dprintk("--> %s\n", __func__);
- if (!nfs41_sequence_done(task, &lrp->res.seq_res))
+ if (!nfs41_sequence_process(task, &lrp->res.seq_res))
return;
server = NFS_SERVER(lrp->args.inode);
@@ -8121,6 +8183,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_DELAY:
if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
break;
+ nfs4_sequence_free_slot(&lrp->res.seq_res);
rpc_restart_call_prepare(task);
return;
}
@@ -8135,12 +8198,16 @@ static void nfs4_layoutreturn_release(void *calldata)
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
- be32_to_cpu(lrp->args.stateid.seqid));
- if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
+ if (lrp->res.lrs_present) {
+ pnfs_mark_matching_lsegs_invalid(lo, &freeme,
+ &lrp->args.range,
+ be32_to_cpu(lrp->args.stateid.seqid));
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+ } else
+ pnfs_mark_layout_stateid_invalid(lo, &freeme);
pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&lo->plh_inode->i_lock);
+ nfs4_sequence_free_slot(&lrp->res.seq_res);
pnfs_free_lseg_list(&freeme);
pnfs_put_layout_hdr(lrp->args.layout);
nfs_iput_and_deactive(lrp->inode);
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index e1ba58c3d1ad..82e77198d17e 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -136,6 +136,26 @@ nfs4_kill_renewd(struct nfs_client *clp)
cancel_delayed_work_sync(&clp->cl_renewd);
}
+/**
+ * nfs4_set_lease_period - Sets the lease period on a nfs_client
+ *
+ * @clp: pointer to nfs_client
+ * @lease: new value for lease period
+ * @lastrenewed: time at which lease was last renewed
+ */
+void nfs4_set_lease_period(struct nfs_client *clp,
+ unsigned long lease,
+ unsigned long lastrenewed)
+{
+ spin_lock(&clp->cl_lock);
+ clp->cl_lease_time = lease;
+ clp->cl_last_renewal = lastrenewed;
+ spin_unlock(&clp->cl_lock);
+
+ /* Cap maximum reconnect timeout at 1/2 lease period */
+ rpc_cap_max_reconnect_timeout(clp->cl_rpcclient, lease >> 1);
+}
+
/*
* Local variables:
* c-basic-offset: 8
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index 332d06e64fa9..b62973045a3e 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -28,6 +28,7 @@ static void nfs4_init_slot_table(struct nfs4_slot_table *tbl, const char *queue)
tbl->highest_used_slotid = NFS4_NO_SLOT;
spin_lock_init(&tbl->slot_tbl_lock);
rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, queue);
+ init_waitqueue_head(&tbl->slot_waitq);
init_completion(&tbl->complete);
}
@@ -172,6 +173,58 @@ struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid)
return ERR_PTR(-E2BIG);
}
+static int nfs4_slot_get_seqid(struct nfs4_slot_table *tbl, u32 slotid,
+ u32 *seq_nr)
+ __must_hold(&tbl->slot_tbl_lock)
+{
+ struct nfs4_slot *slot;
+
+ slot = nfs4_lookup_slot(tbl, slotid);
+ if (IS_ERR(slot))
+ return PTR_ERR(slot);
+ *seq_nr = slot->seq_nr;
+ return 0;
+}
+
+/*
+ * nfs4_slot_seqid_in_use - test if a slot sequence id is still in use
+ *
+ * Given a slot table, slot id and sequence number, determine if the
+ * RPC call in question is still in flight. This function is mainly
+ * intended for use by the callback channel.
+ */
+static bool nfs4_slot_seqid_in_use(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_nr)
+{
+ u32 cur_seq;
+ bool ret = false;
+
+ spin_lock(&tbl->slot_tbl_lock);
+ if (nfs4_slot_get_seqid(tbl, slotid, &cur_seq) == 0 &&
+ cur_seq == seq_nr && test_bit(slotid, tbl->used_slots))
+ ret = true;
+ spin_unlock(&tbl->slot_tbl_lock);
+ return ret;
+}
+
+/*
+ * nfs4_slot_wait_on_seqid - wait until a slot sequence id is complete
+ *
+ * Given a slot table, slot id and sequence number, wait until the
+ * corresponding RPC call completes. This function is mainly
+ * intended for use by the callback channel.
+ */
+int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_nr,
+ unsigned long timeout)
+{
+ if (wait_event_timeout(tbl->slot_waitq,
+ !nfs4_slot_seqid_in_use(tbl, slotid, seq_nr),
+ timeout) == 0)
+ return -ETIMEDOUT;
+ return 0;
+}
+
/*
* nfs4_alloc_slot - efficiently look for a free slot
*
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 5b51298d1d03..f703b755351b 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -21,7 +21,8 @@ struct nfs4_slot {
unsigned long generation;
u32 slot_nr;
u32 seq_nr;
- unsigned int interrupted : 1;
+ unsigned int interrupted : 1,
+ seq_done : 1;
};
/* Sessions */
@@ -36,6 +37,7 @@ struct nfs4_slot_table {
unsigned long used_slots[SLOT_TABLE_SZ]; /* used/unused bitmap */
spinlock_t slot_tbl_lock;
struct rpc_wait_queue slot_tbl_waitq; /* allocators may wait here */
+ wait_queue_head_t slot_waitq; /* Completion wait on slot */
u32 max_slots; /* # slots in table */
u32 max_slotid; /* Max allowed slotid value */
u32 highest_used_slotid; /* sent to server on each SEQ.
@@ -78,6 +80,9 @@ extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
extern struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid);
+extern int nfs4_slot_wait_on_seqid(struct nfs4_slot_table *tbl,
+ u32 slotid, u32 seq_nr,
+ unsigned long timeout);
extern bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 834b875900d6..cada00aa5096 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -277,20 +277,17 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
{
int status;
struct nfs_fsinfo fsinfo;
+ unsigned long now;
if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
nfs4_schedule_state_renewal(clp);
return 0;
}
+ now = jiffies;
status = nfs4_proc_get_lease_time(clp, &fsinfo);
if (status == 0) {
- /* Update lease time and schedule renewal */
- spin_lock(&clp->cl_lock);
- clp->cl_lease_time = fsinfo.lease_time * HZ;
- clp->cl_last_renewal = jiffies;
- spin_unlock(&clp->cl_lock);
-
+ nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now);
nfs4_schedule_state_renewal(clp);
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 70806cae0d36..2c93a85eda51 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -365,7 +365,8 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
atomic_dec(&lo->plh_refcount);
if (list_empty(&lo->plh_segs)) {
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ if (atomic_read(&lo->plh_outstanding) == 0)
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
@@ -768,17 +769,32 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
pnfs_destroy_layouts_byclid(clp, false);
}
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+ lo->plh_return_iomode = 0;
+ lo->plh_return_seq = 0;
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
bool update_barrier)
{
u32 oldseq, newseq, new_barrier = 0;
- bool invalid = !pnfs_layout_is_valid(lo);
oldseq = be32_to_cpu(lo->plh_stateid.seqid);
newseq = be32_to_cpu(new->seqid);
- if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
+
+ if (!pnfs_layout_is_valid(lo)) {
+ nfs4_stateid_copy(&lo->plh_stateid, new);
+ lo->plh_barrier = newseq;
+ pnfs_clear_layoutreturn_info(lo);
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ return;
+ }
+ if (pnfs_seqid_is_newer(newseq, oldseq)) {
nfs4_stateid_copy(&lo->plh_stateid, new);
/*
* Because of wraparound, we want to keep the barrier
@@ -790,7 +806,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
new_barrier = be32_to_cpu(new->seqid);
else if (new_barrier == 0)
return;
- if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+ if (pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
lo->plh_barrier = new_barrier;
}
@@ -886,19 +902,14 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
-static void
-pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
-{
- lo->plh_return_iomode = 0;
- lo->plh_return_seq = 0;
- clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
-}
-
static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
nfs4_stateid *stateid,
enum pnfs_iomode *iomode)
{
+ /* Serialise LAYOUTGET/LAYOUTRETURN */
+ if (atomic_read(&lo->plh_outstanding) != 0)
+ return false;
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false;
pnfs_get_layout_hdr(lo);
@@ -1555,6 +1566,7 @@ pnfs_update_layout(struct inode *ino,
}
lookup_again:
+ nfs4_client_recover_expired_lease(clp);
first = false;
spin_lock(&ino->i_lock);
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
@@ -1797,16 +1809,11 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
*/
pnfs_mark_layout_stateid_invalid(lo, &free_me);
- nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
- lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
+ pnfs_set_layout_stateid(lo, &res->stateid, true);
}
pnfs_get_lseg(lseg);
pnfs_layout_insert_lseg(lo, lseg, &free_me);
- if (!pnfs_layout_is_valid(lo)) {
- pnfs_clear_layoutreturn_info(lo);
- clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- }
if (res->return_on_close)
@@ -2510,7 +2517,6 @@ pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
data->args.fh = NFS_FH(inode);
data->args.inode = inode;
- nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid);
status = ld->prepare_layoutstats(&data->args);
if (status)
goto out_free;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 18d446e1a82b..d39601381adf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -923,6 +923,8 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (data) {
+ data->timeo = NFS_UNSPEC_TIMEO;
+ data->retrans = NFS_UNSPEC_RETRANS;
data->acregmin = NFS_DEF_ACREGMIN;
data->acregmax = NFS_DEF_ACREGMAX;
data->acdirmin = NFS_DEF_ACDIRMIN;
@@ -1189,6 +1191,19 @@ static int nfs_get_option_ul(substring_t args[], unsigned long *option)
return rc;
}
+static int nfs_get_option_ul_bound(substring_t args[], unsigned long *option,
+ unsigned long l_bound, unsigned long u_bound)
+{
+ int ret;
+
+ ret = nfs_get_option_ul(args, option);
+ if (ret != 0)
+ return ret;
+ if (*option < l_bound || *option > u_bound)
+ return -ERANGE;
+ return 0;
+}
+
/*
* Error-check and convert a string of mount options from user space into
* a data structure. The whole mount string is processed; bad options are
@@ -1352,12 +1367,12 @@ static int nfs_parse_mount_options(char *raw,
mnt->bsize = option;
break;
case Opt_timeo:
- if (nfs_get_option_ul(args, &option) || option == 0)
+ if (nfs_get_option_ul_bound(args, &option, 1, INT_MAX))
goto out_invalid_value;
mnt->timeo = option;
break;
case Opt_retrans:
- if (nfs_get_option_ul(args, &option) || option == 0)
+ if (nfs_get_option_ul_bound(args, &option, 0, INT_MAX))
goto out_invalid_value;
mnt->retrans = option;
break;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 8410ca275db1..a204d7e109d4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4903,6 +4903,32 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfs_ok;
}
+static __be32
+nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s)
+{
+ struct nfs4_ol_stateid *stp = openlockstateid(s);
+ __be32 ret;
+
+ mutex_lock(&stp->st_mutex);
+
+ ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
+ if (ret)
+ goto out;
+
+ ret = nfserr_locks_held;
+ if (check_for_locks(stp->st_stid.sc_file,
+ lockowner(stp->st_stateowner)))
+ goto out;
+
+ release_lock_stateid(stp);
+ ret = nfs_ok;
+
+out:
+ mutex_unlock(&stp->st_mutex);
+ nfs4_put_stid(s);
+ return ret;
+}
+
__be32
nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_free_stateid *free_stateid)
@@ -4910,7 +4936,6 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
stateid_t *stateid = &free_stateid->fr_stateid;
struct nfs4_stid *s;
struct nfs4_delegation *dp;
- struct nfs4_ol_stateid *stp;
struct nfs4_client *cl = cstate->session->se_client;
__be32 ret = nfserr_bad_stateid;
@@ -4929,18 +4954,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
ret = nfserr_locks_held;
break;
case NFS4_LOCK_STID:
- ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
- if (ret)
- break;
- stp = openlockstateid(s);
- ret = nfserr_locks_held;
- if (check_for_locks(stp->st_stid.sc_file,
- lockowner(stp->st_stateowner)))
- break;
- WARN_ON(!unhash_lock_stateid(stp));
+ atomic_inc(&s->sc_count);
spin_unlock(&cl->cl_lock);
- nfs4_put_stid(s);
- ret = nfs_ok;
+ ret = nfsd4_free_lock_stateid(stateid, s);
goto out;
case NFS4_REVOKED_DELEG_STID:
dp = delegstateid(s);
@@ -5507,7 +5523,7 @@ static __be32
lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
struct nfs4_ol_stateid *ost,
struct nfsd4_lock *lock,
- struct nfs4_ol_stateid **lst, bool *new)
+ struct nfs4_ol_stateid **plst, bool *new)
{
__be32 status;
struct nfs4_file *fi = ost->st_stid.sc_file;
@@ -5515,7 +5531,9 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
struct nfs4_client *cl = oo->oo_owner.so_client;
struct inode *inode = d_inode(cstate->current_fh.fh_dentry);
struct nfs4_lockowner *lo;
+ struct nfs4_ol_stateid *lst;
unsigned int strhashval;
+ bool hashed;
lo = find_lockowner_str(cl, &lock->lk_new_owner);
if (!lo) {
@@ -5531,12 +5549,27 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
goto out;
}
- *lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
- if (*lst == NULL) {
+retry:
+ lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
+ if (lst == NULL) {
status = nfserr_jukebox;
goto out;
}
+
+ mutex_lock(&lst->st_mutex);
+
+ /* See if it's still hashed to avoid race with FREE_STATEID */
+ spin_lock(&cl->cl_lock);
+ hashed = !list_empty(&lst->st_perfile);
+ spin_unlock(&cl->cl_lock);
+
+ if (!hashed) {
+ mutex_unlock(&lst->st_mutex);
+ nfs4_put_stid(&lst->st_stid);
+ goto retry;
+ }
status = nfs_ok;
+ *plst = lst;
out:
nfs4_put_stateowner(&lo->lo_owner);
return status;
@@ -5603,8 +5636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
status = lookup_or_create_lock_state(cstate, open_stp, lock,
&lock_stp, &new);
- if (status == nfs_ok)
- mutex_lock(&lock_stp->st_mutex);
} else {
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ba944123167b..ff476e654b8f 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1252,10 +1252,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (IS_ERR(dchild))
return nfserrno(host_err);
err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
- if (err) {
- dput(dchild);
+ /*
+ * We unconditionally drop our ref to dchild as fh_compose will have
+ * already grabbed its own ref for it.
+ */
+ dput(dchild);
+ if (err)
return err;
- }
return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
rdev, resfhp);
}
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index d2f97ecca6a5..e0e5f7c3c99f 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -67,18 +67,7 @@ static int fanotify_get_response(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- wait_event(group->fanotify_data.access_waitq, event->response ||
- atomic_read(&group->fanotify_data.bypass_perm));
-
- if (!event->response) { /* bypass_perm set */
- /*
- * Event was canceled because group is being destroyed. Remove
- * it from group's event list because we are responsible for
- * freeing the permission event.
- */
- fsnotify_remove_event(group, &event->fae.fse);
- return 0;
- }
+ wait_event(group->fanotify_data.access_waitq, event->response);
/* userspace responded, convert to something usable */
switch (event->response) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8e8e6bcd1d43..a64313868d3a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -358,16 +358,20 @@ static int fanotify_release(struct inode *ignored, struct file *file)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
struct fanotify_perm_event_info *event, *next;
+ struct fsnotify_event *fsn_event;
/*
- * There may be still new events arriving in the notification queue
- * but since userspace cannot use fanotify fd anymore, no event can
- * enter or leave access_list by now.
+ * Stop new events from arriving in the notification queue. since
+ * userspace cannot use fanotify fd anymore, no event can enter or
+ * leave access_list by now either.
*/
- spin_lock(&group->fanotify_data.access_lock);
-
- atomic_inc(&group->fanotify_data.bypass_perm);
+ fsnotify_group_stop_queueing(group);
+ /*
+ * Process all permission events on access_list and notification queue
+ * and simulate reply from userspace.
+ */
+ spin_lock(&group->fanotify_data.access_lock);
list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
fae.fse.list) {
pr_debug("%s: found group=%p event=%p\n", __func__, group,
@@ -379,12 +383,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
spin_unlock(&group->fanotify_data.access_lock);
/*
- * Since bypass_perm is set, newly queued events will not wait for
- * access response. Wake up the already sleeping ones now.
- * synchronize_srcu() in fsnotify_destroy_group() will wait for all
- * processes sleeping in fanotify_handle_event() waiting for access
- * response and thus also for all permission events to be freed.
+ * Destroy all non-permission events. For permission events just
+ * dequeue them and set the response. They will be freed once the
+ * response is consumed and fanotify_get_response() returns.
*/
+ mutex_lock(&group->notification_mutex);
+ while (!fsnotify_notify_queue_is_empty(group)) {
+ fsn_event = fsnotify_remove_first_event(group);
+ if (!(fsn_event->mask & FAN_ALL_PERM_EVENTS))
+ fsnotify_destroy_event(group, fsn_event);
+ else
+ FANOTIFY_PE(fsn_event)->response = FAN_ALLOW;
+ }
+ mutex_unlock(&group->notification_mutex);
+
+ /* Response for all permission events it set, wakeup waiters */
wake_up(&group->fanotify_data.access_waitq);
#endif
@@ -755,7 +768,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
spin_lock_init(&group->fanotify_data.access_lock);
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
- atomic_set(&group->fanotify_data.bypass_perm, 0);
#endif
switch (flags & FAN_ALL_CLASS_BITS) {
case FAN_CLASS_NOTIF:
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 3e2dd85be5dd..b47f7cfdcaa4 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -40,6 +40,17 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
}
/*
+ * Stop queueing new events for this group. Once this function returns
+ * fsnotify_add_event() will not add any new events to the group's queue.
+ */
+void fsnotify_group_stop_queueing(struct fsnotify_group *group)
+{
+ mutex_lock(&group->notification_mutex);
+ group->shutdown = true;
+ mutex_unlock(&group->notification_mutex);
+}
+
+/*
* Trying to get rid of a group. Remove all marks, flush all events and release
* the group reference.
* Note that another thread calling fsnotify_clear_marks_by_group() may still
@@ -47,6 +58,14 @@ static void fsnotify_final_destroy_group(struct fsnotify_group *group)
*/
void fsnotify_destroy_group(struct fsnotify_group *group)
{
+ /*
+ * Stop queueing new events. The code below is careful enough to not
+ * require this but fanotify needs to stop queuing events even before
+ * fsnotify_destroy_group() is called and this makes the other callers
+ * of fsnotify_destroy_group() to see the same behavior.
+ */
+ fsnotify_group_stop_queueing(group);
+
/* clear all inode marks for this group, attach them to destroy_list */
fsnotify_detach_group_marks(group);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index a95d8e037aeb..e455e83ceeeb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -82,7 +82,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
* Add an event to the group notification queue. The group can later pull this
* event off the queue to deal with. The function returns 0 if the event was
* added to the queue, 1 if the event was merged with some other queued event,
- * 2 if the queue of events has overflown.
+ * 2 if the event was not queued - either the queue of events has overflown
+ * or the group is shutting down.
*/
int fsnotify_add_event(struct fsnotify_group *group,
struct fsnotify_event *event,
@@ -96,6 +97,11 @@ int fsnotify_add_event(struct fsnotify_group *group,
mutex_lock(&group->notification_mutex);
+ if (group->shutdown) {
+ mutex_unlock(&group->notification_mutex);
+ return 2;
+ }
+
if (group->q_len >= group->max_events) {
ret = 2;
/* Queue overflow event only if it isn't already queued */
@@ -126,21 +132,6 @@ queue:
}
/*
- * Remove @event from group's notification queue. It is the responsibility of
- * the caller to destroy the event.
- */
-void fsnotify_remove_event(struct fsnotify_group *group,
- struct fsnotify_event *event)
-{
- mutex_lock(&group->notification_mutex);
- if (!list_empty(&event->list)) {
- list_del_init(&event->list);
- group->q_len--;
- }
- mutex_unlock(&group->notification_mutex);
-}
-
-/*
* Remove and return the first event from the notification list. It is the
* responsibility of the caller to destroy the obtained event
*/
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index f548629dfaac..bf72a2c58b75 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1850,7 +1850,7 @@ again:
* pages being swapped out between us bringing them into memory
* and doing the actual copying.
*/
- if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) {
+ if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7dabbc31060e..f165f867f332 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5922,7 +5922,6 @@ bail:
}
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
- handle_t *handle,
struct inode *data_alloc_inode,
struct buffer_head *data_alloc_bh)
{
@@ -5935,11 +5934,19 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
struct ocfs2_truncate_log *tl;
struct inode *tl_inode = osb->osb_tl_inode;
struct buffer_head *tl_bh = osb->osb_tl_bh;
+ handle_t *handle;
di = (struct ocfs2_dinode *) tl_bh->b_data;
tl = &di->id2.i_dealloc;
i = le16_to_cpu(tl->tl_used) - 1;
while (i >= 0) {
+ handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail;
+ }
+
/* Caller has given us at least enough credits to
* update the truncate log dinode */
status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
@@ -5974,12 +5981,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
}
}
- status = ocfs2_extend_trans(handle,
- OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
+ ocfs2_commit_trans(osb, handle);
i--;
}
@@ -5994,7 +5996,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
int status;
unsigned int num_to_flush;
- handle_t *handle;
struct inode *tl_inode = osb->osb_tl_inode;
struct inode *data_alloc_inode = NULL;
struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -6038,21 +6039,11 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto out_unlock;
- }
-
- status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+ status = ocfs2_replay_truncate_records(osb, data_alloc_inode,
data_alloc_bh);
if (status < 0)
mlog_errno(status);
- ocfs2_commit_trans(osb, handle);
-
-out_unlock:
brelse(data_alloc_bh);
ocfs2_inode_unlock(data_alloc_inode, 1);
@@ -6413,43 +6404,34 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
goto out_mutex;
}
- handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out_unlock;
- }
-
while (head) {
if (head->free_bg)
bg_blkno = head->free_bg;
else
bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
head->free_bit);
+ handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
trace_ocfs2_free_cached_blocks(
(unsigned long long)head->free_blk, head->free_bit);
ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
head->free_bit, bg_blkno, 1);
- if (ret) {
+ if (ret)
mlog_errno(ret);
- goto out_journal;
- }
- ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
- if (ret) {
- mlog_errno(ret);
- goto out_journal;
- }
+ ocfs2_commit_trans(osb, handle);
tmp = head;
head = head->free_next;
kfree(tmp);
}
-out_journal:
- ocfs2_commit_trans(osb, handle);
-
out_unlock:
ocfs2_inode_unlock(inode, 1);
brelse(di_bh);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 94b18369b1cc..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,9 +44,6 @@
* version here in tcp_internal.h should not need to be bumped for
* filesystem locking changes.
*
- * New in version 12
- * - Negotiate hb timeout when storage is down.
- *
* New in version 11
* - Negotiation of filesystem locking in the dlm join.
*
@@ -78,7 +75,7 @@
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 12ULL
+#define O2NET_PROTOCOL_VERSION 11ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index cdeafb4e7ed6..0bb128659d4b 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -268,7 +268,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
struct dlm_lock *lock, int flags, int type)
{
enum dlm_status status;
- u8 old_owner = res->owner;
mlog(0, "type=%d, convert_type=%d, busy=%d\n", lock->ml.type,
lock->ml.convert_type, res->state & DLM_LOCK_RES_IN_PROGRESS);
@@ -335,7 +334,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
- lock->convert_pending = 0;
/* if it failed, move it back to granted queue.
* if master returns DLM_NORMAL and then down before sending ast,
* it may have already been moved to granted queue, reset to
@@ -344,12 +342,14 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
if (status != DLM_NOTQUEUED)
dlm_error(status);
dlm_revert_pending_convert(res, lock);
- } else if ((res->state & DLM_LOCK_RES_RECOVERING) ||
- (old_owner != res->owner)) {
- mlog(0, "res %.*s is in recovering or has been recovered.\n",
- res->lockname.len, res->lockname.name);
+ } else if (!lock->convert_pending) {
+ mlog(0, "%s: res %.*s, owner died and lock has been moved back "
+ "to granted list, retry convert.\n",
+ dlm->name, res->lockname.len, res->lockname.name);
status = DLM_RECOVERING;
}
+
+ lock->convert_pending = 0;
bail:
spin_unlock(&res->spinlock);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4e7b0dc22450..0b055bfb8e86 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1506,7 +1506,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
u64 start, u64 len)
{
int ret = 0;
- u64 tmpend, end = start + len;
+ u64 tmpend = 0;
+ u64 end = start + len;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
unsigned int csize = osb->s_clustersize;
handle_t *handle;
@@ -1538,18 +1539,31 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
}
/*
- * We want to get the byte offset of the end of the 1st cluster.
+ * If start is on a cluster boundary and end is somewhere in another
+ * cluster, we have not COWed the cluster starting at start, unless
+ * end is also within the same cluster. So, in this case, we skip this
+ * first call to ocfs2_zero_range_for_truncate() truncate and move on
+ * to the next one.
*/
- tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
- if (tmpend > end)
- tmpend = end;
+ if ((start & (csize - 1)) != 0) {
+ /*
+ * We want to get the byte offset of the end of the 1st
+ * cluster.
+ */
+ tmpend = (u64)osb->s_clustersize +
+ (start & ~(osb->s_clustersize - 1));
+ if (tmpend > end)
+ tmpend = end;
- trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
- (unsigned long long)tmpend);
+ trace_ocfs2_zero_partial_clusters_range1(
+ (unsigned long long)start,
+ (unsigned long long)tmpend);
- ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
- if (ret)
- mlog_errno(ret);
+ ret = ocfs2_zero_range_for_truncate(inode, handle, start,
+ tmpend);
+ if (ret)
+ mlog_errno(ret);
+ }
if (tmpend < end) {
/*
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index ea47120a85ff..6ad3533940ba 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1199,14 +1199,24 @@ retry:
inode_unlock((*ac)->ac_inode);
ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
- if (ret == 1)
+ if (ret == 1) {
+ iput((*ac)->ac_inode);
+ (*ac)->ac_inode = NULL;
goto retry;
+ }
if (ret < 0)
mlog_errno(ret);
inode_lock((*ac)->ac_inode);
- ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+ ret = ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ inode_unlock((*ac)->ac_inode);
+ iput((*ac)->ac_inode);
+ (*ac)->ac_inode = NULL;
+ goto bail;
+ }
}
if (status < 0) {
if (status != -ENOSPC)
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 54e5d6681786..43fdc2765aea 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -80,6 +80,8 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
+ if (ovl_is_private_xattr(name))
+ continue;
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 12bcd07b9e32..1560fdc09a5f 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -12,6 +12,8 @@
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
@@ -186,6 +188,9 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct dentry *newdentry;
int err;
+ if (!hardlink && !IS_POSIXACL(udir))
+ stat->mode &= ~current_umask();
+
inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
@@ -335,6 +340,32 @@ out_free:
return ret;
}
+static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name,
+ const struct posix_acl *acl)
+{
+ void *buffer;
+ size_t size;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !acl)
+ return 0;
+
+ size = posix_acl_to_xattr(NULL, acl, NULL, 0);
+ buffer = kmalloc(size, GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ size = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
+ err = size;
+ if (err < 0)
+ goto out_free;
+
+ err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE);
+out_free:
+ kfree(buffer);
+ return err;
+}
+
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
@@ -346,10 +377,18 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct dentry *upper;
struct dentry *newdentry;
int err;
+ struct posix_acl *acl, *default_acl;
if (WARN_ON(!workdir))
return -EROFS;
+ if (!hardlink) {
+ err = posix_acl_create(dentry->d_parent->d_inode,
+ &stat->mode, &default_acl, &acl);
+ if (err)
+ return err;
+ }
+
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
@@ -384,6 +423,17 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
}
+ if (!hardlink) {
+ err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_ACCESS,
+ acl);
+ if (err)
+ goto out_cleanup;
+
+ err = ovl_set_upper_acl(newdentry, XATTR_NAME_POSIX_ACL_DEFAULT,
+ default_acl);
+ if (err)
+ goto out_cleanup;
+ }
if (!hardlink && S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
@@ -410,6 +460,10 @@ out_dput:
out_unlock:
unlock_rename(workdir, upperdir);
out:
+ if (!hardlink) {
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
+ }
return err;
out_cleanup:
@@ -950,9 +1004,9 @@ const struct inode_operations ovl_dir_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = generic_setxattr,
- .getxattr = ovl_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = ovl_listxattr,
- .removexattr = ovl_removexattr,
+ .removexattr = generic_removexattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 1b885c156028..c75625c1efa3 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -10,6 +10,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
+#include <linux/posix_acl.h>
#include "overlayfs.h"
static int ovl_copy_up_truncate(struct dentry *dentry)
@@ -191,32 +192,44 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
return err;
}
-static bool ovl_is_private_xattr(const char *name)
+bool ovl_is_private_xattr(const char *name)
{
-#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "."
- return strncmp(name, OVL_XATTR_PRE_NAME,
- sizeof(OVL_XATTR_PRE_NAME) - 1) == 0;
+ return strncmp(name, OVL_XATTR_PREFIX,
+ sizeof(OVL_XATTR_PREFIX) - 1) == 0;
}
-int ovl_setxattr(struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
+int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
int err;
- struct dentry *upperdentry;
+ struct path realpath;
+ enum ovl_path_type type = ovl_path_real(dentry, &realpath);
const struct cred *old_cred;
err = ovl_want_write(dentry);
if (err)
goto out;
+ if (!value && !OVL_TYPE_UPPER(type)) {
+ err = vfs_getxattr(realpath.dentry, name, NULL, 0);
+ if (err < 0)
+ goto out_drop_write;
+ }
+
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
- upperdentry = ovl_dentry_upper(dentry);
+ if (!OVL_TYPE_UPPER(type))
+ ovl_path_upper(dentry, &realpath);
+
old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_setxattr(upperdentry, name, value, size, flags);
+ if (value)
+ err = vfs_setxattr(realpath.dentry, name, value, size, flags);
+ else {
+ WARN_ON(flags != XATTR_REPLACE);
+ err = vfs_removexattr(realpath.dentry, name);
+ }
revert_creds(old_cred);
out_drop_write:
@@ -225,16 +238,13 @@ out:
return err;
}
-ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
- const char *name, void *value, size_t size)
+int ovl_xattr_get(struct dentry *dentry, const char *name,
+ void *value, size_t size)
{
struct dentry *realdentry = ovl_dentry_real(dentry);
ssize_t res;
const struct cred *old_cred;
- if (ovl_is_private_xattr(name))
- return -ENODATA;
-
old_cred = ovl_override_creds(dentry->d_sb);
res = vfs_getxattr(realdentry, name, value, size);
revert_creds(old_cred);
@@ -245,7 +255,8 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct dentry *realdentry = ovl_dentry_real(dentry);
ssize_t res;
- int off;
+ size_t len;
+ char *s;
const struct cred *old_cred;
old_cred = ovl_override_creds(dentry->d_sb);
@@ -255,73 +266,39 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
return res;
/* filter out private xattrs */
- for (off = 0; off < res;) {
- char *s = list + off;
- size_t slen = strlen(s) + 1;
+ for (s = list, len = res; len;) {
+ size_t slen = strnlen(s, len) + 1;
- BUG_ON(off + slen > res);
+ /* underlying fs providing us with an broken xattr list? */
+ if (WARN_ON(slen > len))
+ return -EIO;
+ len -= slen;
if (ovl_is_private_xattr(s)) {
res -= slen;
- memmove(s, s + slen, res - off);
+ memmove(s, s + slen, len);
} else {
- off += slen;
+ s += slen;
}
}
return res;
}
-int ovl_removexattr(struct dentry *dentry, const char *name)
-{
- int err;
- struct path realpath;
- enum ovl_path_type type = ovl_path_real(dentry, &realpath);
- const struct cred *old_cred;
-
- err = ovl_want_write(dentry);
- if (err)
- goto out;
-
- err = -ENODATA;
- if (ovl_is_private_xattr(name))
- goto out_drop_write;
-
- if (!OVL_TYPE_UPPER(type)) {
- err = vfs_getxattr(realpath.dentry, name, NULL, 0);
- if (err < 0)
- goto out_drop_write;
-
- err = ovl_copy_up(dentry);
- if (err)
- goto out_drop_write;
-
- ovl_path_upper(dentry, &realpath);
- }
-
- old_cred = ovl_override_creds(dentry->d_sb);
- err = vfs_removexattr(realpath.dentry, name);
- revert_creds(old_cred);
-out_drop_write:
- ovl_drop_write(dentry);
-out:
- return err;
-}
-
struct posix_acl *ovl_get_acl(struct inode *inode, int type)
{
struct inode *realinode = ovl_inode_real(inode, NULL);
const struct cred *old_cred;
struct posix_acl *acl;
- if (!IS_POSIXACL(realinode))
+ if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
return NULL;
if (!realinode->i_op->get_acl)
return NULL;
old_cred = ovl_override_creds(inode->i_sb);
- acl = realinode->i_op->get_acl(realinode, type);
+ acl = get_acl(realinode, type);
revert_creds(old_cred);
return acl;
@@ -391,9 +368,9 @@ static const struct inode_operations ovl_file_inode_operations = {
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = generic_setxattr,
- .getxattr = ovl_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = ovl_listxattr,
- .removexattr = ovl_removexattr,
+ .removexattr = generic_removexattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
};
@@ -404,9 +381,9 @@ static const struct inode_operations ovl_symlink_inode_operations = {
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = generic_setxattr,
- .getxattr = ovl_getxattr,
+ .getxattr = generic_getxattr,
.listxattr = ovl_listxattr,
- .removexattr = ovl_removexattr,
+ .removexattr = generic_removexattr,
.update_time = ovl_update_time,
};
@@ -415,6 +392,9 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode)
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOCMTIME;
+#ifdef CONFIG_FS_POSIX_ACL
+ inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
+#endif
mode &= S_IFMT;
switch (mode) {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e4f5c9536bfe..5813ccff8cd9 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -24,8 +24,8 @@ enum ovl_path_type {
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
-#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay"
-#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque"
+#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
+#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
#define OVL_ISUPPER_MASK 1UL
@@ -179,20 +179,21 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
int ovl_check_d_type_supported(struct path *realpath);
+void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
+ struct dentry *dentry, int level);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
-int ovl_setxattr(struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags);
-ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
- const char *name, void *value, size_t size);
+int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags);
+int ovl_xattr_get(struct dentry *dentry, const char *name,
+ void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
-int ovl_removexattr(struct dentry *dentry, const char *name);
struct posix_acl *ovl_get_acl(struct inode *inode, int type);
int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
+bool ovl_is_private_xattr(const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index cf37fc76fc9f..f241b4ee3d8a 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -248,7 +248,7 @@ static inline int ovl_dir_read(struct path *realpath,
err = rdd->err;
} while (!err && rdd->count);
- if (!err && rdd->first_maybe_whiteout)
+ if (!err && rdd->first_maybe_whiteout && rdd->dentry)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
@@ -606,3 +606,64 @@ int ovl_check_d_type_supported(struct path *realpath)
return rdd.d_type_supported;
}
+
+static void ovl_workdir_cleanup_recurse(struct path *path, int level)
+{
+ int err;
+ struct inode *dir = path->dentry->d_inode;
+ LIST_HEAD(list);
+ struct ovl_cache_entry *p;
+ struct ovl_readdir_data rdd = {
+ .ctx.actor = ovl_fill_merge,
+ .dentry = NULL,
+ .list = &list,
+ .root = RB_ROOT,
+ .is_lowest = false,
+ };
+
+ err = ovl_dir_read(path, &rdd);
+ if (err)
+ goto out;
+
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ list_for_each_entry(p, &list, l_node) {
+ struct dentry *dentry;
+
+ if (p->name[0] == '.') {
+ if (p->len == 1)
+ continue;
+ if (p->len == 2 && p->name[1] == '.')
+ continue;
+ }
+ dentry = lookup_one_len(p->name, path->dentry, p->len);
+ if (IS_ERR(dentry))
+ continue;
+ if (dentry->d_inode)
+ ovl_workdir_cleanup(dir, path->mnt, dentry, level);
+ dput(dentry);
+ }
+ inode_unlock(dir);
+out:
+ ovl_cache_free(&list);
+}
+
+void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
+ struct dentry *dentry, int level)
+{
+ int err;
+
+ if (!d_is_dir(dentry) || level > 1) {
+ ovl_cleanup(dir, dentry);
+ return;
+ }
+
+ err = ovl_do_rmdir(dir, dentry);
+ if (err) {
+ struct path path = { .mnt = mnt, .dentry = dentry };
+
+ inode_unlock(dir);
+ ovl_workdir_cleanup_recurse(&path, level + 1);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
+ ovl_cleanup(dir, dentry);
+ }
+}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 4036132842b5..e2a94a26767b 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -814,6 +814,10 @@ retry:
struct kstat stat = {
.mode = S_IFDIR | 0,
};
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = stat.mode,
+ };
if (work->d_inode) {
err = -EEXIST;
@@ -821,7 +825,7 @@ retry:
goto out_dput;
retried = true;
- ovl_cleanup(dir, work);
+ ovl_workdir_cleanup(dir, mnt, work, 0);
dput(work);
goto retry;
}
@@ -829,6 +833,21 @@ retry:
err = ovl_create_real(dir, work, &stat, NULL, NULL, true);
if (err)
goto out_dput;
+
+ err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT);
+ if (err && err != -ENODATA && err != -EOPNOTSUPP)
+ goto out_dput;
+
+ err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS);
+ if (err && err != -ENODATA && err != -EOPNOTSUPP)
+ goto out_dput;
+
+ /* Clear any inherited mode bits */
+ inode_lock(work->d_inode);
+ err = notify_change(work, &attr, NULL);
+ inode_unlock(work->d_inode);
+ if (err)
+ goto out_dput;
}
out_unlock:
inode_unlock(dir);
@@ -967,10 +986,19 @@ static unsigned int ovl_split_lowerdirs(char *str)
return ctr;
}
-static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
+static int __maybe_unused
+ovl_posix_acl_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return ovl_xattr_get(dentry, handler->name, buffer, size);
+}
+
+static int __maybe_unused
+ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *realinode = ovl_inode_real(inode, NULL);
@@ -998,19 +1026,22 @@ static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
posix_acl_release(acl);
- return ovl_setxattr(dentry, inode, handler->name, value, size, flags);
+ err = ovl_xattr_set(dentry, handler->name, value, size, flags);
+ if (!err)
+ ovl_copyattr(ovl_inode_real(inode, NULL), inode);
+
+ return err;
out_acl_release:
posix_acl_release(acl);
return err;
}
-static int ovl_other_xattr_set(const struct xattr_handler *handler,
- struct dentry *dentry, struct inode *inode,
- const char *name, const void *value,
- size_t size, int flags)
+static int ovl_own_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
{
- return ovl_setxattr(dentry, inode, name, value, size, flags);
+ return -EPERM;
}
static int ovl_own_xattr_set(const struct xattr_handler *handler,
@@ -1021,42 +1052,59 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler,
return -EPERM;
}
-static const struct xattr_handler ovl_posix_acl_access_xattr_handler = {
+static int ovl_other_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, void *buffer, size_t size)
+{
+ return ovl_xattr_get(dentry, name, buffer, size);
+}
+
+static int ovl_other_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return ovl_xattr_set(dentry, name, value, size, flags);
+}
+
+static const struct xattr_handler __maybe_unused
+ovl_posix_acl_access_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_ACCESS,
.flags = ACL_TYPE_ACCESS,
+ .get = ovl_posix_acl_xattr_get,
.set = ovl_posix_acl_xattr_set,
};
-static const struct xattr_handler ovl_posix_acl_default_xattr_handler = {
+static const struct xattr_handler __maybe_unused
+ovl_posix_acl_default_xattr_handler = {
.name = XATTR_NAME_POSIX_ACL_DEFAULT,
.flags = ACL_TYPE_DEFAULT,
+ .get = ovl_posix_acl_xattr_get,
.set = ovl_posix_acl_xattr_set,
};
static const struct xattr_handler ovl_own_xattr_handler = {
.prefix = OVL_XATTR_PREFIX,
+ .get = ovl_own_xattr_get,
.set = ovl_own_xattr_set,
};
static const struct xattr_handler ovl_other_xattr_handler = {
.prefix = "", /* catch all */
+ .get = ovl_other_xattr_get,
.set = ovl_other_xattr_set,
};
static const struct xattr_handler *ovl_xattr_handlers[] = {
+#ifdef CONFIG_FS_POSIX_ACL
&ovl_posix_acl_access_xattr_handler,
&ovl_posix_acl_default_xattr_handler,
+#endif
&ovl_own_xattr_handler,
&ovl_other_xattr_handler,
NULL
};
-static const struct xattr_handler *ovl_xattr_noacl_handlers[] = {
- &ovl_own_xattr_handler,
- &ovl_other_xattr_handler,
- NULL,
-};
-
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { NULL, NULL };
@@ -1132,7 +1180,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
err = -EINVAL;
stacklen = ovl_split_lowerdirs(lowertmp);
if (stacklen > OVL_MAX_STACK) {
- pr_err("overlayfs: too many lower directries, limit is %d\n",
+ pr_err("overlayfs: too many lower directories, limit is %d\n",
OVL_MAX_STACK);
goto out_free_lowertmp;
} else if (!ufs->config.upperdir && stacklen == 1) {
@@ -1269,10 +1317,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
sb->s_op = &ovl_super_operations;
- if (IS_ENABLED(CONFIG_FS_POSIX_ACL))
- sb->s_xattr = ovl_xattr_handlers;
- else
- sb->s_xattr = ovl_xattr_noacl_handlers;
+ sb->s_xattr = ovl_xattr_handlers;
sb->s_root = root_dentry;
sb->s_fs_info = ufs;
sb->s_flags |= MS_POSIXACL;
diff --git a/fs/pipe.c b/fs/pipe.c
index 4b32928f5426..4ebe6b2e5217 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -144,10 +144,8 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
struct page *page = buf->page;
if (page_count(page) == 1) {
- if (memcg_kmem_enabled()) {
+ if (memcg_kmem_enabled())
memcg_kmem_uncharge(page, 0);
- __ClearPageKmemcg(page);
- }
__SetPageLocked(page);
return 0;
}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 59d47ab0791a..8cdcbb1dd092 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -633,15 +633,15 @@ static void posix_acl_fix_xattr_userns(
struct user_namespace *to, struct user_namespace *from,
void *value, size_t size)
{
- posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
- posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
+ struct posix_acl_xattr_header *header = value;
+ struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end;
int count;
kuid_t uid;
kgid_t gid;
if (!value)
return;
- if (size < sizeof(posix_acl_xattr_header))
+ if (size < sizeof(struct posix_acl_xattr_header))
return;
if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
return;
@@ -691,15 +691,15 @@ struct posix_acl *
posix_acl_from_xattr(struct user_namespace *user_ns,
const void *value, size_t size)
{
- posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
- posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
+ const struct posix_acl_xattr_header *header = value;
+ const struct posix_acl_xattr_entry *entry = (const void *)(header + 1), *end;
int count;
struct posix_acl *acl;
struct posix_acl_entry *acl_e;
if (!value)
return NULL;
- if (size < sizeof(posix_acl_xattr_header))
+ if (size < sizeof(struct posix_acl_xattr_header))
return ERR_PTR(-EINVAL);
if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
return ERR_PTR(-EOPNOTSUPP);
@@ -760,8 +760,8 @@ int
posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
void *buffer, size_t size)
{
- posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
- posix_acl_xattr_entry *ext_entry;
+ struct posix_acl_xattr_header *ext_acl = buffer;
+ struct posix_acl_xattr_entry *ext_entry;
int real_size, n;
real_size = posix_acl_xattr_size(acl->a_count);
@@ -770,7 +770,7 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
if (real_size > size)
return -ERANGE;
- ext_entry = ext_acl->a_entries;
+ ext_entry = (void *)(ext_acl + 1);
ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
for (n=0; n < acl->a_count; n++, ext_entry++) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 54e270262979..528c1bb57bc3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -400,23 +400,6 @@ static const struct file_operations proc_pid_cmdline_ops = {
.llseek = generic_file_llseek,
};
-static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *task)
-{
- struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
- if (mm && !IS_ERR(mm)) {
- unsigned int nwords = 0;
- do {
- nwords += 2;
- } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
- seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
- mmput(mm);
- return 0;
- } else
- return PTR_ERR(mm);
-}
-
-
#ifdef CONFIG_KALLSYMS
/*
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
@@ -1014,6 +997,30 @@ static const struct file_operations proc_environ_operations = {
.release = mem_release,
};
+static int auxv_open(struct inode *inode, struct file *file)
+{
+ return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS);
+}
+
+static ssize_t auxv_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct mm_struct *mm = file->private_data;
+ unsigned int nwords = 0;
+ do {
+ nwords += 2;
+ } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
+ return simple_read_from_buffer(buf, count, ppos, mm->saved_auxv,
+ nwords * sizeof(mm->saved_auxv[0]));
+}
+
+static const struct file_operations proc_auxv_operations = {
+ .open = auxv_open,
+ .read = auxv_read,
+ .llseek = generic_file_llseek,
+ .release = mem_release,
+};
+
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
loff_t *ppos)
{
@@ -1556,18 +1563,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
struct task_struct *task;
- struct mm_struct *mm;
struct file *exe_file;
task = get_proc_task(d_inode(dentry));
if (!task)
return -ENOENT;
- mm = get_task_mm(task);
+ exe_file = get_task_exe_file(task);
put_task_struct(task);
- if (!mm)
- return -ENOENT;
- exe_file = get_mm_exe_file(mm);
- mmput(mm);
if (exe_file) {
*exe_path = exe_file->f_path;
path_get(&exe_file->f_path);
@@ -2827,7 +2829,7 @@ static const struct pid_entry tgid_base_stuff[] = {
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
REG("environ", S_IRUSR, proc_environ_operations),
- ONE("auxv", S_IRUSR, proc_pid_auxv),
+ REG("auxv", S_IRUSR, proc_auxv_operations),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
@@ -3215,7 +3217,7 @@ static const struct pid_entry tid_base_stuff[] = {
DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
REG("environ", S_IRUSR, proc_environ_operations),
- ONE("auxv", S_IRUSR, proc_pid_auxv),
+ REG("auxv", S_IRUSR, proc_auxv_operations),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUSR, proc_pid_personality),
ONE("limits", S_IRUGO, proc_pid_limits),
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 01df23cc81f6..d21dafef3102 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -31,7 +31,7 @@ static int seq_show(struct seq_file *m, void *v)
put_task_struct(task);
if (files) {
- int fd = proc_fd(m->private);
+ unsigned int fd = proc_fd(m->private);
spin_lock(&files->file_lock);
file = fcheck_files(files, fd);
@@ -86,7 +86,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
struct task_struct *task;
const struct cred *cred;
struct inode *inode;
- int fd;
+ unsigned int fd;
if (flags & LOOKUP_RCU)
return -ECHILD;
@@ -158,7 +158,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
}
if (files) {
- int fd = proc_fd(d_inode(dentry));
+ unsigned int fd = proc_fd(d_inode(dentry));
struct file *fd_file;
spin_lock(&files->file_lock);
@@ -253,7 +253,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
continue;
rcu_read_unlock();
- len = snprintf(name, sizeof(name), "%d", fd);
+ len = snprintf(name, sizeof(name), "%u", fd);
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
(void *)(unsigned long)fd))
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
index 7c047f256ae2..46dafadd0083 100644
--- a/fs/proc/fd.h
+++ b/fs/proc/fd.h
@@ -11,7 +11,7 @@ extern const struct inode_operations proc_fdinfo_inode_operations;
extern int proc_fd_permission(struct inode *inode, int mask);
-static inline int proc_fd(struct inode *inode)
+static inline unsigned int proc_fd(struct inode *inode)
{
return PROC_I(inode)->fd;
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 7931c558c192..5378441ec1b7 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -60,7 +60,7 @@ union proc_op {
struct proc_inode {
struct pid *pid;
- int fd;
+ unsigned int fd;
union proc_op op;
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a939f5ed7f89..5c89a07e3d7f 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -430,6 +430,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff)
static ssize_t
read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
{
+ char *buf = file->private_data;
ssize_t acc = 0;
size_t size, tsz;
size_t elf_buflen;
@@ -500,23 +501,20 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
if (clear_user(buffer, tsz))
return -EFAULT;
} else if (is_vmalloc_or_module_addr((void *)start)) {
- char * elf_buf;
-
- elf_buf = kzalloc(tsz, GFP_KERNEL);
- if (!elf_buf)
- return -ENOMEM;
- vread(elf_buf, (char *)start, tsz);
+ vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
- if (copy_to_user(buffer, elf_buf, tsz)) {
- kfree(elf_buf);
+ if (copy_to_user(buffer, buf, tsz))
return -EFAULT;
- }
- kfree(elf_buf);
} else {
if (kern_addr_valid(start)) {
unsigned long n;
- n = copy_to_user(buffer, (char *)start, tsz);
+ /*
+ * Using bounce buffer to bypass the
+ * hardened user copy kernel text checks.
+ */
+ memcpy(buf, (char *) start, tsz);
+ n = copy_to_user(buffer, buf, tsz);
/*
* We cannot distinguish between fault on source
* and fault on destination. When this happens
@@ -549,6 +547,11 @@ static int open_kcore(struct inode *inode, struct file *filp)
{
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
+
+ filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!filp->private_data)
+ return -ENOMEM;
+
if (kcore_need_update)
kcore_update_ram();
if (i_size_read(inode) != proc_root_kcore->size) {
@@ -559,10 +562,16 @@ static int open_kcore(struct inode *inode, struct file *filp)
return 0;
}
+static int release_kcore(struct inode *inode, struct file *file)
+{
+ kfree(file->private_data);
+ return 0;
+}
static const struct file_operations proc_kcore_operations = {
.read = read_kcore,
.open = open_kcore,
+ .release = release_kcore,
.llseek = default_llseek,
};
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 09e18fdf61e5..b9a8c813e5e6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -46,7 +46,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
cached = 0;
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
- pages[lru] = global_page_state(NR_LRU_BASE + lru);
+ pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
available = si_mem_available();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 187d84ef9de9..f6fa99eca515 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -581,6 +581,8 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
mss->anonymous_thp += HPAGE_PMD_SIZE;
else if (PageSwapBacked(page))
mss->shmem_thp += HPAGE_PMD_SIZE;
+ else if (is_zone_device_page(page))
+ /* pass */;
else
VM_BUG_ON_PAGE(1, page);
smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 183a212694bf..12af0490322f 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -27,9 +27,17 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/ramfs.h>
+#include <linux/sched.h>
#include "internal.h"
+static unsigned long ramfs_mmu_get_unmapped_area(struct file *file,
+ unsigned long addr, unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
+}
+
const struct file_operations ramfs_file_operations = {
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
@@ -38,6 +46,7 @@ const struct file_operations ramfs_file_operations = {
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
+ .get_unmapped_area = ramfs_mmu_get_unmapped_area,
};
const struct inode_operations ramfs_file_inode_operations = {
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 19f532e7d35e..6dc4296eed62 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -223,8 +223,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
size -= n;
buf += n;
copied += n;
- if (!m->count)
+ if (!m->count) {
+ m->from = 0;
m->index++;
+ }
if (!size)
goto Done;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index f35523d4fa3a..b803213d1307 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -114,9 +114,15 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
* If buf != of->prealloc_buf, we don't know how
* large it is, so cannot safely pass it to ->show
*/
- if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
+ if (WARN_ON_ONCE(buf != of->prealloc_buf))
return 0;
len = ops->show(kobj, of->kn->priv, buf);
+ if (pos) {
+ if (len <= pos)
+ return 0;
+ len -= pos;
+ memmove(buf, buf + pos, len);
+ }
return min(count, len);
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index b45345d701e7..51157da3f76e 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -370,7 +370,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
p = c->gap_lebs;
do {
- ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs);
+ ubifs_assert(p < c->gap_lebs + c->lst.idx_lebs);
written = layout_leb_in_gaps(c, p);
if (written < 0) {
err = written;
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e237811f09ce..11a004114eba 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -575,7 +575,8 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
dbg_gen("xattr '%s', ino %lu ('%pd'), buf size %zd", name,
inode->i_ino, dentry, size);
- return __ubifs_getxattr(inode, name, buffer, size);
+ name = xattr_full_name(handler, name);
+ return __ubifs_getxattr(inode, name, buffer, size);
}
static int ubifs_xattr_set(const struct xattr_handler *handler,
@@ -586,6 +587,8 @@ static int ubifs_xattr_set(const struct xattr_handler *handler,
dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
name, inode->i_ino, dentry, size);
+ name = xattr_full_name(handler, name);
+
if (value)
return __ubifs_setxattr(inode, name, value, size, flags);
else
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 776ae2f325d1..05b5243d89f6 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1582,6 +1582,7 @@ xfs_alloc_ag_vextent_small(
xfs_extlen_t *flenp, /* result length */
int *stat) /* status: 0-freelist, 1-normal/none */
{
+ struct xfs_owner_info oinfo;
int error;
xfs_agblock_t fbno;
xfs_extlen_t flen;
@@ -1624,6 +1625,18 @@ xfs_alloc_ag_vextent_small(
error0);
args->wasfromfl = 1;
trace_xfs_alloc_small_freelist(args);
+
+ /*
+ * If we're feeding an AGFL block to something that
+ * doesn't live in the free space, we need to clear
+ * out the OWN_AG rmap.
+ */
+ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
+ error = xfs_rmap_free(args->tp, args->agbp, args->agno,
+ fbno, 1, &oinfo);
+ if (error)
+ goto error0;
+
*stat = 0;
return 0;
}
@@ -2264,6 +2277,9 @@ xfs_alloc_log_agf(
offsetof(xfs_agf_t, agf_longest),
offsetof(xfs_agf_t, agf_btreeblks),
offsetof(xfs_agf_t, agf_uuid),
+ offsetof(xfs_agf_t, agf_rmap_blocks),
+ /* needed so that we don't log the whole rest of the structure: */
+ offsetof(xfs_agf_t, agf_spare64),
sizeof(xfs_agf_t)
};
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index b5c213a051cd..08569792fe20 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1814,6 +1814,10 @@ xfs_btree_lookup(
XFS_BTREE_STATS_INC(cur, lookup);
+ /* No such thing as a zero-level tree. */
+ if (cur->bc_nlevels == 0)
+ return -EFSCORRUPTED;
+
block = NULL;
keyno = 0;
@@ -4554,15 +4558,22 @@ xfs_btree_simple_query_range(
if (error)
goto out;
+ /* Nothing? See if there's anything to the right. */
+ if (!stat) {
+ error = xfs_btree_increment(cur, 0, &stat);
+ if (error)
+ goto out;
+ }
+
while (stat) {
/* Find the record. */
error = xfs_btree_get_rec(cur, &recp, &stat);
if (error || !stat)
break;
- cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
/* Skip if high_key(rec) < low_key. */
if (firstrec) {
+ cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
firstrec = false;
diff = cur->bc_ops->diff_two_keys(cur, low_key,
&rec_key);
@@ -4571,6 +4582,7 @@ xfs_btree_simple_query_range(
}
/* Stop if high_key < low_key(rec). */
+ cur->bc_ops->init_key_from_rec(&rec_key, recp);
diff = cur->bc_ops->diff_two_keys(cur, &rec_key, high_key);
if (diff > 0)
break;
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 054a2032fdb3..c221d0ecd52e 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -194,7 +194,7 @@ xfs_defer_trans_abort(
/* Abort intent items. */
list_for_each_entry(dfp, &dop->dop_pending, dfp_list) {
trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
- if (dfp->dfp_committed)
+ if (!dfp->dfp_done)
dfp->dfp_type->abort_intent(dfp->dfp_intent);
}
@@ -290,7 +290,6 @@ xfs_defer_finish(
struct xfs_defer_pending *dfp;
struct list_head *li;
struct list_head *n;
- void *done_item = NULL;
void *state;
int error = 0;
void (*cleanup_fn)(struct xfs_trans *, void *, int);
@@ -309,19 +308,11 @@ xfs_defer_finish(
if (error)
goto out;
- /* Mark all pending intents as committed. */
- list_for_each_entry_reverse(dfp, &dop->dop_pending, dfp_list) {
- if (dfp->dfp_committed)
- break;
- trace_xfs_defer_pending_commit((*tp)->t_mountp, dfp);
- dfp->dfp_committed = true;
- }
-
/* Log an intent-done item for the first pending item. */
dfp = list_first_entry(&dop->dop_pending,
struct xfs_defer_pending, dfp_list);
trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
- done_item = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
+ dfp->dfp_done = dfp->dfp_type->create_done(*tp, dfp->dfp_intent,
dfp->dfp_count);
cleanup_fn = dfp->dfp_type->finish_cleanup;
@@ -331,7 +322,7 @@ xfs_defer_finish(
list_del(li);
dfp->dfp_count--;
error = dfp->dfp_type->finish_item(*tp, dop, li,
- done_item, &state);
+ dfp->dfp_done, &state);
if (error) {
/*
* Clean up after ourselves and jump out.
@@ -428,8 +419,8 @@ xfs_defer_add(
dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
KM_SLEEP | KM_NOFS);
dfp->dfp_type = defer_op_types[type];
- dfp->dfp_committed = false;
dfp->dfp_intent = NULL;
+ dfp->dfp_done = NULL;
dfp->dfp_count = 0;
INIT_LIST_HEAD(&dfp->dfp_work);
list_add_tail(&dfp->dfp_list, &dop->dop_intake);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index cc3981c48296..e96533d178cf 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -30,8 +30,8 @@ struct xfs_defer_op_type;
struct xfs_defer_pending {
const struct xfs_defer_op_type *dfp_type; /* function pointers */
struct list_head dfp_list; /* pending items */
- bool dfp_committed; /* committed trans? */
void *dfp_intent; /* log intent item */
+ void *dfp_done; /* log done item */
struct list_head dfp_work; /* work items */
unsigned int dfp_count; /* # extent items */
};
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index f814d42c73b2..270fb5cf4fa1 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -640,12 +640,15 @@ typedef struct xfs_agf {
__be32 agf_btreeblks; /* # of blocks held in AGF btrees */
uuid_t agf_uuid; /* uuid of filesystem */
+ __be32 agf_rmap_blocks; /* rmapbt blocks used */
+ __be32 agf_padding; /* padding */
+
/*
* reserve some contiguous space for future logged fields before we add
* the unlogged fields. This makes the range logging via flags and
* structure offsets much simpler.
*/
- __be64 agf_spare64[16];
+ __be64 agf_spare64[15];
/* unlogged fields, written during buffer writeback. */
__be64 agf_lsn; /* last write sequence */
@@ -670,7 +673,9 @@ typedef struct xfs_agf {
#define XFS_AGF_LONGEST 0x00000400
#define XFS_AGF_BTREEBLKS 0x00000800
#define XFS_AGF_UUID 0x00001000
-#define XFS_AGF_NUM_BITS 13
+#define XFS_AGF_RMAP_BLOCKS 0x00002000
+#define XFS_AGF_SPARE64 0x00004000
+#define XFS_AGF_NUM_BITS 15
#define XFS_AGF_ALL_BITS ((1 << XFS_AGF_NUM_BITS) - 1)
#define XFS_AGF_FLAGS \
@@ -686,7 +691,9 @@ typedef struct xfs_agf {
{ XFS_AGF_FREEBLKS, "FREEBLKS" }, \
{ XFS_AGF_LONGEST, "LONGEST" }, \
{ XFS_AGF_BTREEBLKS, "BTREEBLKS" }, \
- { XFS_AGF_UUID, "UUID" }
+ { XFS_AGF_UUID, "UUID" }, \
+ { XFS_AGF_RMAP_BLOCKS, "RMAP_BLOCKS" }, \
+ { XFS_AGF_SPARE64, "SPARE64" }
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGF_DADDR(mp) ((xfs_daddr_t)(1 << (mp)->m_sectbb_log))
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index bc1faebc84ec..17b8eeb34ac8 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -98,6 +98,8 @@ xfs_rmapbt_alloc_block(
union xfs_btree_ptr *new,
int *stat)
{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
int error;
xfs_agblock_t bno;
@@ -124,6 +126,8 @@ xfs_rmapbt_alloc_block(
xfs_trans_agbtree_delta(cur->bc_tp, 1);
new->s = cpu_to_be32(bno);
+ be32_add_cpu(&agf->agf_rmap_blocks, 1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 1;
@@ -143,6 +147,8 @@ xfs_rmapbt_free_block(
bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp));
trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
bno, 1);
+ be32_add_cpu(&agf->agf_rmap_blocks, -1);
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
if (error)
return error;
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 0e3d4f5ec33c..4aecc5fefe96 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -583,7 +583,8 @@ xfs_sb_verify(
* Only check the in progress field for the primary superblock as
* mkfs.xfs doesn't clear it from secondary superblocks.
*/
- return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR,
+ return xfs_mount_validate_sb(mp, &sb,
+ bp->b_maps[0].bm_bn == XFS_SB_DADDR,
check_version);
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 47a318ce82e0..b5b9bffe3520 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -115,7 +115,6 @@ xfs_buf_ioacct_dec(
if (!(bp->b_flags & _XBF_IN_FLIGHT))
return;
- ASSERT(bp->b_flags & XBF_ASYNC);
bp->b_flags &= ~_XBF_IN_FLIGHT;
percpu_counter_dec(&bp->b_target->bt_io_count);
}
@@ -1612,7 +1611,7 @@ xfs_wait_buftarg(
*/
while (percpu_counter_sum(&btp->bt_io_count))
delay(100);
- drain_workqueue(btp->bt_mount->m_buf_workqueue);
+ flush_workqueue(btp->bt_mount->m_buf_workqueue);
/* loop until there is nothing left on the lru list. */
while (list_lru_count(&btp->bt_lru)) {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ed95e5bb04e6..e612a0233710 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -741,9 +741,20 @@ xfs_file_dax_write(
* page is inserted into the pagecache when we have to serve a write
* fault on a hole. It should never be dirtied and can simply be
* dropped from the pagecache once we get real data for the page.
+ *
+ * XXX: This is racy against mmap, and there's nothing we can do about
+ * it. dax_do_io() should really do this invalidation internally as
+ * it will know if we've allocated over a holei for this specific IO and
+ * if so it needs to update the mapping tree and invalidate existing
+ * PTEs over the newly allocated range. Remove this invalidation when
+ * dax_do_io() is fixed up.
*/
if (mapping->nrpages) {
- ret = invalidate_inode_pages2(mapping);
+ loff_t end = iocb->ki_pos + iov_iter_count(from) - 1;
+
+ ret = invalidate_inode_pages2_range(mapping,
+ iocb->ki_pos >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
WARN_ON_ONCE(ret);
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 0f96847b90e1..0b7f986745c1 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -248,6 +248,7 @@ xfs_growfs_data_private(
agf->agf_roots[XFS_BTNUM_RMAPi] =
cpu_to_be32(XFS_RMAP_BLOCK(mp));
agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+ agf->agf_rmap_blocks = cpu_to_be32(1);
}
agf->agf_flfirst = cpu_to_be32(1);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2114d53df433..2af0dda1c978 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -715,12 +715,16 @@ xfs_iomap_write_allocate(
* is in the delayed allocation extent on which we sit
* but before our buffer starts.
*/
-
nimaps = 0;
while (nimaps == 0) {
nres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, nres,
+ /*
+ * We have already reserved space for the extent and any
+ * indirect blocks when creating the delalloc extent,
+ * there is no need to reserve space in this transaction
+ * again.
+ */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0,
0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
@@ -1037,20 +1041,14 @@ xfs_file_iomap_begin(
return error;
trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
- } else if (nimaps) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- trace_xfs_iomap_found(ip, offset, length, 0, &imap);
- xfs_bmbt_to_iomap(ip, iomap, &imap);
} else {
+ ASSERT(nimaps);
+
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
- iomap->blkno = IOMAP_NULL_BLOCK;
- iomap->type = IOMAP_HOLE;
- iomap->offset = offset;
- iomap->length = length;
+ trace_xfs_iomap_found(ip, offset, length, 0, &imap);
}
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
return 0;
}
@@ -1112,3 +1110,48 @@ struct iomap_ops xfs_iomap_ops = {
.iomap_begin = xfs_file_iomap_begin,
.iomap_end = xfs_file_iomap_end,
};
+
+static int
+xfs_xattr_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1, error = 0;
+ unsigned lockmode;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ lockmode = xfs_ilock_data_map_shared(ip);
+
+ /* if there are no attribute fork or extents, return ENOENT */
+ if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+ error = -ENOENT;
+ goto out_unlock;
+ }
+
+ ASSERT(ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, XFS_BMAPI_ENTIRE | XFS_BMAPI_ATTRFORK);
+out_unlock:
+ xfs_iunlock(ip, lockmode);
+
+ if (!error) {
+ ASSERT(nimaps);
+ xfs_bmbt_to_iomap(ip, iomap, &imap);
+ }
+
+ return error;
+}
+
+struct iomap_ops xfs_xattr_iomap_ops = {
+ .iomap_begin = xfs_xattr_iomap_begin,
+};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index e066d045e2ff..fb8aca3d69ab 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -35,5 +35,6 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *);
extern struct iomap_ops xfs_iomap_ops;
+extern struct iomap_ops xfs_xattr_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ab820f84ed50..b24c3102fa93 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1009,7 +1009,14 @@ xfs_vn_fiemap(
int error;
xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
- error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
+ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
+ fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
+ error = iomap_fiemap(inode, fieinfo, start, length,
+ &xfs_xattr_iomap_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, length,
+ &xfs_iomap_ops);
+ }
xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
return error;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 24ef83ef04de..fd6be45b3a1e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1574,9 +1574,16 @@ xfs_fs_fill_super(
}
}
- if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+ if (mp->m_sb.sb_rblocks) {
+ xfs_alert(mp,
+ "EXPERIMENTAL reverse mapping btree not compatible with realtime device!");
+ error = -EINVAL;
+ goto out_filestream_unmount;
+ }
xfs_alert(mp,
"EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
+ }
error = xfs_mountfs(mp);
if (error)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 551b7e26980c..d303a665dba9 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1298,7 +1298,6 @@ DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
DEFINE_IOMAP_EVENT(xfs_iomap_found);
-DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
DECLARE_EVENT_CLASS(xfs_simple_io_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
@@ -2296,7 +2295,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
__entry->dev = mp ? mp->m_super->s_dev : 0;
__entry->type = dfp->dfp_type->type;
__entry->intent = dfp->dfp_intent;
- __entry->committed = dfp->dfp_committed;
+ __entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
),
TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n",